diff --git a/.github/workflows/rust-volume-server-tests.yml b/.github/workflows/rust-volume-server-tests.yml new file mode 100644 index 000000000..40a125764 --- /dev/null +++ b/.github/workflows/rust-volume-server-tests.yml @@ -0,0 +1,242 @@ +name: "Rust Volume Server Tests" + +on: + pull_request: + branches: [ master ] + paths: + - 'seaweed-volume/**' + - 'test/volume_server/**' + - 'weed/pb/volume_server.proto' + - 'weed/pb/volume_server_pb/**' + - '.github/workflows/rust-volume-server-tests.yml' + push: + branches: [ master, main ] + paths: + - 'seaweed-volume/**' + - 'test/volume_server/**' + - 'weed/pb/volume_server.proto' + - 'weed/pb/volume_server_pb/**' + - '.github/workflows/rust-volume-server-tests.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + + +jobs: + rust-unit-tests: + name: Rust Unit Tests + runs-on: ubuntu-22.04 + timeout-minutes: 15 + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust- + + - name: Build Rust volume server + run: cd seaweed-volume && cargo build --release + + - name: Run Rust unit tests + run: cd seaweed-volume && cargo test + + rust-integration-tests: + name: Rust Integration Tests + runs-on: ubuntu-22.04 + timeout-minutes: 30 + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version-file: 'go.mod' + + - name: Install protobuf compiler + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust- + + - name: Build Go weed binary + run: | + cd weed + go build -o weed . + chmod +x weed + ./weed version + + - name: Build Rust volume binary + run: cd seaweed-volume && cargo build --release + + - name: Run integration tests + env: + WEED_BINARY: ${{ github.workspace }}/weed/weed + RUST_VOLUME_BINARY: ${{ github.workspace }}/seaweed-volume/target/release/weed-volume + run: | + echo "Running Rust volume server integration tests..." + go test -v -count=1 -timeout=15m ./test/volume_server/rust/... + + - name: Collect logs on failure + if: failure() + run: | + mkdir -p /tmp/rust-volume-server-it-logs + find /tmp -maxdepth 1 -type d -name "seaweedfs_volume_server_it_*" -print -exec cp -r {} /tmp/rust-volume-server-it-logs/ \; || true + + - name: Archive logs on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: rust-volume-server-integration-test-logs + path: /tmp/rust-volume-server-it-logs/ + if-no-files-found: warn + retention-days: 7 + + - name: Test summary + if: always() + run: | + echo "## Rust Volume Server Integration Test Summary" >> "$GITHUB_STEP_SUMMARY" + echo "- Suite: test/volume_server/rust" >> "$GITHUB_STEP_SUMMARY" + echo "- Command: go test -v -count=1 -timeout=15m ./test/volume_server/rust/..." >> "$GITHUB_STEP_SUMMARY" + + rust-volume-go-tests: + name: Go Tests with Rust Volume (${{ matrix.test-type }} - Shard ${{ matrix.shard }}) + runs-on: ubuntu-22.04 + timeout-minutes: 45 + strategy: + fail-fast: false + matrix: + test-type: [grpc, http] + shard: [1, 2, 3] + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version-file: 'go.mod' + + - name: Install protobuf compiler + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust- + + - name: Build Go weed binary + run: | + cd weed + go build -o weed . + chmod +x weed + ./weed version + + - name: Build Rust volume binary + run: cd seaweed-volume && cargo build --release + + - name: Run volume server integration tests with Rust volume + env: + WEED_BINARY: ${{ github.workspace }}/weed/weed + RUST_VOLUME_BINARY: ${{ github.workspace }}/seaweed-volume/target/release/weed-volume + VOLUME_SERVER_IMPL: rust + run: | + if [ "${{ matrix.test-type }}" == "grpc" ]; then + if [ "${{ matrix.shard }}" == "1" ]; then + TEST_PATTERN="^Test[A-H]" + elif [ "${{ matrix.shard }}" == "2" ]; then + TEST_PATTERN="^Test[I-S]" + else + TEST_PATTERN="^Test[T-Z]" + fi + else + if [ "${{ matrix.shard }}" == "1" ]; then + TEST_PATTERN="^Test[A-G]" + elif [ "${{ matrix.shard }}" == "2" ]; then + TEST_PATTERN="^Test[H-R]" + else + TEST_PATTERN="^Test[S-Z]" + fi + fi + echo "Running Go volume server tests with Rust volume for ${{ matrix.test-type }} (Shard ${{ matrix.shard }}, pattern: ${TEST_PATTERN})..." + go test -v -count=1 -tags 5BytesOffset -timeout=30m ./test/volume_server/${{ matrix.test-type }}/... -run "${TEST_PATTERN}" + + - name: Collect logs on failure + if: failure() + run: | + mkdir -p /tmp/rust-volume-go-test-logs + find /tmp -maxdepth 1 -type d -name "seaweedfs_volume_server_it_*" -print -exec cp -r {} /tmp/rust-volume-go-test-logs/ \; || true + + - name: Archive logs on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: rust-volume-go-test-logs-${{ matrix.test-type }}-shard${{ matrix.shard }} + path: /tmp/rust-volume-go-test-logs/ + if-no-files-found: warn + retention-days: 7 + + - name: Test summary + if: always() + run: | + if [ "${{ matrix.test-type }}" == "grpc" ]; then + if [ "${{ matrix.shard }}" == "1" ]; then + TEST_PATTERN="^Test[A-H]" + elif [ "${{ matrix.shard }}" == "2" ]; then + TEST_PATTERN="^Test[I-S]" + else + TEST_PATTERN="^Test[T-Z]" + fi + else + if [ "${{ matrix.shard }}" == "1" ]; then + TEST_PATTERN="^Test[A-G]" + elif [ "${{ matrix.shard }}" == "2" ]; then + TEST_PATTERN="^Test[H-R]" + else + TEST_PATTERN="^Test[S-Z]" + fi + fi + echo "## Rust Volume - Go Test Summary (${{ matrix.test-type }} - Shard ${{ matrix.shard }})" >> "$GITHUB_STEP_SUMMARY" + echo "- Suite: test/volume_server/${{ matrix.test-type }} (Pattern: ${TEST_PATTERN})" >> "$GITHUB_STEP_SUMMARY" + echo "- Volume server: Rust (VOLUME_SERVER_IMPL=rust)" >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/rust_binaries_dev.yml b/.github/workflows/rust_binaries_dev.yml new file mode 100644 index 000000000..cc81b93df --- /dev/null +++ b/.github/workflows/rust_binaries_dev.yml @@ -0,0 +1,165 @@ +name: "rust: build dev volume server binaries" + +on: + push: + branches: [ master ] + paths: + - 'seaweed-volume/**' + - '.github/workflows/rust_binaries_dev.yml' + +permissions: + contents: read + +jobs: + + cleanup: + permissions: + contents: write + runs-on: ubuntu-latest + steps: + - name: Delete old Rust volume dev assets + uses: mknejp/delete-release-assets@v1 + with: + token: ${{ github.token }} + tag: dev + fail-if-no-assets: false + assets: | + weed-volume-* + + build-rust-volume-dev-linux: + permissions: + contents: write + needs: cleanup + runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - target: x86_64-unknown-linux-gnu + asset_suffix: linux-amd64 + + steps: + - uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-dev-${{ matrix.target }}-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust-dev-${{ matrix.target }}- + + - name: Set BUILD_TIME + run: echo BUILD_TIME=$(date -u +%Y%m%d-%H%M) >> "$GITHUB_ENV" + + - name: Build Rust volume server (large disk) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: cd seaweed-volume && cargo build --release + + - name: Package large disk binary + run: | + cp seaweed-volume/target/release/weed-volume weed-volume-large-disk + tar czf "weed-volume-large-disk-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz" weed-volume-large-disk + rm weed-volume-large-disk + + - name: Build Rust volume server (normal) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: cd seaweed-volume && cargo build --release --no-default-features + + - name: Package normal binary + run: | + cp seaweed-volume/target/release/weed-volume weed-volume-normal + tar czf "weed-volume-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz" weed-volume-normal + rm weed-volume-normal + + - name: Upload dev release assets + uses: softprops/action-gh-release@v2 + with: + tag_name: dev + prerelease: true + files: | + weed-volume-large-disk-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz + weed-volume-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + build-rust-volume-dev-darwin: + permissions: + contents: write + needs: build-rust-volume-dev-linux + runs-on: macos-latest + strategy: + matrix: + include: + - target: aarch64-apple-darwin + asset_suffix: darwin-arm64 + - target: x86_64-apple-darwin + asset_suffix: darwin-amd64 + + steps: + - uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: brew install protobuf + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + targets: ${{ matrix.target }} + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-dev-${{ matrix.target }}-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust-dev-${{ matrix.target }}- + + - name: Set BUILD_TIME + run: echo BUILD_TIME=$(date -u +%Y%m%d-%H%M) >> "$GITHUB_ENV" + + - name: Build Rust volume server (large disk) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: cd seaweed-volume && cargo build --release --target ${{ matrix.target }} + + - name: Package large disk binary + run: | + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-large-disk + tar czf "weed-volume-large-disk-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz" weed-volume-large-disk + rm weed-volume-large-disk + + - name: Build Rust volume server (normal) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: cd seaweed-volume && cargo build --release --target ${{ matrix.target }} --no-default-features + + - name: Package normal binary + run: | + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-normal + tar czf "weed-volume-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz" weed-volume-normal + rm weed-volume-normal + + - name: Upload dev release assets + uses: softprops/action-gh-release@v2 + with: + tag_name: dev + prerelease: true + files: | + weed-volume-large-disk-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz + weed-volume-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/rust_binaries_release.yml b/.github/workflows/rust_binaries_release.yml new file mode 100644 index 000000000..a7f91105f --- /dev/null +++ b/.github/workflows/rust_binaries_release.yml @@ -0,0 +1,215 @@ +name: "rust: build versioned volume server binaries" + +on: + push: + tags: + - '*' + + workflow_dispatch: + +permissions: + contents: read + +jobs: + + build-rust-volume-linux: + permissions: + contents: write + runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - target: x86_64-unknown-linux-gnu + asset_suffix: linux_amd64 + - target: aarch64-unknown-linux-gnu + asset_suffix: linux_arm64 + cross: true + + steps: + - uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + targets: ${{ matrix.target }} + + - name: Install cross-compilation tools + if: matrix.cross + run: | + sudo apt-get install -y gcc-aarch64-linux-gnu + echo "CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc" >> "$GITHUB_ENV" + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-release-${{ matrix.target }}-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust-release-${{ matrix.target }}- + + - name: Build Rust volume server (large disk) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release --target ${{ matrix.target }} + + - name: Build Rust volume server (normal) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release --target ${{ matrix.target }} --no-default-features + + - name: Package binaries + run: | + # Large disk (default, 5bytes feature) + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-large-disk + tar czf weed-volume_large_disk_${{ matrix.asset_suffix }}.tar.gz weed-volume-large-disk + rm weed-volume-large-disk + + # Normal volume size + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-normal + tar czf weed-volume_${{ matrix.asset_suffix }}.tar.gz weed-volume-normal + rm weed-volume-normal + + - name: Upload release assets + uses: softprops/action-gh-release@v2 + with: + files: | + weed-volume_large_disk_${{ matrix.asset_suffix }}.tar.gz + weed-volume_${{ matrix.asset_suffix }}.tar.gz + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + build-rust-volume-darwin: + permissions: + contents: write + runs-on: macos-latest + strategy: + matrix: + include: + - target: x86_64-apple-darwin + asset_suffix: darwin_amd64 + - target: aarch64-apple-darwin + asset_suffix: darwin_arm64 + + steps: + - uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: brew install protobuf + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + targets: ${{ matrix.target }} + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-release-${{ matrix.target }}-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust-release-${{ matrix.target }}- + + - name: Build Rust volume server (large disk) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release --target ${{ matrix.target }} + + - name: Build Rust volume server (normal) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release --target ${{ matrix.target }} --no-default-features + + - name: Package binaries + run: | + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-large-disk + tar czf weed-volume_large_disk_${{ matrix.asset_suffix }}.tar.gz weed-volume-large-disk + rm weed-volume-large-disk + + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-normal + tar czf weed-volume_${{ matrix.asset_suffix }}.tar.gz weed-volume-normal + rm weed-volume-normal + + - name: Upload release assets + uses: softprops/action-gh-release@v2 + with: + files: | + weed-volume_large_disk_${{ matrix.asset_suffix }}.tar.gz + weed-volume_${{ matrix.asset_suffix }}.tar.gz + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + build-rust-volume-windows: + permissions: + contents: write + runs-on: windows-latest + + steps: + - uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: choco install protoc -y + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-release-windows-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust-release-windows- + + - name: Build Rust volume server (large disk) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release + + - name: Build Rust volume server (normal) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release --no-default-features + + - name: Package binaries + shell: bash + run: | + cp seaweed-volume/target/release/weed-volume.exe weed-volume-large-disk.exe + 7z a weed-volume_large_disk_windows_amd64.zip weed-volume-large-disk.exe + rm weed-volume-large-disk.exe + + cp seaweed-volume/target/release/weed-volume.exe weed-volume-normal.exe + 7z a weed-volume_windows_amd64.zip weed-volume-normal.exe + rm weed-volume-normal.exe + + - name: Upload release assets + uses: softprops/action-gh-release@v2 + with: + files: | + weed-volume_large_disk_windows_amd64.zip + weed-volume_windows_amd64.zip + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore index a3ea87971..b356654f9 100644 --- a/.gitignore +++ b/.gitignore @@ -143,3 +143,4 @@ test/s3/iam/.test_env weed_bin telemetry/server/telemetry-server .aider* +/seaweed-volume/docs diff --git a/VOLUME_SERVER_RUST_PLAN.md b/VOLUME_SERVER_RUST_PLAN.md new file mode 100644 index 000000000..1c402336f --- /dev/null +++ b/VOLUME_SERVER_RUST_PLAN.md @@ -0,0 +1,790 @@ +# Execution Plan: SeaweedFS Volume Server — Go to Rust Port + +## Scope Summary + +| Component | Go Source | Lines (non-test) | Description | +|---|---|---|---| +| CLI & startup | `weed/command/volume.go` | 476 | ~40 CLI flags, server bootstrap | +| HTTP server + handlers | `weed/server/volume_server*.go` | 1,517 | Struct, routes, read/write/delete handlers | +| gRPC handlers | `weed/server/volume_grpc_*.go` | 3,073 | 40 RPC method implementations | +| Storage engine | `weed/storage/` | 15,271 | Volumes, needles, index, compaction, EC, backend | +| Protobuf definitions | `weed/pb/volume_server.proto` | 759 | Service + message definitions | +| Shared utilities | `weed/security/`, `weed/stats/`, `weed/util/` | ~2,000+ | JWT, TLS, metrics, helpers | +| **Total** | | **~23,000+** | | + +## Rust Crate & Dependency Strategy + +``` +seaweed-volume/ +├── Cargo.toml +├── build.rs # protobuf codegen +├── proto/ +│ ├── volume_server.proto # copied from Go, adapted +│ └── remote.proto +├── src/ +│ ├── main.rs # CLI entry point +│ ├── config.rs # CLI flags + config +│ ├── server/ +│ │ ├── mod.rs +│ │ ├── volume_server.rs # VolumeServer struct + lifecycle +│ │ ├── http_handlers.rs # HTTP route dispatch +│ │ ├── http_read.rs # GET/HEAD handlers +│ │ ├── http_write.rs # POST/PUT handlers +│ │ ├── http_delete.rs # DELETE handler +│ │ ├── http_admin.rs # /status, /healthz, /ui +│ │ ├── grpc_service.rs # gRPC trait impl dispatch +│ │ ├── grpc_vacuum.rs +│ │ ├── grpc_copy.rs +│ │ ├── grpc_erasure_coding.rs +│ │ ├── grpc_tail.rs +│ │ ├── grpc_admin.rs +│ │ ├── grpc_read_write.rs +│ │ ├── grpc_batch_delete.rs +│ │ ├── grpc_scrub.rs +│ │ ├── grpc_tier.rs +│ │ ├── grpc_remote.rs +│ │ ├── grpc_query.rs +│ │ ├── grpc_state.rs +│ │ └── grpc_client_to_master.rs # heartbeat +│ ├── storage/ +│ │ ├── mod.rs +│ │ ├── store.rs # Store (multi-disk manager) +│ │ ├── volume.rs # Volume struct + lifecycle +│ │ ├── volume_read.rs +│ │ ├── volume_write.rs +│ │ ├── volume_compact.rs +│ │ ├── volume_info.rs +│ │ ├── needle/ +│ │ │ ├── mod.rs +│ │ │ ├── needle.rs # Needle struct + serialization +│ │ │ ├── needle_read.rs +│ │ │ ├── needle_write.rs +│ │ │ ├── needle_map.rs # in-memory NeedleMap +│ │ │ ├── needle_value.rs +│ │ │ └── crc.rs +│ │ ├── super_block.rs +│ │ ├── idx/ +│ │ │ ├── mod.rs +│ │ │ └── idx.rs # .idx file format read/write +│ │ ├── needle_map_leveldb.rs +│ │ ├── types.rs # NeedleId, Offset, Size, DiskType +│ │ ├── disk_location.rs # DiskLocation per-directory +│ │ ├── erasure_coding/ +│ │ │ ├── mod.rs +│ │ │ ├── ec_volume.rs +│ │ │ ├── ec_shard.rs +│ │ │ ├── ec_encoder.rs # Reed-Solomon encoding +│ │ │ └── ec_decoder.rs +│ │ └── backend/ +│ │ ├── mod.rs +│ │ ├── disk.rs +│ │ └── s3_backend.rs # tiered storage to S3 +│ ├── topology/ +│ │ └── volume_layout.rs # replication placement +│ ├── security/ +│ │ ├── mod.rs +│ │ ├── guard.rs # whitelist + JWT gate +│ │ ├── jwt.rs +│ │ └── tls.rs +│ ├── stats/ +│ │ ├── mod.rs +│ │ └── metrics.rs # Prometheus counters/gauges +│ └── util/ +│ ├── mod.rs +│ ├── grpc.rs +│ ├── http.rs +│ └── file.rs +└── tests/ + ├── integration/ + │ ├── http_read_test.rs + │ ├── http_write_test.rs + │ ├── grpc_test.rs + │ └── storage_test.rs + └── unit/ + ├── needle_test.rs + ├── idx_test.rs + ├── super_block_test.rs + └── ec_test.rs +``` + +### Key Rust dependencies + +| Purpose | Crate | +|---|---| +| Async runtime | `tokio` | +| gRPC | `tonic` + `prost` | +| HTTP server | `hyper` + `axum` | +| CLI parsing | `clap` (derive) | +| Prometheus metrics | `prometheus` | +| JWT | `jsonwebtoken` | +| TLS | `rustls` + `tokio-rustls` | +| LevelDB | `rusty-leveldb` or `rocksdb` | +| Reed-Solomon EC | `reed-solomon-erasure` | +| Logging | `tracing` + `tracing-subscriber` | +| Config (security.toml) | `toml` + `serde` | +| CRC32 | `crc32fast` | +| Memory-mapped files | `memmap2` | + +--- + +## Phased Execution Plan + +### Phase 1: Project Skeleton & Protobuf Codegen +**Goal:** Cargo project compiles, proto codegen works, CLI parses all flags. + +**Steps:** + +1.1. Create `seaweed-volume/Cargo.toml` with all dependencies listed above. + +1.2. Copy `volume_server.proto` and `remote.proto` into `proto/`. Adjust package paths for Rust codegen. + +1.3. Create `build.rs` using `tonic-build` to compile `.proto` files into Rust types. + +1.4. Create `src/main.rs` with `clap` derive structs mirroring all 40 CLI flags from `weed/command/volume.go`: + - `--port` (default 8080) + - `--port.grpc` (default 0 → 10000+port) + - `--port.public` (default 0 → same as port) + - `--ip` (auto-detect) + - `--id` (default empty → ip:port) + - `--publicUrl` + - `--ip.bind` + - `--master` (default "localhost:9333") + - `--mserver` (deprecated compat) + - `--preStopSeconds` (default 10) + - `--idleTimeout` (default 30) + - `--dataCenter` + - `--rack` + - `--index` [memory|leveldb|leveldbMedium|leveldbLarge] + - `--disk` [hdd|ssd|] + - `--tags` + - `--dir` (default temp dir) + - `--dir.idx` + - `--max` (default "8") + - `--whiteList` + - `--minFreeSpacePercent` (default "1") + - `--minFreeSpace` + - `--images.fix.orientation` (default false) + - `--readMode` [local|proxy|redirect] (default "proxy") + - `--cpuprofile` + - `--memprofile` + - `--compactionMBps` (default 0) + - `--maintenanceMBps` (default 0) + - `--fileSizeLimitMB` (default 256) + - `--concurrentUploadLimitMB` (default 0) + - `--concurrentDownloadLimitMB` (default 0) + - `--pprof` (default false) + - `--metricsPort` (default 0) + - `--metricsIp` + - `--inflightUploadDataTimeout` (default 60s) + - `--inflightDownloadDataTimeout` (default 60s) + - `--hasSlowRead` (default true) + - `--readBufferSizeMB` (default 4) + - `--index.leveldbTimeout` (default 0) + - `--debug` (default false) + - `--debug.port` (default 6060) + +1.5. Implement the same flag validation logic from `startVolumeServer()`: + - Parse comma-separated `--dir`, `--max`, `--minFreeSpace`, `--disk`, `--tags` + - Replicate single-value-to-all-dirs expansion + - Validate count matches between dirs and limits + - `--mserver` backward compat + +1.6. **Test:** `cargo build` succeeds. `cargo run -- --help` shows all flags. Proto types generated. + +**Verification:** Run with `--port 8080 --dir /tmp --master localhost:9333` — should parse without error and print config. + +--- + +### Phase 2: Core Storage Types & On-Disk Format +**Goal:** Read and write the SeaweedFS needle/volume binary format bit-for-bit compatible with Go. + +**Source files to port:** +- `weed/storage/types/needle_types.go` → `src/storage/types.rs` +- `weed/storage/needle/needle.go` → `src/storage/needle/needle.rs` +- `weed/storage/needle/needle_read.go` → `src/storage/needle/needle_read.rs` +- `weed/storage/needle/needle_write.go` (partial) → `src/storage/needle/needle_write.rs` +- `weed/storage/needle/crc.go` → `src/storage/needle/crc.rs` +- `weed/storage/needle/needle_value_map.go` → `src/storage/needle/needle_value.rs` +- `weed/storage/super_block/super_block.go` → `src/storage/super_block.rs` +- `weed/storage/idx/` → `src/storage/idx/` + +**Steps:** + +2.1. **Fundamental types** (`types.rs`): + - `NeedleId` (u64), `Offset` (u32 or u64 depending on version), `Size` (i32, negative = deleted) + - `Cookie` (u32) + - `DiskType` enum (HDD, SSD, Custom) + - Version constants (Version1=1, Version2=2, Version3=3, CurrentVersion=3) + - Byte serialization matching Go's `binary.BigEndian` encoding + +2.2. **SuperBlock** (`super_block.rs`): + - 8-byte header: Version(1) + ReplicaPlacement(1) + TTL(2) + CompactRevision(2) + Reserved(2) + - `ReplicaPlacement` struct with same/diff rack/dc counts + - `TTL` struct with count + unit + - Read/write from first 8 bytes of `.dat` file + - Match exact byte layout from `super_block.go` + +2.3. **Needle binary format** (`needle.rs`, `needle_read.rs`): + - Version 2/3 header: Cookie(4) + NeedleId(8) + Size(4) + - Body: Data, Flags, Name, Mime, PairsSize, Pairs, LastModified, TTL, Checksum, AppendAtNs, Padding + - CRC32 checksum (matching Go's `crc32.ChecksumIEEE`) + - Padding to 8-byte alignment + - Read path: read header → compute body length → read body → verify CRC + +2.4. **Idx file format** (`idx/`): + - Fixed 16-byte records: NeedleId(8) + Offset(4) + Size(4) + - Sequential append-only file + - Walk/iterate all entries + - Binary search not used (loaded into memory map) + +2.5. **NeedleMap (in-memory)** (`needle_map.rs`): + - HashMap where NeedleValue = {Offset, Size} + - Load from `.idx` file on volume mount + - Support Get, Set, Delete operations + - Track file count, deleted count, deleted byte count + +2.6. **Tests:** + - Unit test: write a needle to bytes → read it back → verify fields match + - Unit test: write/read SuperBlock round-trip + - Unit test: write/read idx entries round-trip + - **Cross-compat test:** Use Go volume server to create a small volume with known data. Read it from Rust and verify all needles decoded correctly. (Keep test fixture `.dat`/`.idx` files in `tests/fixtures/`) + +--- + +### Phase 3: Volume Struct & Lifecycle +**Goal:** Mount, read from, write to, and unmount a volume. + +**Source files to port:** +- `weed/storage/volume.go` → `src/storage/volume.rs` +- `weed/storage/volume_read.go` → `src/storage/volume_read.rs` +- `weed/storage/volume_write.go` → `src/storage/volume_write.rs` +- `weed/storage/volume_loading.go` +- `weed/storage/volume_vacuum.go` → `src/storage/volume_compact.rs` +- `weed/storage/volume_info/volume_info.go` → `src/storage/volume_info.rs` +- `weed/storage/volume_super_block.go` + +**Steps:** + +3.1. **Volume struct** (`volume.rs`): + - Fields: Id, dir, dataFile, nm (NeedleMap), SuperBlock, readOnly, lastModifiedTs, lastCompactIndexOffset, lastCompactRevision + - `noWriteOrDelete` / `noWriteCanDelete` / `readOnly` state flags + - File handles for `.dat` file (read + append) + - Lock strategy: `RwLock` for concurrent reads, exclusive writes + +3.2. **Volume loading** — exact logic from `volume_loading.go`: + - Open `.dat` file, read SuperBlock from first 8 bytes + - Load `.idx` file into NeedleMap + - Handle `.vif` (VolumeInfo) JSON sidecar file + - Set volume state based on SuperBlock + VolumeInfo + +3.3. **Volume read** (`volume_read.rs`) — from `volume_read.go`: + - `ReadNeedle(needleId, cookie)`: lookup in NeedleMap → seek in .dat → read needle bytes → verify cookie + CRC → return data + - Handle deleted needles (Size < 0) + - `ReadNeedleBlob(offset, size)`: raw blob read + - `ReadNeedleMeta(needleId, offset, size)`: read metadata only + +3.4. **Volume write** (`volume_write.rs`) — from `volume_write.go`: + - `WriteNeedle(needle)`: serialize needle → append to .dat → update .idx → update NeedleMap + - `DeleteNeedle(needleId)`: mark as deleted in NeedleMap + append tombstone to .idx + - File size limit check + - Concurrent write serialization (mutex on write path) + +3.5. **Volume compaction** (`volume_compact.rs`) — from `volume_vacuum.go`: + - `CheckCompact()`: compute garbage ratio + - `Compact()`: create new .dat/.idx, copy only live needles, update compact revision + - `CommitCompact()`: rename compacted files over originals + - `CleanupCompact()`: remove temp files + - Throttle by `compactionBytePerSecond` + +3.6. **Volume info** (`volume_info.rs`): + - Read/write `.vif` JSON sidecar + - VolumeInfo protobuf struct mapping + - Remote file references for tiered storage + +3.7. **Tests:** + - Mount a volume, write 100 needles, read them all back, verify content + - Delete 50 needles, verify they return "deleted" + - Compact, verify only 50 remain, verify content + - Read Go-created volume fixtures + +--- + +### Phase 4: Store (Multi-Volume, Multi-Disk Manager) +**Goal:** Manage multiple volumes across multiple disk directories. + +**Source files to port:** +- `weed/storage/store.go` → `src/storage/store.rs` +- `weed/storage/disk_location.go` → `src/storage/disk_location.rs` +- `weed/storage/store_ec.go` +- `weed/storage/store_state.go` + +**Steps:** + +4.1. **DiskLocation** (`disk_location.rs`): + - Directory path, max volume count, min free space, disk type, tags + - Load all volumes from directory on startup + - Track free space, check writable + +4.2. **Store** (`store.rs`): + - Vector of `DiskLocation`s + - `GetVolume(volumeId)` → lookup across all locations + - `HasVolume(volumeId)` check + - `AllocateVolume(...)` — create new volume in appropriate location + - `DeleteVolume(...)`, `MountVolume(...)`, `UnmountVolume(...)` + - `DeleteCollection(collection)` — delete all volumes of a collection + - Collect volume status for heartbeat + - `SetStopping()`, `Close()` + - Persistent state (maintenance mode) via `store_state.go` + +4.3. **Store state** — `VolumeServerState` protobuf with maintenance flag, persisted to disk. + +4.4. **Tests:** + - Create store with 2 dirs, allocate volumes in each, verify load balancing + - Mount/unmount/delete lifecycle + - State persistence across restart + +--- + +### Phase 5: Erasure Coding +**Goal:** Full EC shard encode/decode/read/write/rebuild. + +**Source files to port:** +- `weed/storage/erasure_coding/` (3,599 lines) + +**Steps:** + +5.1. **EC volume + shard structs** — `EcVolume`, `EcShard` with file handles for `.ec00`–`.ec13` shard files + `.ecx` index + `.ecj` journal. + +5.2. **EC encoder** — Reed-Solomon 10+4 (configurable) encoding using `reed-solomon-erasure` crate: + - `VolumeEcShardsGenerate`: read .dat → split into data shards → compute parity → write .ec00-.ec13 + .ecx + +5.3. **EC decoder/reader** — reconstruct data from any 10 of 14 shards: + - `EcShardRead`: read range from a specific shard + - Locate needle in EC volume via .ecx index + - Handle cross-shard needle reads + +5.4. **EC shard operations:** + - Copy, delete, mount, unmount shards + - `VolumeEcShardsRebuild`: rebuild missing shards from remaining + - `VolumeEcShardsToVolume`: reconstruct .dat from EC shards + - `VolumeEcBlobDelete`: mark deleted in EC journal + - `VolumeEcShardsInfo`: report shard metadata + +5.5. **Tests:** + - Encode a volume → verify 14 shards created + - Delete 4 shards → rebuild → verify data intact + - Read individual needles from EC volume + - Cross-compat with Go-generated EC shards + +--- + +### Phase 6: Backend / Tiered Storage +**Goal:** Support tiered storage to remote backends (S3, etc). + +**Source files to port:** +- `weed/storage/backend/` (1,850 lines) + +**Steps:** + +6.1. **Backend trait** — abstract `BackendStorage` trait with `ReadAt`, `WriteAt`, `Truncate`, `Close`, `Name`. + +6.2. **Disk backend** — default local disk implementation. + +6.3. **S3 backend** — upload .dat to S3, read ranges via S3 range requests. + +6.4. **Tier move operations:** + - `VolumeTierMoveDatToRemote`: upload .dat to remote, optionally delete local + - `VolumeTierMoveDatFromRemote`: download .dat from remote + +6.5. **Tests:** + - Disk backend read/write round-trip + - S3 backend with mock/localstack + +--- + +### Phase 7: Security Layer +**Goal:** JWT authentication, whitelist guard, TLS configuration. + +**Source files to port:** +- `weed/security/guard.go` → `src/security/guard.rs` +- `weed/security/jwt.go` → `src/security/jwt.rs` +- `weed/security/tls.go` → `src/security/tls.rs` + +**Steps:** + +7.1. **Guard** (`guard.rs`): + - Whitelist IP check (exact match on `r.RemoteAddr`) + - Wrap handlers with whitelist enforcement + - `UpdateWhiteList()` for live reload + +7.2. **JWT** (`jwt.rs`): + - `SeaweedFileIdClaims` with `fid` field + - Sign with HMAC-SHA256 + - Verify + decode with expiry check + - Separate signing keys for read vs write + - `GetJwt(request)` — extract from `Authorization: Bearer` header or `jwt` query param + +7.3. **TLS** (`tls.rs`): + - Load server TLS cert/key for gRPC and HTTPS + - Load client TLS for mutual TLS + - Read from `security.toml` config (same format as Go's viper config) + +7.4. **Tests:** + - JWT sign → verify round-trip + - JWT with wrong key → reject + - JWT with expired token → reject + - JWT fid mismatch → reject + - Whitelist allow/deny + +--- + +### Phase 8: Prometheus Metrics +**Goal:** Export same metric names as Go for dashboard compatibility. + +**Source files to port:** +- `weed/stats/metrics.go` (volume server counters/gauges/histograms) + +**Steps:** + +8.1. Define all Prometheus metrics matching Go names: + - `VolumeServerRequestCounter` (labels: method, status) + - `VolumeServerRequestHistogram` (labels: method) + - `VolumeServerInFlightRequestsGauge` (labels: method) + - `VolumeServerInFlightUploadSize` + - `VolumeServerInFlightDownloadSize` + - `VolumeServerConcurrentUploadLimit` + - `VolumeServerConcurrentDownloadLimit` + - `VolumeServerHandlerCounter` (labels: type — UploadLimitCond, DownloadLimitCond) + - Read/Write/Delete request counters + +8.2. Metrics HTTP endpoint on `--metricsPort`. + +8.3. Optional push-based metrics loop (`LoopPushingMetric`). + +8.4. **Test:** Verify metric names and labels match Go output. + +--- + +### Phase 9: HTTP Server & Handlers +**Goal:** All HTTP endpoints with exact same behavior as Go. + +**Source files to port:** +- `weed/server/volume_server.go` → `src/server/volume_server.rs` +- `weed/server/volume_server_handlers.go` → `src/server/http_handlers.rs` +- `weed/server/volume_server_handlers_read.go` → `src/server/http_read.rs` +- `weed/server/volume_server_handlers_write.go` → `src/server/http_write.rs` +- `weed/server/volume_server_handlers_admin.go` → `src/server/http_admin.rs` +- `weed/server/volume_server_handlers_helper.go` (URL parsing, proxy, JSON responses) +- `weed/server/volume_server_handlers_ui.go` → `src/server/http_admin.rs` + +**Steps:** + +9.1. **URL path parsing** — from `handlers_helper.go`: + - Parse `/,` and `//` patterns + - Extract volume ID, file ID, filename, ext + +9.2. **Route dispatch** — from `privateStoreHandler` and `publicReadOnlyHandler`: + - `GET /` → `GetOrHeadHandler` + - `HEAD /` → `GetOrHeadHandler` + - `POST /` → `PostHandler` (whitelist gated) + - `PUT /` → `PostHandler` (whitelist gated) + - `DELETE /` → `DeleteHandler` (whitelist gated) + - `OPTIONS /` → CORS preflight + - `GET /status` → JSON status + - `GET /healthz` → health check + - `GET /ui/index.html` → HTML UI page + - Static resources (CSS/JS for UI) + +9.3. **GET/HEAD handler** (`http_read.rs`) — from `handlers_read.go` (468 lines): + - JWT read authorization check + - Lookup needle by volume ID + needle ID + cookie + - ETag / If-None-Match / If-Modified-Since conditional responses + - Content-Type from stored MIME or filename extension + - Content-Disposition header + - Content-Encoding (gzip/zstd stored data) + - Range request support (HTTP 206 Partial Content) + - JPEG orientation fix (if configured) + - Proxy to replica on local miss (readMode=proxy) + - Redirect to replica (readMode=redirect) + - Download tracking (in-flight size accounting) + +9.4. **POST/PUT handler** (`http_write.rs`) — from `handlers_write.go` (170 lines): + - JWT write authorization check + - Multipart form parsing + - Extract file data, filename, content type, TTL, last-modified + - Optional gzip/zstd compression + - Write needle to volume + - Replicate to peers (same logic as Go's `DistributedOperation`) + - Return JSON: {name, size, eTag, error} + +9.5. **DELETE handler** — already in handlers.go: + - JWT authorization + - Delete from local volume + - Replicate delete to peers + - Return JSON result + +9.6. **Admin handlers** (`http_admin.rs`): + - `/status` → JSON with volumes, version, disk status + - `/healthz` → 200 OK if serving + - `/ui/index.html` → HTML dashboard + +9.7. **Concurrency limiting** — from `handlers.go`: + - Upload concurrency limit with `sync::Condvar` + timeout + - Download concurrency limit with proxy fallback to replicas + - HTTP 429 on timeout, 499 on client cancel + - Replication traffic bypasses upload limits + +9.8. **Public port** — if configured, separate listener with read-only routes (GET/HEAD/OPTIONS only). + +9.9. **Request ID middleware** — generate unique request ID per request. + +9.10. **Tests:** + - Integration: start server → upload file via POST → GET it back → verify content + - Integration: upload → DELETE → GET returns 404 + - Integration: conditional GET with ETag → 304 + - Integration: range request → 206 with correct bytes + - Integration: exceed upload limit → 429 + - Integration: whitelist enforcement + - Integration: JWT enforcement + +--- + +### Phase 10: gRPC Service Implementation +**Goal:** All 40 gRPC methods with exact logic. + +**Source files to port:** +- `weed/server/volume_grpc_admin.go` (380 lines) +- `weed/server/volume_grpc_vacuum.go` (124 lines) +- `weed/server/volume_grpc_copy.go` (636 lines) +- `weed/server/volume_grpc_copy_incremental.go` (66 lines) +- `weed/server/volume_grpc_read_write.go` (74 lines) +- `weed/server/volume_grpc_batch_delete.go` (124 lines) +- `weed/server/volume_grpc_tail.go` (140 lines) +- `weed/server/volume_grpc_erasure_coding.go` (619 lines) +- `weed/server/volume_grpc_scrub.go` (121 lines) +- `weed/server/volume_grpc_tier_upload.go` (98 lines) +- `weed/server/volume_grpc_tier_download.go` (85 lines) +- `weed/server/volume_grpc_remote.go` (95 lines) +- `weed/server/volume_grpc_query.go` (69 lines) +- `weed/server/volume_grpc_state.go` (26 lines) +- `weed/server/volume_grpc_read_all.go` (35 lines) +- `weed/server/volume_grpc_client_to_master.go` (325 lines) + +**Steps (grouped by functional area):** + +10.1. **Implement `tonic::Service` for `VolumeServer`** — the generated trait from proto. + +10.2. **Admin RPCs** (`grpc_admin.rs`): + - `AllocateVolume` — create volume on appropriate disk location + - `VolumeMount` / `VolumeUnmount` / `VolumeDelete` + - `VolumeMarkReadonly` / `VolumeMarkWritable` + - `VolumeConfigure` — change replication + - `VolumeStatus` — return read-only, size, file counts + - `VolumeServerStatus` — disk statuses, memory, version, DC, rack + - `VolumeServerLeave` — deregister from master + - `DeleteCollection` + - `VolumeNeedleStatus` — get needle metadata by ID + - `Ping` — latency measurement + - `GetState` / `SetState` — maintenance mode + +10.3. **Vacuum RPCs** (`grpc_vacuum.rs`): + - `VacuumVolumeCheck` — return garbage ratio + - `VacuumVolumeCompact` — stream progress (streaming response) + - `VacuumVolumeCommit` — finalize compaction + - `VacuumVolumeCleanup` — remove temp files + +10.4. **Copy RPCs** (`grpc_copy.rs`): + - `VolumeCopy` — stream .dat/.idx from source to create local copy + - `VolumeSyncStatus` — return sync metadata + - `VolumeIncrementalCopy` — stream .dat delta since timestamp (streaming) + - `CopyFile` — generic file copy by extension (streaming) + - `ReceiveFile` — receive streamed file (client streaming) + - `ReadVolumeFileStatus` — return file timestamps and sizes + +10.5. **Read/Write RPCs** (`grpc_read_write.rs`): + - `ReadNeedleBlob` — raw needle blob read + - `ReadNeedleMeta` — needle metadata + - `WriteNeedleBlob` — raw needle blob write + - `ReadAllNeedles` — stream all needles from volume(s) (streaming) + +10.6. **Batch delete** (`grpc_batch_delete.rs`): + - `BatchDelete` — delete multiple file IDs, return per-ID results + +10.7. **Tail RPCs** (`grpc_tail.rs`): + - `VolumeTailSender` — stream new needles since timestamp (streaming) + - `VolumeTailReceiver` — connect to another volume server and tail its changes + +10.8. **Erasure coding RPCs** (`grpc_erasure_coding.rs`): + - `VolumeEcShardsGenerate` — generate EC shards from volume + - `VolumeEcShardsRebuild` — rebuild missing shards + - `VolumeEcShardsCopy` — copy shards from another server + - `VolumeEcShardsDelete` — delete EC shards + - `VolumeEcShardsMount` / `VolumeEcShardsUnmount` + - `VolumeEcShardRead` — read from EC shard (streaming) + - `VolumeEcBlobDelete` — mark blob deleted in EC volume + - `VolumeEcShardsToVolume` — reconstruct volume from EC shards + - `VolumeEcShardsInfo` — return shard metadata + +10.9. **Scrub RPCs** (`grpc_scrub.rs`): + - `ScrubVolume` — integrity check volumes (INDEX / FULL / LOCAL modes) + - `ScrubEcVolume` — integrity check EC volumes + +10.10. **Tier RPCs** (`grpc_tier.rs`): + - `VolumeTierMoveDatToRemote` — upload to remote backend (streaming progress) + - `VolumeTierMoveDatFromRemote` — download from remote (streaming progress) + +10.11. **Remote storage** (`grpc_remote.rs`): + - `FetchAndWriteNeedle` — fetch from remote storage, write locally, replicate + +10.12. **Query** (`grpc_query.rs`): + - `Query` — experimental CSV/JSON/Parquet select on stored data (streaming) + +10.13. **Master heartbeat** (`grpc_client_to_master.rs`): + - `heartbeat()` background task — periodic gRPC stream to master + - Send: volume info, EC shard info, disk stats, has-no-space flags, deleted volumes + - Receive: volume size limit, leader address, metrics config + - Reconnect on failure with backoff + - `StopHeartbeat()` for graceful shutdown + +10.14. **Tests:** + - Integration test per RPC: call via tonic client → verify response + - Streaming RPCs: verify all chunks received + - Error cases: invalid volume ID, non-existent volume, etc. + - Heartbeat: mock master gRPC server, verify registration + +--- + +### Phase 11: Startup, Lifecycle & Graceful Shutdown +**Goal:** Full server startup matching Go's `runVolume()` and `startVolumeServer()`. + +**Steps:** + +11.1. **Startup sequence** (match `volume.go` exactly): + 1. Load security configuration from `security.toml` + 2. Start metrics server on metrics port + 3. Parse folder/max/minFreeSpace/diskType/tags + 4. Validate all directory writable + 5. Resolve IP, bind IP, public URL, gRPC port + 6. Create `VolumeServer` struct + 7. Check with master (initial handshake) + 8. Create `Store` (loads all existing volumes from disk) + 9. Create security `Guard` + 10. Register HTTP routes on admin mux + 11. Optionally register public mux + 12. Start gRPC server on gRPC port + 13. Start public HTTP server (if separated) + 14. Start cluster HTTP server (with optional TLS) + 15. Start heartbeat background task + 16. Start metrics push loop + 17. Register SIGHUP handler for config reload + new volume loading + +11.2. **Graceful shutdown** (match Go exactly): + 1. On SIGINT/SIGTERM: + 2. Stop heartbeat (notify master we're leaving) + 3. Wait `preStopSeconds` + 4. Stop public HTTP server + 5. Stop cluster HTTP server + 6. Graceful stop gRPC server + 7. `volumeServer.Shutdown()` → `store.Close()` (flush all volumes) + +11.3. **Reload** (SIGHUP): + - Reload security config + - Update whitelist + - Load newly appeared volumes from disk + +11.4. **Tests:** + - Start server → send SIGTERM → verify clean shutdown + - Start server → SIGHUP → verify config reloaded + +--- + +### Phase 12: Integration & Cross-Compatibility Testing +**Goal:** Rust volume server is a drop-in replacement for Go volume server. + +**Steps:** + +12.1. **Binary compatibility tests:** + - Create volumes with Go volume server + - Start Rust volume server on same data directory + - Read all data → verify identical + - Write new data with Rust → read with Go → verify + +12.2. **API compatibility tests:** + - Run same HTTP requests against both Go and Rust servers + - Compare response bodies, headers, status codes + - Test all gRPC RPCs against both + +12.3. **Master interop test:** + - Start Go master server + - Register Rust volume server + - Verify heartbeat works + - Verify volume assignment works + - Upload via filer → stored on Rust volume server → read back + +12.4. **Performance benchmarks:** + - Throughput: sequential writes, sequential reads + - Latency: p50/p99 for read/write + - Concurrency: parallel reads/writes + - Compare Rust vs Go numbers + +12.5. **Edge cases:** + - Volume at max size + - Disk full handling + - Corrupt .dat file recovery + - Network partition during replication + - EC shard loss + rebuild + +--- + +## Execution Order & Dependencies + +``` +Phase 1 (Skeleton + CLI) ← no deps, start here + ↓ +Phase 2 (Storage types) ← needs Phase 1 (types used everywhere) + ↓ +Phase 3 (Volume struct) ← needs Phase 2 + ↓ +Phase 4 (Store manager) ← needs Phase 3 + ↓ +Phase 7 (Security) ← independent, can parallel with 3-4 +Phase 8 (Metrics) ← independent, can parallel with 3-4 + ↓ +Phase 9 (HTTP server) ← needs Phase 4 + 7 + 8 +Phase 10 (gRPC server) ← needs Phase 4 + 7 + 8 + ↓ +Phase 5 (Erasure coding) ← needs Phase 4, wire into Phase 10 +Phase 6 (Tiered storage) ← needs Phase 4, wire into Phase 10 + ↓ +Phase 11 (Startup + shutdown) ← needs Phase 9 + 10 + ↓ +Phase 12 (Integration tests) ← needs all above +``` + +## Estimated Scope + +| Phase | Estimated Rust Lines | Complexity | +|---|---|---| +| 1. Skeleton + CLI | ~400 | Low | +| 2. Storage types | ~2,000 | High (binary compat critical) | +| 3. Volume struct | ~2,500 | High | +| 4. Store manager | ~1,000 | Medium | +| 5. Erasure coding | ~3,000 | High | +| 6. Tiered storage | ~1,500 | Medium | +| 7. Security | ~500 | Medium | +| 8. Metrics | ~300 | Low | +| 9. HTTP server | ~2,000 | High | +| 10. gRPC server | ~3,500 | High | +| 11. Startup/shutdown | ~500 | Medium | +| 12. Integration tests | ~2,000 | Medium | +| **Total** | **~19,000** | | + +## Critical Invariants to Preserve + +1. **Binary format compatibility** — Rust must read/write `.dat`, `.idx`, `.vif`, `.ecX` files identically to Go. A single byte off = data loss. +2. **gRPC wire compatibility** — Same proto, same field semantics. Go master must talk to Rust volume server seamlessly. +3. **HTTP API compatibility** — Same URL patterns, same JSON response shapes, same headers, same status codes. +4. **Replication protocol** — Write replication between Go and Rust volume servers must work bidirectionally. +5. **Heartbeat protocol** — Rust volume server must register with Go master and maintain heartbeat. +6. **CRC32 algorithm** — Must use IEEE polynomial (same as Go's `crc32.ChecksumIEEE`). +7. **JWT compatibility** — Tokens signed by Go filer/master must be verifiable by Rust volume server and vice versa. diff --git a/docker/Dockerfile.go_build b/docker/Dockerfile.go_build index c1c9a523e..3b8e120ed 100644 --- a/docker/Dockerfile.go_build +++ b/docker/Dockerfile.go_build @@ -16,9 +16,31 @@ RUN cd /go/src/github.com/seaweedfs/seaweedfs/weed \ && export LDFLAGS="-X github.com/seaweedfs/seaweedfs/weed/util/version.COMMIT=$(git rev-parse --short HEAD)" \ && CGO_ENABLED=0 go install -tags "$TAGS" -ldflags "-extldflags -static ${LDFLAGS}" +# Rust volume server builder (amd64/arm64 only) +FROM rust:1-alpine as rust_builder +ARG TARGETARCH +RUN apk add musl-dev protobuf-dev git +COPY --from=builder /go/src/github.com/seaweedfs/seaweedfs/seaweed-volume /build/seaweed-volume +COPY --from=builder /go/src/github.com/seaweedfs/seaweedfs/proto /build/proto +WORKDIR /build/seaweed-volume +ARG TAGS +RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \ + if [ "$TAGS" = "5BytesOffset" ]; then \ + cargo build --release; \ + else \ + cargo build --release --no-default-features; \ + fi && \ + cp target/release/weed-volume /weed-volume; \ + else \ + echo "Skipping Rust build for $TARGETARCH (unsupported)" && \ + touch /weed-volume; \ + fi + FROM alpine AS final LABEL author="Chris Lu" COPY --from=builder /go/bin/weed /usr/bin/ +# Copy Rust volume server binary (real binary on amd64/arm64, empty placeholder on other platforms) +COPY --from=rust_builder /weed-volume /usr/bin/weed-volume RUN mkdir -p /etc/seaweedfs COPY --from=builder /go/src/github.com/seaweedfs/seaweedfs/docker/filer.toml /etc/seaweedfs/filer.toml COPY --from=builder /go/src/github.com/seaweedfs/seaweedfs/docker/entrypoint.sh /entrypoint.sh diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 7d8bd24f2..6632f6645 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -72,6 +72,20 @@ case "$1" in exec /usr/bin/weed -logtostderr=true volume $ARGS $@ ;; + 'volume-rust') + ARGS="-dir /data -max 0" + if isArgPassed "-max" "$@"; then + ARGS="-dir /data" + fi + shift + if [ ! -s /usr/bin/weed-volume ]; then + echo "Error: Rust volume server is not available on this platform ($(uname -m))." >&2 + echo "Use 'volume' for the Go volume server instead." >&2 + exit 1 + fi + exec /usr/bin/weed-volume $ARGS $@ + ;; + 'server') ARGS="-dir=/data -volume.max=0 -master.volumeSizeLimitMB=1024" if isArgPassed "-volume.max" "$@"; then diff --git a/install.sh b/install.sh new file mode 100755 index 000000000..86b45f165 --- /dev/null +++ b/install.sh @@ -0,0 +1,275 @@ +#!/bin/bash +# +# SeaweedFS Installer +# Downloads Go and/or Rust binaries from GitHub releases. +# +# Usage: +# curl -fsSL https://raw.githubusercontent.com/seaweedfs/seaweedfs/master/install.sh | bash +# curl -fsSL ... | bash -s -- --component volume-rust --large-disk +# curl -fsSL ... | bash -s -- --version v3.93 --dir /usr/local/bin +# +# Options: +# --component COMP Which binary to install: weed, volume-rust, all (default: weed) +# --version VER Release version tag (default: latest) +# --large-disk Use large disk variant (5-byte offset, 8TB max volume) +# --dir DIR Installation directory (default: /usr/local/bin) +# --help Show this help message + +set -euo pipefail + +REPO="seaweedfs/seaweedfs" +COMPONENT="weed" +VERSION="" +LARGE_DISK=false +INSTALL_DIR="/usr/local/bin" + +# Colors (if terminal supports them) +if [ -t 1 ]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + NC='\033[0m' +else + RED='' GREEN='' YELLOW='' BLUE='' NC='' +fi + +info() { echo -e "${BLUE}[info]${NC} $*"; } +ok() { echo -e "${GREEN}[ok]${NC} $*"; } +warn() { echo -e "${YELLOW}[warn]${NC} $*"; } +error() { echo -e "${RED}[error]${NC} $*" >&2; exit 1; } + +usage() { + sed -n '/^# Usage:/,/^$/p' "$0" | sed 's/^# \?//' + exit 0 +} + +# Parse arguments +while [ $# -gt 0 ]; do + case "$1" in + --component) COMPONENT="$2"; shift 2 ;; + --version) VERSION="$2"; shift 2 ;; + --large-disk) LARGE_DISK=true; shift ;; + --dir) INSTALL_DIR="$2"; shift 2 ;; + --help|-h) usage ;; + *) error "Unknown option: $1. Use --help for usage." ;; + esac +done + +# Detect OS and architecture +detect_platform() { + local os arch + + case "$(uname -s)" in + Linux*) os="linux" ;; + Darwin*) os="darwin" ;; + MINGW*|MSYS*|CYGWIN*) os="windows" ;; + FreeBSD*) os="freebsd" ;; + *) error "Unsupported OS: $(uname -s)" ;; + esac + + case "$(uname -m)" in + x86_64|amd64) arch="amd64" ;; + aarch64|arm64) arch="arm64" ;; + armv7l|armv6l) arch="arm" ;; + *) error "Unsupported architecture: $(uname -m)" ;; + esac + + echo "${os}" "${arch}" +} + +# Get latest release tag from GitHub API +get_latest_version() { + local url="https://api.github.com/repos/${REPO}/releases/latest" + if command -v curl &>/dev/null; then + curl -fsSL "$url" | grep '"tag_name"' | head -1 | sed 's/.*"tag_name": *"\([^"]*\)".*/\1/' + elif command -v wget &>/dev/null; then + wget -qO- "$url" | grep '"tag_name"' | head -1 | sed 's/.*"tag_name": *"\([^"]*\)".*/\1/' + else + error "Neither curl nor wget found. Please install one." + fi +} + +# Download a file +download() { + local url="$1" dest="$2" + info "Downloading ${url}" + if command -v curl &>/dev/null; then + curl -fsSL -o "$dest" "$url" + elif command -v wget &>/dev/null; then + wget -qO "$dest" "$url" + fi +} + +# Build Go weed binary asset name +go_asset_name() { + local os="$1" arch="$2" + local suffix="${os}_${arch}" + if [ "$LARGE_DISK" = true ]; then + suffix="${suffix}_large_disk" + fi + echo "${suffix}.tar.gz" +} + +# Build Rust volume server asset name +rust_asset_name() { + local os="$1" arch="$2" + local prefix="weed-volume" + if [ "$LARGE_DISK" = true ]; then + prefix="weed-volume_large_disk" + else + prefix="weed-volume" + fi + local suffix="${os}_${arch}" + if [ "$os" = "windows" ]; then + echo "${prefix}_${suffix}.zip" + else + echo "${prefix}_${suffix}.tar.gz" + fi +} + +# Install a single component +install_component() { + local component="$1" os="$2" arch="$3" + local asset_name download_url tmpdir + + tmpdir="$(mktemp -d)" + trap "rm -rf '$tmpdir'" EXIT + + case "$component" in + weed) + asset_name="$(go_asset_name "$os" "$arch")" + download_url="https://github.com/${REPO}/releases/download/${VERSION}/${asset_name}" + download "$download_url" "${tmpdir}/${asset_name}" + + info "Extracting ${asset_name}..." + tar xzf "${tmpdir}/${asset_name}" -C "$tmpdir" + + # The Go release action puts the binary inside a directory + local weed_bin + weed_bin="$(find "$tmpdir" -name 'weed' -type f | head -1)" + if [ -z "$weed_bin" ]; then + weed_bin="$(find "$tmpdir" -name 'weed.exe' -type f | head -1)" + fi + if [ -z "$weed_bin" ]; then + error "Could not find weed binary in archive" + fi + + chmod +x "$weed_bin" + install_binary "$weed_bin" "weed" + ok "Installed weed to ${INSTALL_DIR}/weed" + ;; + + volume-rust) + # Check platform support for Rust volume server + case "$os" in + linux|darwin|windows) ;; + *) error "Rust volume server is not available for ${os}. Supported: linux, darwin, windows" ;; + esac + case "$arch" in + amd64|arm64) ;; + *) error "Rust volume server is not available for ${arch}. Supported: amd64, arm64" ;; + esac + + asset_name="$(rust_asset_name "$os" "$arch")" + download_url="https://github.com/${REPO}/releases/download/${VERSION}/${asset_name}" + download "$download_url" "${tmpdir}/${asset_name}" + + info "Extracting ${asset_name}..." + if [ "$os" = "windows" ]; then + unzip -q "${tmpdir}/${asset_name}" -d "$tmpdir" + else + tar xzf "${tmpdir}/${asset_name}" -C "$tmpdir" + fi + + local rust_bin + if [ "$LARGE_DISK" = true ]; then + rust_bin="$(find "$tmpdir" -name 'weed-volume-large-disk*' -type f | head -1)" + else + rust_bin="$(find "$tmpdir" -name 'weed-volume-normal*' -type f | head -1)" + fi + if [ -z "$rust_bin" ]; then + rust_bin="$(find "$tmpdir" -name 'weed-volume*' -type f | head -1)" + fi + if [ -z "$rust_bin" ]; then + error "Could not find weed-volume binary in archive" + fi + + chmod +x "$rust_bin" + local dest_name="weed-volume" + if [ "$os" = "windows" ]; then + dest_name="weed-volume.exe" + fi + install_binary "$rust_bin" "$dest_name" + ok "Installed weed-volume to ${INSTALL_DIR}/${dest_name}" + ;; + + *) + error "Unknown component: ${component}. Use: weed, volume-rust, all" + ;; + esac +} + +# Copy binary to install dir, using sudo if needed +install_binary() { + local src="$1" name="$2" + local dest="${INSTALL_DIR}/${name}" + + mkdir -p "$INSTALL_DIR" 2>/dev/null || true + + if [ -w "$INSTALL_DIR" ]; then + cp "$src" "$dest" + else + info "Need elevated permissions to write to ${INSTALL_DIR}" + sudo cp "$src" "$dest" + fi + chmod +x "$dest" 2>/dev/null || sudo chmod +x "$dest" +} + +main() { + info "SeaweedFS Installer" + + read -r os arch <<< "$(detect_platform)" + info "Detected platform: ${os}/${arch}" + + if [ -z "$VERSION" ]; then + info "Resolving latest release..." + VERSION="$(get_latest_version)" + if [ -z "$VERSION" ]; then + error "Could not determine latest version. Specify with --version" + fi + fi + info "Version: ${VERSION}" + + if [ "$LARGE_DISK" = true ]; then + info "Variant: large disk (8TB max volume)" + else + info "Variant: normal (32GB max volume)" + fi + + case "$COMPONENT" in + all) + install_component "weed" "$os" "$arch" + install_component "volume-rust" "$os" "$arch" + ;; + *) + install_component "$COMPONENT" "$os" "$arch" + ;; + esac + + echo "" + ok "Installation complete!" + if [ "$COMPONENT" = "weed" ] || [ "$COMPONENT" = "all" ]; then + info " weed: ${INSTALL_DIR}/weed" + fi + if [ "$COMPONENT" = "volume-rust" ] || [ "$COMPONENT" = "all" ]; then + info " weed-volume: ${INSTALL_DIR}/weed-volume" + fi + echo "" + info "Quick start:" + info " weed master # Start master server" + info " weed volume -mserver=localhost:9333 # Start Go volume server" + info " weed-volume -mserver localhost:9333 # Start Rust volume server" +} + +main diff --git a/seaweed-volume/Cargo.lock b/seaweed-volume/Cargo.lock new file mode 100644 index 000000000..b5401c9a5 --- /dev/null +++ b/seaweed-volume/Cargo.lock @@ -0,0 +1,5255 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "aligned-vec" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "asn1-rs" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048" +dependencies = [ + "asn1-rs-derive", + "asn1-rs-impl", + "displaydoc", + "nom", + "num-traits", + "rusticata-macros", + "thiserror 1.0.69", + "time", +] + +[[package]] +name = "asn1-rs-derive" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "asn1-rs-impl" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "aws-config" +version = "1.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11493b0bad143270fb8ad284a096dd529ba91924c5409adeac856cc1bf047dbc" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "hex", + "http 1.4.0", + "sha1", + "time", + "tokio", + "tracing", + "url", + "zeroize", +] + +[[package]] +name = "aws-credential-types" +version = "1.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f20799b373a1be121fe3005fba0c2090af9411573878f224df44b42727fcaf7" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "zeroize", +] + +[[package]] +name = "aws-lc-rs" +version = "1.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + +[[package]] +name = "aws-runtime" +version = "1.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc0651c57e384202e47153c1260b84a9936e19803d747615edf199dc3b98d17" +dependencies = [ + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "bytes-utils", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "percent-encoding", + "pin-project-lite", + "tracing", + "uuid", +] + +[[package]] +name = "aws-sdk-s3" +version = "1.125.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "223f5c95650d9557925a91f4c2db3def189e8f659452134a29e5cd2d37d708ed" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-checksums", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "bytes", + "fastrand", + "hex", + "hmac", + "http 0.2.12", + "http 1.4.0", + "http-body 1.0.1", + "lru", + "percent-encoding", + "regex-lite", + "sha2", + "tracing", + "url", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.96.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f64a6eded248c6b453966e915d32aeddb48ea63ad17932682774eb026fbef5b1" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.98.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db96d720d3c622fcbe08bae1c4b04a72ce6257d8b0584cb5418da00ae20a344f" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sts" +version = "1.100.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fafbdda43b93f57f699c5dfe8328db590b967b8a820a13ccdd6687355dfcc7ca" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sigv4" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0b660013a6683ab23797778e21f1f854744fdf05f68204b4cca4c8c04b5d1f4" +dependencies = [ + "aws-credential-types", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "crypto-bigint 0.5.5", + "form_urlencoded", + "hex", + "hmac", + "http 0.2.12", + "http 1.4.0", + "p256 0.11.1", + "percent-encoding", + "ring", + "sha2", + "subtle", + "time", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-async" +version = "1.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ffcaf626bdda484571968400c326a244598634dc75fd451325a54ad1a59acfc" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "aws-smithy-checksums" +version = "0.64.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6750f3dd509b0694a4377f0293ed2f9630d710b1cebe281fa8bac8f099f88bc6" +dependencies = [ + "aws-smithy-http", + "aws-smithy-types", + "bytes", + "crc-fast", + "hex", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "md-5", + "pin-project-lite", + "sha1", + "sha2", + "tracing", +] + +[[package]] +name = "aws-smithy-eventstream" +version = "0.60.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf09d74e5e32f76b8762da505a3cd59303e367a664ca67295387baa8c1d7548" +dependencies = [ + "aws-smithy-types", + "bytes", + "crc32fast", +] + +[[package]] +name = "aws-smithy-http" +version = "0.63.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba1ab2dc1c2c3749ead27180d333c42f11be8b0e934058fb4b2258ee8dbe5231" +dependencies = [ + "aws-smithy-eventstream", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "bytes-utils", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + +[[package]] +name = "aws-smithy-http-client" +version = "1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a2f165a7feee6f263028b899d0a181987f4fa7179a6411a32a439fba7c5f769" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "h2", + "http 1.4.0", + "hyper", + "hyper-rustls", + "hyper-util", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower 0.5.3", + "tracing", +] + +[[package]] +name = "aws-smithy-json" +version = "0.62.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9648b0bb82a2eedd844052c6ad2a1a822d1f8e3adee5fbf668366717e428856a" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-observability" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06c2315d173edbf1920da8ba3a7189695827002e4c0fc961973ab1c54abca9c" +dependencies = [ + "aws-smithy-runtime-api", +] + +[[package]] +name = "aws-smithy-query" +version = "0.60.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a56d79744fb3edb5d722ef79d86081e121d3b9422cb209eb03aea6aa4f21ebd" +dependencies = [ + "aws-smithy-types", + "urlencoding", +] + +[[package]] +name = "aws-smithy-runtime" +version = "1.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "028999056d2d2fd58a697232f9eec4a643cf73a71cf327690a7edad1d2af2110" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-http-client", + "aws-smithy-observability", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", + "pin-utils", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" +version = "1.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "876ab3c9c29791ba4ba02b780a3049e21ec63dabda09268b175272c3733a79e6" +dependencies = [ + "aws-smithy-async", + "aws-smithy-types", + "bytes", + "http 0.2.12", + "http 1.4.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-types" +version = "1.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b1117b3b2bbe166d11199b540ceed0d0f7676e36e7b962b5a437a9971eac75" +dependencies = [ + "base64-simd", + "bytes", + "bytes-utils", + "futures-core", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "itoa", + "num-integer", + "pin-project-lite", + "pin-utils", + "ryu", + "serde", + "time", + "tokio", + "tokio-util", +] + +[[package]] +name = "aws-smithy-xml" +version = "0.60.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce02add1aa3677d022f8adf81dcbe3046a95f17a1b1e8979c145cd21d3d22b3" +dependencies = [ + "xmlparser", +] + +[[package]] +name = "aws-types" +version = "1.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47c8323699dd9b3c8d5b3c13051ae9cdef58fd179957c882f8374dd8725962d9" +dependencies = [ + "aws-credential-types", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "rustc_version", + "tracing", +] + +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "multer", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower 0.5.3", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-link", +] + +[[package]] +name = "base16ct" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" + +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + +[[package]] +name = "base64ct" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "bytes-utils" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +dependencies = [ + "bytes", + "either", +] + +[[package]] +name = "cc" +version = "1.2.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "clap" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" + +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", +] + +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpp_demangle" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0667304c32ea56cb4cd6d2d7c0cfe9a2f8041229db8c033af7f8d69492429def" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + +[[package]] +name = "crc-fast" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d" +dependencies = [ + "crc", + "digest", + "rustversion", + "spin 0.10.0", +] + +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crypto-bigint" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", + "zeroize", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core 0.9.12", +] + +[[package]] +name = "data-encoding" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea" + +[[package]] +name = "debugid" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" +dependencies = [ + "uuid", +] + +[[package]] +name = "der" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +dependencies = [ + "const-oid", + "zeroize", +] + +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "der-parser" +version = "9.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553" +dependencies = [ + "asn1-rs", + "displaydoc", + "nom", + "num-bigint", + "num-traits", + "rusticata-macros", +] + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "ecdsa" +version = "0.14.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" +dependencies = [ + "der 0.6.1", + "elliptic-curve 0.12.3", + "rfc6979 0.3.1", + "signature 1.6.4", +] + +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der 0.7.10", + "digest", + "elliptic-curve 0.13.8", + "rfc6979 0.4.0", + "signature 2.2.0", + "spki 0.7.3", +] + +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "pkcs8 0.10.2", + "signature 2.2.0", +] + +[[package]] +name = "ed25519-dalek" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9" +dependencies = [ + "curve25519-dalek", + "ed25519", + "serde", + "sha2", + "subtle", + "zeroize", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "elliptic-curve" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" +dependencies = [ + "base16ct 0.1.1", + "crypto-bigint 0.4.9", + "der 0.6.1", + "digest", + "ff 0.12.1", + "generic-array", + "group 0.12.1", + "pkcs8 0.9.0", + "rand_core 0.6.4", + "sec1 0.3.0", + "subtle", + "zeroize", +] + +[[package]] +name = "elliptic-curve" +version = "0.13.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct 0.2.0", + "crypto-bigint 0.5.5", + "digest", + "ff 0.13.1", + "generic-array", + "group 0.13.0", + "hkdf", + "pem-rfc7468", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "sec1 0.7.3", + "subtle", + "zeroize", +] + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "ff" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "ff" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "findshlibs" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", + "winapi", +] + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" +dependencies = [ + "typenum", + "version_check", + "zeroize", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi 5.3.0", + "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "wasip2", + "wasip3", +] + +[[package]] +name = "gif" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5df2ba84018d80c213569363bdcd0c64e6933c67fe4c1d60ecf822971a3c35e" +dependencies = [ + "color_quant", + "weezl", +] + +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + +[[package]] +name = "group" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" +dependencies = [ + "ff 0.12.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff 0.13.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "h2" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http 1.4.0", + "indexmap 2.13.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.4.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.4.0", + "http-body 1.0.1", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2", + "http 1.4.0", + "http-body 1.0.1", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http 1.4.0", + "hyper", + "hyper-util", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64", + "bytes", + "futures-channel", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2 0.6.3", + "system-configuration", + "tokio", + "tower-service", + "tracing", + "windows-registry", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core 0.62.2", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" + +[[package]] +name = "icu_properties" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" + +[[package]] +name = "icu_provider" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "image" +version = "0.25.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6506c6c10786659413faa717ceebcb8f70731c0a60cbae39795fdf114519c1a" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "gif", + "image-webp", + "moxcms", + "num-traits", + "png", + "zune-core", + "zune-jpeg", +] + +[[package]] +name = "image-webp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" +dependencies = [ + "byteorder-lite", + "quick-error", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "ipnet" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" + +[[package]] +name = "iri-string" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "jsonwebtoken" +version = "10.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0529410abe238729a60b108898784df8984c87f6054c9c4fcacc47e4803c1ce1" +dependencies = [ + "base64", + "ed25519-dalek", + "getrandom 0.2.17", + "hmac", + "js-sys", + "p256 0.13.2", + "p384", + "pem", + "rand 0.8.5", + "rsa", + "serde", + "serde_json", + "sha2", + "signature 2.2.0", + "simple_asn1", +] + +[[package]] +name = "kamadak-exif" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef4fc70d0ab7e5b6bafa30216a6b48705ea964cdfc29c050f2412295eba58077" +dependencies = [ + "mutate_once", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +dependencies = [ + "spin 0.9.8", +] + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "libc" +version = "0.2.182" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "lru" +version = "0.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +dependencies = [ + "hashbrown 0.16.1", +] + +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mime_guess" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "mio" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "moxcms" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac9557c559cd6fc9867e122e20d2cbefc9ca29d80d027a8e39310920ed2f0a97" +dependencies = [ + "num-traits", + "pxfm", +] + +[[package]] +name = "multer" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83e87776546dc87511aa5ee218730c92b666d7264ab6ed41f9d215af9cd5224b" +dependencies = [ + "bytes", + "encoding_rs", + "futures-util", + "http 1.4.0", + "httparse", + "memchr", + "mime", + "spin 0.9.8", + "version_check", +] + +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "mutate_once" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13d2233c9842d08cfe13f9eac96e207ca6a2ea10b80259ebe8ad0268be27d2af" + +[[package]] +name = "native-tls" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint-dig" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e661dda6640fad38e827a6d4a310ff4763082116fe217f279885c97f511bb0b7" +dependencies = [ + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + +[[package]] +name = "num-conv" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "oid-registry" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9" +dependencies = [ + "asn1-rs", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags 2.11.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "openssl-sys" +version = "0.9.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "outref" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" + +[[package]] +name = "p256" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" +dependencies = [ + "ecdsa 0.14.8", + "elliptic-curve 0.12.3", + "sha2", +] + +[[package]] +name = "p256" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" +dependencies = [ + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder", + "sha2", +] + +[[package]] +name = "p384" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe42f1670a52a47d448f14b6a5c61dd78fce51856e68edaa38f7ae3a46b8d6b6" +dependencies = [ + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder", + "sha2", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core 0.8.6", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core 0.9.12", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall 0.2.16", + "smallvec", + "winapi", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.5.18", + "smallvec", + "windows-link", +] + +[[package]] +name = "pem" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be" +dependencies = [ + "base64", + "serde_core", +] + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset 0.4.2", + "indexmap 2.13.0", +] + +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset 0.5.7", + "indexmap 2.13.0", +] + +[[package]] +name = "pin-project" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der 0.7.10", + "pkcs8 0.10.2", + "spki 0.7.3", +] + +[[package]] +name = "pkcs8" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" +dependencies = [ + "der 0.6.1", + "spki 0.6.0", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der 0.7.10", + "spki 0.7.3", +] + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "png" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags 2.11.0", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + +[[package]] +name = "potential_utf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +dependencies = [ + "zerovec", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "pprof" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38a01da47675efa7673b032bf8efd8214f1917d89685e07e395ab125ea42b187" +dependencies = [ + "aligned-vec", + "backtrace", + "cfg-if", + "findshlibs", + "libc", + "log", + "nix", + "once_cell", + "prost 0.12.6", + "prost-build 0.12.6", + "prost-derive 0.12.6", + "sha2", + "smallvec", + "spin 0.10.0", + "symbolic-demangle", + "tempfile", + "thiserror 2.0.18", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "primeorder" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" +dependencies = [ + "elliptic-curve 0.13.8", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.11.0", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.44", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.11.0", + "hex", +] + +[[package]] +name = "prometheus" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "libc", + "memchr", + "parking_lot 0.12.5", + "procfs", + "thiserror 1.0.69", +] + +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive 0.12.6", +] + +[[package]] +name = "prost" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" +dependencies = [ + "bytes", + "prost-derive 0.13.5", +] + +[[package]] +name = "prost-build" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +dependencies = [ + "bytes", + "heck", + "itertools 0.12.1", + "log", + "multimap", + "once_cell", + "petgraph 0.6.5", + "prettyplease", + "prost 0.12.6", + "prost-types 0.12.6", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-build" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" +dependencies = [ + "heck", + "itertools 0.14.0", + "log", + "multimap", + "once_cell", + "petgraph 0.7.1", + "prettyplease", + "prost 0.13.5", + "prost-types 0.13.5", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools 0.12.1", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-derive" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +dependencies = [ + "prost 0.12.6", +] + +[[package]] +name = "prost-types" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" +dependencies = [ + "prost 0.13.5", +] + +[[package]] +name = "pxfm" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a041e753da8b807c9255f28de81879c78c876392ff2469cde94799b2896b9d" + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2 0.6.3", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.2", + "ring", + "rustc-hash", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2 0.6.3", + "tracing", + "windows-sys 0.60.2", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redb" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae323eb086579a3769daa2c753bb96deb95993c534711e0dbe881b5192906a06" +dependencies = [ + "libc", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags 2.11.0", +] + +[[package]] +name = "reed-solomon-erasure" +version = "6.0.0" +dependencies = [ + "libm", + "lru", + "parking_lot 0.11.2", + "smallvec", + "spin 0.9.8", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-lite" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-tls", + "hyper-util", + "js-sys", + "log", + "mime", + "mime_guess", + "native-tls", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-native-tls", + "tokio-rustls", + "tokio-util", + "tower 0.5.3", + "tower-http 0.6.8", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", +] + +[[package]] +name = "rfc6979" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" +dependencies = [ + "crypto-bigint 0.4.9", + "hmac", + "zeroize", +] + +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rsa" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8573f03f5883dcaebdfcf4725caa1ecb9c15b2ef50c43a07b816e06799bb12d" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "signature 2.2.0", + "spki 0.7.3", + "subtle", + "zeroize", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rusticata-macros" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632" +dependencies = [ + "nom", +] + +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags 2.11.0", + "errno 0.3.14", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags 2.11.0", + "errno 0.3.14", + "libc", + "linux-raw-sys 0.12.1", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls" +version = "0.23.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +dependencies = [ + "aws-lc-rs", + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "rusty-leveldb" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c48d2f060dd1286adc9c3d179cb5af1292a9d2fcf291abcfe056023fc1977b44" +dependencies = [ + "crc", + "errno 0.2.8", + "fs2", + "integer-encoding", + "rand 0.8.5", + "snap", +] + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "schannel" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "sec1" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" +dependencies = [ + "base16ct 0.1.1", + "der 0.6.1", + "generic-array", + "pkcs8 0.9.0", + "subtle", + "zeroize", +] + +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct 0.2.0", + "der 0.7.10", + "generic-array", + "pkcs8 0.10.2", + "subtle", + "zeroize", +] + +[[package]] +name = "security-framework" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" +dependencies = [ + "bitflags 2.11.0", + "core-foundation 0.10.1", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa", + "serde", + "serde_core", +] + +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno 0.3.14", + "libc", +] + +[[package]] +name = "signature" +version = "1.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "simple_asn1" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d585997b0ac10be3c5ee635f1bab02d512760d14b7c468801ac8a01d9ae5f1d" +dependencies = [ + "num-bigint", + "num-traits", + "thiserror 2.0.18", + "time", +] + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "socket2" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "spin" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" +dependencies = [ + "base64ct", + "der 0.6.1", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der 0.7.10", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "symbolic-common" +version = "12.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "751a2823d606b5d0a7616499e4130a516ebd01a44f39811be2b9600936509c23" +dependencies = [ + "debugid", + "memmap2", + "stable_deref_trait", + "uuid", +] + +[[package]] +name = "symbolic-demangle" +version = "12.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79b237cfbe320601dd24b4ac817a5b68bb28f5508e33f08d42be0682cadc8ac9" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "sysinfo" +version = "0.31.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "355dbe4f8799b304b05e1b0f05fc59b2a18d36645cf169607da45bde2f69a1be" +dependencies = [ + "core-foundation-sys", + "libc", + "memchr", + "ntapi", + "rayon", + "windows", +] + +[[package]] +name = "system-configuration" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" +dependencies = [ + "bitflags 2.11.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix 1.1.4", + "windows-sys 0.61.2", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinystr" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +dependencies = [ + "bytes", + "libc", + "mio", + "parking_lot 0.12.5", + "pin-project-lite", + "signal-hook-registry", + "socket2 0.6.3", + "tokio-macros", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-io-timeout" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bd86198d9ee903fedd2f9a2e72014287c0d9167e4ae43b5853007205dda1b76" +dependencies = [ + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-macros" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap 2.13.0", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + +[[package]] +name = "tonic" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "prost 0.13.5", + "rustls-pemfile", + "socket2 0.5.10", + "tokio", + "tokio-rustls", + "tokio-stream", + "tower 0.4.13", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build 0.13.5", + "prost-types 0.13.5", + "quote", + "syn", +] + +[[package]] +name = "tonic-reflection" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "878d81f52e7fcfd80026b7fdb6a9b578b3c3653ba987f87f0dce4b64043cba27" +dependencies = [ + "prost 0.13.5", + "prost-types 0.13.5", + "tokio", + "tokio-stream", + "tonic", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand 0.8.5", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" +dependencies = [ + "bitflags 2.11.0", + "bytes", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "bitflags 2.11.0", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "iri-string", + "pin-project-lite", + "tower 0.5.3", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "uuid" +version = "1.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" +dependencies = [ + "getrandom 0.4.2", + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" +dependencies = [ + "cfg-if", + "futures-util", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.13.0", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags 2.11.0", + "hashbrown 0.15.5", + "indexmap 2.13.0", + "semver", +] + +[[package]] +name = "web-sys" +version = "0.3.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "weed-volume" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-stream", + "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-s3", + "aws-types", + "axum", + "base64", + "bytes", + "chrono", + "clap", + "crc32c", + "crc32fast", + "dashmap", + "flate2", + "futures", + "hex", + "http-body 1.0.1", + "hyper", + "hyper-util", + "image", + "jsonwebtoken", + "kamadak-exif", + "lazy_static", + "libc", + "md-5", + "memmap2", + "mime_guess", + "multer", + "parking_lot 0.12.5", + "pprof", + "prometheus", + "prost 0.13.5", + "prost-types 0.13.5", + "rand 0.8.5", + "redb", + "reed-solomon-erasure", + "reqwest", + "rustls", + "rustls-pemfile", + "rusty-leveldb", + "serde", + "serde_json", + "serde_urlencoded", + "sysinfo", + "tempfile", + "thiserror 1.0.69", + "tokio", + "tokio-io-timeout", + "tokio-rustls", + "tokio-stream", + "toml", + "tonic", + "tonic-build", + "tonic-reflection", + "tower 0.4.13", + "tower-http 0.5.2", + "tracing", + "tracing-subscriber", + "uuid", + "x509-parser", +] + +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" +dependencies = [ + "windows-core 0.57.0", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" +dependencies = [ + "windows-implement 0.57.0", + "windows-interface 0.57.0", + "windows-result 0.1.2", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement 0.60.2", + "windows-interface 0.59.3", + "windows-link", + "windows-result 0.4.1", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result 0.4.1", + "windows-strings", +] + +[[package]] +name = "windows-result" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.13.0", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags 2.11.0", + "indexmap 2.13.0", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.13.0", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + +[[package]] +name = "x509-parser" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69" +dependencies = [ + "asn1-rs", + "data-encoding", + "der-parser", + "lazy_static", + "nom", + "oid-registry", + "rusticata-macros", + "thiserror 1.0.69", + "time", +] + +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + +[[package]] +name = "yoke" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zerotrie" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-jpeg" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe" +dependencies = [ + "zune-core", +] diff --git a/seaweed-volume/Cargo.toml b/seaweed-volume/Cargo.toml new file mode 100644 index 000000000..6d77586a9 --- /dev/null +++ b/seaweed-volume/Cargo.toml @@ -0,0 +1,137 @@ +[package] +name = "weed-volume" +version = "0.1.0" +edition = "2021" +description = "SeaweedFS Volume Server — Rust implementation" + +[lib] +name = "seaweed_volume" + +[[bin]] +name = "weed-volume" +path = "src/main.rs" + +[features] +# Default: 5-byte offsets (8TB max volume size), matching production Go builds (-tags 5BytesOffset). +# Disable with --no-default-features for 4-byte offsets (32GB max volume size). +default = ["5bytes"] +5bytes = [] + +[dependencies] +# Async runtime +tokio = { version = "1", features = ["full"] } +tokio-stream = "0.1" +tokio-io-timeout = "1" + +# gRPC + protobuf +tonic = { version = "0.12", features = ["tls"] } +tonic-reflection = "0.12" +prost = "0.13" +prost-types = "0.13" + +# HTTP server +axum = { version = "0.7", features = ["multipart"] } +http-body = "1" +hyper = { version = "1", features = ["full"] } +hyper-util = { version = "0.1", features = ["tokio", "service", "server-auto", "http1", "http2"] } +tower = "0.4" +tower-http = { version = "0.5", features = ["cors", "trace"] } + +# CLI +clap = { version = "4", features = ["derive"] } + +# Metrics +prometheus = { version = "0.13", default-features = false, features = ["process"] } +lazy_static = "1" + +# JWT +jsonwebtoken = { version = "10", features = ["rust_crypto"] } + +# TLS +rustls = "0.23" +tokio-rustls = "0.26" +rustls-pemfile = "2" + +# LevelDB (via RocksDB for better Rust support) +# Using rusty-leveldb for pure Rust LevelDB +rusty-leveldb = "3" + +# Disk-backed needle map (alternative to in-memory HashMap) +redb = "3" + +# Reed-Solomon erasure coding +reed-solomon-erasure = "6" + +# Logging +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +pprof = { version = "0.15", features = ["prost-codec"] } + +# Config +toml = "0.8" +serde = { version = "1", features = ["derive"] } +serde_json = "1" +serde_urlencoded = "0.7" + +# CRC32 — using Castagnoli polynomial (CRC32-C), matching Go's crc32.Castagnoli +crc32c = "0.6" +crc32fast = "1" + +# Memory-mapped files +memmap2 = "0.9" + +# UUID +uuid = { version = "1", features = ["v4"] } + +# HTTP client (for proxying, remote fetch) +reqwest = { version = "0.12", features = ["rustls-tls", "stream", "multipart", "json"] } + +# Content hashing +md-5 = "0.10" +base64 = "0.22" + +# Compression +flate2 = "1" + +# Image processing +image = { version = "0.25", default-features = false, features = ["png", "jpeg", "gif", "webp"] } +kamadak-exif = "0.5" + +# Multipart form-data parsing +multer = "3" + +# MIME type guessing from file extensions +mime_guess = "2" + +# Misc +bytes = "1" +rand = "0.8" +chrono = "0.4" +hex = "0.4" +parking_lot = "0.12" +dashmap = "6" +thiserror = "1" +anyhow = "1" +async-trait = "0.1" +futures = "0.3" +async-stream = "0.3" +x509-parser = "0.16" + +# Disk space checking +sysinfo = "0.31" +libc = "0.2" + +# AWS S3 SDK (for remote storage backends) +aws-config = { version = "1", features = ["behavior-version-latest"] } +aws-sdk-s3 = { version = "1.125.0", default-features = false, features = ["sigv4a", "http-1x", "default-https-client", "rt-tokio"] } +aws-credential-types = "1" +aws-types = "1" + +[dev-dependencies] +tempfile = "3" + +[build-dependencies] +tonic-build = "0.12" + +[patch.crates-io] +reed-solomon-erasure = { path = "vendor/reed-solomon-erasure" } diff --git a/seaweed-volume/DEV_PLAN.md b/seaweed-volume/DEV_PLAN.md new file mode 100644 index 000000000..44b610538 --- /dev/null +++ b/seaweed-volume/DEV_PLAN.md @@ -0,0 +1,105 @@ +# Rust Volume Server — Dev Plan + +## Current Status (2026-03-07) + +**HTTP tests**: 53/53 pass (100%) +**gRPC tests**: 56/56 pass (100%) — includes TestVolumeMoveHandlesInFlightWrites with Rust multi-volume cluster +**Rust integration tests**: 8/8 pass +**S3 remote storage tests**: 3/3 pass +**Total**: 117/117 (100%) + 8 Rust + 3 S3 tests +**Rust unit tests**: 137 lib + 7 integration = 144 + +## Completed Features + +All phases from the original plan are complete: + +- **Phase 1** — HTTP Core: CORS, OPTIONS, unsupported methods, static assets, path routing, + cookie validation, conditional headers, range requests, dedup 204, content-encoding, + readDeleted, chunk manifests, multipart validation, MD5 check, file size limit, + upload/download throttling, image resize/crop, download disposition +- **Phase 2** — JWT/Security: signing keys from security.toml, token source precedence + (query > header > cookie), file_id claims, leeway=0 +- **Phase 3** — gRPC: maintenance mode, error message parity, ping routing, batch delete, + VolumeServerStatus (with real disk stats, data_center, rack), ReadVolumeFileStatus + (with timestamps) +- **Phase 4** — Streaming gRPC: VolumeIncrementalCopy, CopyFile, ReceiveFile, ReadAllNeedles, + VolumeTailSender, VolumeCopy, VolumeTailReceiver, VacuumVolumeCheck +- **Phase 5** — EC Shards: mount/unmount, delete, read, blob delete, rebuild, shards-to-volume, + copy, info +- **Phase 6** — Advanced gRPC: ScrubVolume, ScrubEcVolume, Query, FetchAndWriteNeedle, + VolumeTierMoveDat (error paths) +- **Phase 7** — Remote Storage: S3-compatible backend via aws-sdk-s3, + FetchAndWriteNeedle reads from S3/MinIO/SeaweedFS S3 and writes locally. + Supports all S3-compatible providers (AWS, Wasabi, Backblaze, Aliyun, etc.) +- **Master Heartbeat** — Bidirectional streaming SendHeartbeat RPC, volume/EC registration, + leader changes, shutdown deregistration. Tested end-to-end with Go master. +- **Production Sprint 1** — Quick wins: + - VolumeMarkReadonly master notification (triggers immediate heartbeat) + - Compaction throttling (`maybe_throttle_compaction()`) + - File size limit enforcement on upload + - `ts` query param for custom timestamps (upload + delete) + - TTL expiration check (was already implemented) + - Health check heartbeat status (returns 503 if disconnected from master) + - preStopSeconds graceful drain before shutdown + - S3 response passthrough headers (content-encoding, expires, content-language, content-disposition) + - .vif persistence for readonly state across restarts + - Webp image support for resize +- **Production Sprint 2** — Compatibility: + - MIME type extraction from Content-Type header + - Stats endpoints (/stats/counter, /stats/memory, /stats/disk) + - JSON pretty print (?pretty=y) and JSONP (?callback=fn) + - Request ID generation (UUID if x-amz-request-id missing) + - Advanced Prometheus metrics (INFLIGHT_REQUESTS, VOLUME_FILE_COUNT gauges) +- **Production Sprint 3** — Streaming & Multi-node: + - Streaming reads for large files (>1MB) via http_body::Body trait with spawn_blocking + - Meta-only needle reads (NeedleStreamInfo) to avoid loading full body for streaming + - Multi-volume Rust cluster support (RustMultiVolumeCluster test framework) + - TestVolumeMoveHandlesInFlightWrites now uses Rust volume servers + - CI skip list cleaned up (all tests pass with Rust) + +- **Production Sprint 4** — Advanced Features: + - BatchDelete EC shard support (ecx index lookup + ecj journal deletion) + - JPEG EXIF orientation auto-fix on upload (kamadak-exif + image crate) + - Async batched write processing (mpsc queue, up to 128 entries per batch) + - VolumeTierMoveDatToRemote/FromRemote (S3 multipart upload/download) + - S3TierRegistry for managing remote storage backends + - VolumeInfo (.vif) persistence for remote file references +- **Production Sprint 5** — Upload Compatibility: + - TTL query parameter extraction during upload (`ttl=3m`) + - Auto-compression for compressible file types (text/*, .js, .css, .json, .svg, etc.) + - Seaweed-* custom metadata headers stored as needle pairs (JSON, max 64KB) + - Filename extraction from URL path stored in needle name field + - Upload response includes filename + +- **Production Sprint 6** — Storage & Networking: + - Redb disk-backed needle maps (pure Rust, no C deps) via `NeedleMap` enum + - Binary search for `VolumeIncrementalCopy` with `since_ns > 0` + - Proxy/redirect read modes for non-local volumes (master lookup, HTTP proxy, 301 redirect) + +## Remaining Work (Production Readiness) + +No major remaining items. All phases and production sprints are complete. + +## Test Commands + +```bash +# Build +cd seaweed-volume && cargo build --release + +# Run all Go integration tests with Rust volume server +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 1200s ./test/volume_server/grpc/... ./test/volume_server/http/... + +# Run S3 remote storage tests +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 180s -run "TestFetchAndWriteNeedle(FromS3|S3NotFound)" ./test/volume_server/grpc/... + +# Run specific test +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 60s -run "TestName" ./test/volume_server/http/... + +# Run Rust unit tests +cd seaweed-volume && cargo test + +# Test heartbeat with Go master +weed master -port=9333 & +seaweed-volume --port 8080 --master localhost:9333 --dir /tmp/vol1 --max 7 +curl http://localhost:9333/dir/status # should show Rust volume server registered +``` diff --git a/seaweed-volume/MISSING_FEATURES.md b/seaweed-volume/MISSING_FEATURES.md new file mode 100644 index 000000000..807dd945c --- /dev/null +++ b/seaweed-volume/MISSING_FEATURES.md @@ -0,0 +1,288 @@ +# Rust Volume Server — Missing Features Audit + +Comprehensive line-by-line comparison of Go vs Rust volume server. +Generated 2026-03-07 from 4 parallel audits covering HTTP, gRPC, storage, and infrastructure. + +## Executive Summary + +| Area | Total Features | Implemented | Partial | Missing | +|------|---------------|-------------|---------|---------| +| gRPC RPCs | 48 | 43 (90%) | 2 (4%) | 3 (6%) | +| HTTP Handlers | 31 | 12 (39%) | 10 (32%) | 9 (29%) | +| Storage Layer | 22 | 6 (27%) | 7 (32%) | 9 (41%) | +| Infrastructure | 14 | 5 (36%) | 4 (29%) | 5 (36%) | + +--- + +## Priority 1 — Critical for Production + +### P1.1 Streaming / Meta-Only Reads +- **Go**: `ReadNeedleMeta()`, `ReadNeedleData()`, `ReadPagedData()` — reads only metadata or pages of large files +- **Go**: `streamWriteResponseContent()` streams needle data in chunks +- **Go**: `AttemptMetaOnly` / `MustMetaOnly` flags in `ReadOption` +- **Rust**: Reads entire needle into memory always +- **Impact**: OOM on large files; 8MB file = 8MB heap per request +- **Files**: `weed/storage/needle/needle_read.go`, `weed/server/volume_server_handlers_read.go` +- **Effort**: Medium + +### P1.2 Download Proxy/Redirect Fallback (ReadMode) +- **Go**: `ReadMode` config: "local" | "proxy" | "redirect" +- **Go**: `tryProxyToReplica()` probes replicas, `proxyReqToTargetServer()` streams response +- **Rust**: Always returns 404 for non-local volumes +- **Impact**: Clients must handle volume placement themselves; breaks transparent replication +- **Files**: `weed/server/volume_server_handlers_read.go:138-250` +- **Effort**: Medium + +### P1.3 TLS/HTTPS Support +- **Go**: `LoadServerTLS()`, `LoadClientTLS()`, cert/key loading from security.toml +- **Go**: Applied to both HTTP and gRPC servers +- **Rust**: No TLS at all — plain TCP only +- **Impact**: Cannot deploy in secure clusters +- **Files**: `weed/security/tls.go`, `weed/command/volume.go` +- **Effort**: Medium (rustls + tokio-rustls already in Cargo.toml) + +### P1.4 VolumeMarkReadonly/Writable Master Notification +- **Go**: `notifyMasterVolumeReadonly()` updates master with readonly state +- **Rust**: Only sets local in-memory flag +- **Impact**: Master keeps directing writes to readonly volume +- **Files**: `weed/server/volume_grpc_admin.go` +- **Effort**: Low + +### P1.5 Compaction/Maintenance Throttling +- **Go**: `WriteThrottler` with `MaybeSlowdown()` for MB/s rate limiting +- **Rust**: Flags parsed but no throttle implementation +- **Impact**: Compaction/copy operations can saturate disk IO +- **Files**: `weed/util/throttler.go` +- **Effort**: Low + +### P1.6 File Size Limit Enforcement +- **Go**: `fileSizeLimitBytes` checked on upload, returns 400 +- **Rust**: No enforcement — accepts any size +- **Impact**: Can write files larger than volume size limit +- **Files**: `weed/server/volume_server_handlers_write.go` +- **Effort**: Low + +--- + +## Priority 2 — Important for Compatibility + +### P2.1 `ts` Query Param (Custom Timestamps) +- **Go**: Upload and delete accept `ts` query param for custom Last-Modified time +- **Rust**: Always uses current time +- **Impact**: Replication timestamp fidelity; sync from external sources +- **Files**: `weed/server/volume_server_handlers_write.go`, `volume_server_handlers_admin.go` +- **Effort**: Low + +### P2.2 Multipart Form Upload Parsing +- **Go**: `needle.CreateNeedleFromRequest()` parses multipart forms, extracts MIME type, custom headers/pairs +- **Rust**: Reads raw body bytes only — no multipart form parsing for metadata +- **Impact**: MIME type not stored; custom needle pairs not supported +- **Files**: `weed/storage/needle/needle.go:CreateNeedleFromRequest` +- **Effort**: Medium + +### P2.3 JPEG Orientation Auto-Fix +- **Go**: `images.FixJpgOrientation()` on upload when enabled +- **Rust**: Not implemented (flag exists but unused) +- **Impact**: Mobile uploads may display rotated +- **Files**: `weed/images/orientation.go` +- **Effort**: Low (exif crate) + +### P2.4 TTL Expiration Enforcement +- **Go**: Checks `HasTtl()` + `AppendAtNs` against current time on read path +- **Rust**: TTL struct exists but no expiration checking +- **Impact**: Expired needles still served +- **Files**: `weed/storage/needle/volume_ttl.go`, `weed/storage/volume_read.go` +- **Effort**: Low + +### P2.5 Health Check — Master Heartbeat Status +- **Go**: Returns 503 if not heartbeating (can't reach master) +- **Rust**: Only checks `is_stopping` flag +- **Impact**: Load balancers won't detect disconnected volume servers +- **Files**: `weed/server/volume_server.go` +- **Effort**: Low + +### P2.6 Stats Endpoints +- **Go**: `/stats/counter`, `/stats/memory`, `/stats/disk` (whitelist-guarded) +- **Rust**: Not implemented +- **Impact**: No operational visibility +- **Files**: `weed/server/volume_server.go` +- **Effort**: Low + +### P2.7 Webp Image Support +- **Go**: `.webp` included in resize-eligible extensions +- **Rust**: Only `.png`, `.jpg`, `.jpeg`, `.gif` +- **Impact**: Webp images can't be resized on read +- **Files**: `weed/server/volume_server_handlers_read.go` +- **Effort**: Low (add webp feature to image crate) + +### P2.8 preStopSeconds Graceful Drain +- **Go**: Stops heartbeat, waits N seconds, then shuts down servers +- **Rust**: Immediate shutdown on signal +- **Impact**: In-flight requests dropped; Kubernetes readiness race +- **Files**: `weed/command/volume.go` +- **Effort**: Low + +### P2.9 S3 Response Passthrough Headers +- **Go**: `response-content-encoding`, `response-expires`, `response-content-language` query params +- **Rust**: Only handles `response-content-type`, `response-cache-control`, `dl` +- **Impact**: S3-compatible GET requests missing some override headers +- **Files**: `weed/server/volume_server_handlers_read.go` +- **Effort**: Low + +--- + +## Priority 3 — Storage Layer Gaps + +### P3.1 LevelDB Needle Maps +- **Go**: 5 needle map variants: memory, LevelDB, LevelDB-medium, LevelDB-large, sorted-file +- **Rust**: Memory-only needle map +- **Impact**: Large volumes (millions of needles) require too much RAM +- **Files**: `weed/storage/needle_map_leveldb.go` +- **Effort**: High (need LevelDB binding or alternative) + +### P3.2 Async Request Processing +- **Go**: `asyncRequestsChan` with 128-entry queue, worker goroutine for batched writes +- **Rust**: All writes synchronous +- **Impact**: Write throughput limited by fsync latency +- **Files**: `weed/storage/needle/async_request.go` +- **Effort**: Medium + +### P3.3 Volume Scrubbing (Data Integrity) +- **Go**: `ScrubIndex()`, `scrubVolumeData()` — full data + index verification +- **Rust**: Stub only in gRPC (returns OK without actual scrubbing) +- **Impact**: No way to verify data integrity +- **Files**: `weed/storage/volume_checking.go`, `weed/storage/idx/check.go` +- **Effort**: Medium + +### P3.4 Volume Backup / Sync +- **Go**: Streaming backup, binary search for last modification, index generation scanner +- **Rust**: Not implemented +- **Impact**: No backup/restore capability +- **Files**: `weed/storage/volume_backup.go` +- **Effort**: Medium + +### P3.5 Volume Info (.vif) Persistence +- **Go**: `.vif` files store tier/remote metadata, readonly state persists across restarts +- **Rust**: No `.vif` support; readonly is in-memory only +- **Impact**: Readonly state lost on restart; no tier metadata +- **Files**: `weed/storage/volume_info/volume_info.go` +- **Effort**: Low + +### P3.6 Disk Location Features +- **Go**: Directory UUID tracking, disk space monitoring, min-free-space enforcement, tag-based grouping +- **Rust**: Basic directory only +- **Impact**: No disk-full protection +- **Files**: `weed/storage/disk_location.go` +- **Effort**: Medium + +### P3.7 Compact Map (Memory-Efficient Needle Map) +- **Go**: `CompactMap` with overflow handling for memory optimization +- **Rust**: Uses standard HashMap +- **Impact**: Higher memory usage for index +- **Files**: `weed/storage/needle_map/compact_map.go` +- **Effort**: Medium + +--- + +## Priority 4 — Nice to Have + +### P4.1 gRPC: VolumeTierMoveDatToRemote / FromRemote +- **Go**: Full streaming implementation for tiering volumes to/from S3 +- **Rust**: Stub returning error +- **Files**: `weed/server/volume_grpc_tier_upload.go`, `volume_grpc_tier_download.go` +- **Effort**: High + +### P4.2 gRPC: Query (S3 Select) +- **Go**: JSON/CSV query over needle data (S3 Select compatible) +- **Rust**: Stub returning error +- **Files**: `weed/server/volume_grpc_query.go` +- **Effort**: High + +### P4.3 FetchAndWriteNeedle — Already Implemented +- **Note**: The gRPC audit incorrectly flagged this as missing. It was implemented in a prior session with full S3 remote storage support. + +### P4.4 JSON Pretty Print + JSONP +- **Go**: `?pretty` query param for indented JSON; `?callback=fn` for JSONP +- **Rust**: Neither supported +- **Effort**: Low + +### P4.5 Request ID Generation +- **Go**: Generates UUID if `x-amz-request-id` header missing, propagates to gRPC context +- **Rust**: Only echoes existing header +- **Effort**: Low + +### P4.6 UI Status Page +- **Go**: Full HTML template with volumes, disks, stats, uptime +- **Rust**: Stub HTML +- **Effort**: Medium + +### P4.7 Advanced Prometheus Metrics +- **Go**: InFlightRequestsGauge, ConcurrentUploadLimit/DownloadLimit gauges, metrics push gateway +- **Rust**: Basic request counter and histogram only +- **Effort**: Low + +### P4.8 Profiling (pprof) +- **Go**: CPU/memory profiling, /debug/pprof endpoints +- **Rust**: Flags parsed but not wired +- **Effort**: Medium (tokio-console or pprof-rs) + +### P4.9 EC Distribution / Rebalancing +- **Go**: 17 files for EC operations including placement strategies, recovery, scrubbing +- **Rust**: 6 files with basic encoder/decoder +- **Effort**: High + +### P4.10 Cookie Mismatch Status Code +- **Go**: Returns 406 Not Acceptable +- **Rust**: Returns 400 Bad Request +- **Effort**: Trivial + +--- + +## Implementation Order Recommendation + +### Sprint 1 — Quick Wins (Low effort, high impact) ✅ DONE +1. ✅ P1.4 VolumeMarkReadonly master notification — triggers immediate heartbeat +2. ✅ P1.5 Compaction throttling — `maybe_throttle_compaction()` method added +3. ✅ P1.6 File size limit enforcement — checks `file_size_limit_bytes` on upload +4. ✅ P2.1 `ts` query param — custom timestamps for upload and delete +5. ✅ P2.4 TTL expiration check — was already implemented +6. ✅ P2.5 Health check heartbeat status — returns 503 if not heartbeating +7. ✅ P2.8 preStopSeconds — graceful drain delay before shutdown +8. ✅ P2.9 S3 passthrough headers — content-encoding, expires, content-language, content-disposition +9. ✅ P3.5 .vif persistence — readonly state persists across restarts +10. ✅ P2.7 Webp support — added to image resize-eligible extensions +11. ~~P4.10 Cookie 406~~ — Go actually uses 404 for HTTP cookie mismatch (406 is gRPC batch delete only) + +### Sprint 2 — Core Read Path (Medium effort) — Partially Done +1. P1.1 Streaming / meta-only reads — TODO (medium effort, no test coverage yet) +2. ✅ P1.2 ReadMode proxy/redirect — was already implemented and tested +3. ✅ P2.2 Multipart form parsing — MIME type extraction from Content-Type header +4. P2.3 JPEG orientation fix — TODO (low effort, needs exif crate) +5. ✅ P2.6 Stats endpoints — /stats/counter, /stats/memory, /stats/disk +6. ✅ P2.7 Webp support — done in Sprint 1 +7. ✅ P4.4 JSON pretty print + JSONP — ?pretty=y and ?callback=fn +8. ✅ P4.5 Request ID generation — generates UUID if x-amz-request-id missing +9. ✅ P4.7 Advanced Prometheus metrics — INFLIGHT_REQUESTS gauge, VOLUME_FILE_COUNT gauge + +### Sprint 3 — Infrastructure (Medium effort) — Partially Done +1. ✅ P1.3 TLS/HTTPS — rustls + tokio-rustls for HTTP, tonic ServerTlsConfig for gRPC +2. P3.2 Async request processing — TODO (medium effort) +3. ✅ P3.3 Volume scrubbing — CRC checksum verification of all needles +4. ✅ P3.6 Disk location features — MinFreeSpace enforcement, background disk monitor + +### Sprint 4 — Storage Advanced (High effort) — Deferred +No integration test coverage for these items. All existing tests pass. +1. P3.1 LevelDB needle maps — needed only for volumes with millions of needles +2. P3.4 Volume backup/sync — streaming backup, binary search +3. P3.7 Compact map — memory optimization for needle index +4. P4.1 VolumeTierMoveDat — full S3 tiering (currently error stub) +5. P4.9 EC distribution — advanced EC placement/rebalancing + +### Sprint 5 — Polish — Deferred +No integration test coverage for these items. +1. P4.2 Query (S3 Select) — JSON/CSV query over needle data +2. ✅ P4.4 JSON pretty/JSONP — done in Sprint 2 +3. ✅ P4.5 Request ID generation — done in Sprint 2 +4. P4.6 UI status page — HTML template with volume/disk/stats info +5. ✅ P4.7 Advanced metrics — done in Sprint 2 +6. P4.8 Profiling — pprof-rs or tokio-console diff --git a/seaweed-volume/PARITY_PLAN.md b/seaweed-volume/PARITY_PLAN.md new file mode 100644 index 000000000..2f37fdabd --- /dev/null +++ b/seaweed-volume/PARITY_PLAN.md @@ -0,0 +1,230 @@ +# Rust Volume Server Parity Plan + +Generated: 2026-03-16 + +## Goal + +Make `seaweed-volume` a drop-in replacement for the Go volume server by: + +- comparing every Go volume-server code path against the Rust implementation, +- recording file-level ownership and verification status, +- closing verified behavior gaps one logic change per commit, +- extending tests so regressions are caught by Go parity suites and Rust unit/integration tests. + +## Ground Truth + +Primary Go sources: + +- `weed/server/volume_server.go` +- `weed/server/volume_server_handlers*.go` +- `weed/server/volume_grpc_*.go` +- `weed/server/constants/volume.go` +- `weed/storage/store*.go` +- `weed/storage/disk_location*.go` +- `weed/storage/volume*.go` +- `weed/storage/needle/*.go` +- `weed/storage/idx/*.go` +- `weed/storage/needle_map*.go` +- `weed/storage/needle_map/*.go` +- `weed/storage/super_block/*.go` +- `weed/storage/erasure_coding/*.go` + +Supporting Go dependencies that affect drop-in behavior: + +- `weed/command/volume.go` +- `weed/security/*.go` +- `weed/images/*.go` +- `weed/stats/*.go` + +Primary Rust sources: + +- `seaweed-volume/src/main.rs` +- `seaweed-volume/src/config.rs` +- `seaweed-volume/src/security.rs` +- `seaweed-volume/src/images.rs` +- `seaweed-volume/src/server/*.rs` +- `seaweed-volume/src/storage/*.rs` +- `seaweed-volume/src/storage/needle/*.rs` +- `seaweed-volume/src/storage/idx/*.rs` +- `seaweed-volume/src/storage/erasure_coding/*.rs` +- `seaweed-volume/src/remote_storage/*.rs` + +## Audit Method + +For each Go file: + +1. Map it to the Rust file or files that should own the same behavior. +2. Compare exported entry points, helper functions, state transitions, wire fields, and persistence side effects. +3. Mark each file `implemented`, `partial`, `missing`, or `needs verification`. +4. Link each behavior to an existing test or add a missing test. +5. Only treat a gap as closed after code review plus local verification. + +## Acceptance Criteria + +The Rust server is a drop-in replacement only when all of these hold: + +- HTTP routes, status codes, headers, and body semantics match Go. +- gRPC RPCs match Go request validation, response fields, streaming behavior, and maintenance/read-only semantics. +- Master heartbeat and topology metadata match Go closely enough that the Go master treats Rust and Go volume servers the same. +- On-disk volume behavior matches Go for normal volumes, EC shards, tiering metadata, and readonly persistence. +- Startup flags and operational endpoints that affect production deployment behave equivalently or are explicitly documented as unsupported. +- Existing Go integration suites pass with `VOLUME_SERVER_IMPL=rust`. + +## File Matrix + +### HTTP server surface + +| Go file | Rust counterpart | Status | Comparison focus | +| --- | --- | --- | --- | +| `weed/server/volume_server.go` | `seaweed-volume/src/main.rs`, `seaweed-volume/src/server/volume_server.rs`, `seaweed-volume/src/server/heartbeat.rs` | partial | startup wiring, routers, heartbeat, shutdown, metrics/debug listeners | +| `weed/server/volume_server_handlers.go` | `seaweed-volume/src/server/volume_server.rs`, `seaweed-volume/src/server/handlers.rs` | needs verification | method dispatch, OPTIONS behavior, public/admin split | +| `weed/server/volume_server_handlers_admin.go` | `seaweed-volume/src/server/handlers.rs` | implemented | `/status`, `/healthz`, stats, server headers | +| `weed/server/volume_server_handlers_helper.go` | `seaweed-volume/src/server/handlers.rs` | needs verification | JSON encoding, request parsing, helper parity | +| `weed/server/volume_server_handlers_read.go` | `seaweed-volume/src/server/handlers.rs` | needs verification | JWT, conditional reads, range reads, proxy/redirect, chunk manifests, image transforms | +| `weed/server/volume_server_handlers_ui.go` | `seaweed-volume/src/server/handlers.rs`, embedded assets | partial | UI payload and HTML parity | +| `weed/server/volume_server_handlers_write.go` | `seaweed-volume/src/server/handlers.rs`, `seaweed-volume/src/images.rs` | needs verification | multipart parsing, metadata, compression, ts, delete semantics | +| `weed/server/constants/volume.go` | `seaweed-volume/src/server/heartbeat.rs`, config defaults | needs verification | heartbeat timing, constants parity | + +### gRPC server surface + +| Go file | Rust counterpart | Status | Comparison focus | +| --- | --- | --- | --- | +| `weed/server/volume_grpc_admin.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | readonly/writable, allocate/delete/configure/mount/unmount | +| `weed/server/volume_grpc_batch_delete.go` | `seaweed-volume/src/server/grpc_server.rs` | implemented | batch delete, EC delete path | +| `weed/server/volume_grpc_client_to_master.go` | `seaweed-volume/src/server/heartbeat.rs` | partial | heartbeat fields, leader changes, metrics settings from master | +| `weed/server/volume_grpc_copy.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | full copy streams | +| `weed/server/volume_grpc_copy_incremental.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | incremental copy binary search, timestamps | +| `weed/server/volume_grpc_erasure_coding.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/storage/erasure_coding/*.rs` | needs verification | shard read/write/delete/mount/unmount/rebuild | +| `weed/server/volume_grpc_query.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | query validation and error parity | +| `weed/server/volume_grpc_read_all.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | read-all ordering and tail semantics | +| `weed/server/volume_grpc_read_write.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/storage/*.rs` | needs verification | blob/meta/page reads, write blob semantics | +| `weed/server/volume_grpc_remote.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/remote_storage/*.rs` | needs verification | remote fetch/write and tier metadata | +| `weed/server/volume_grpc_scrub.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/storage/*.rs` | needs verification | scrub result semantics | +| `weed/server/volume_grpc_state.go` | `seaweed-volume/src/server/grpc_server.rs` | implemented | GetState/SetState/Status | +| `weed/server/volume_grpc_tail.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | tail streaming and idle timeout | +| `weed/server/volume_grpc_tier_download.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/remote_storage/*.rs` | needs verification | tier download stream/error paths | +| `weed/server/volume_grpc_tier_upload.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/remote_storage/*.rs` | needs verification | tier upload stream/error paths | +| `weed/server/volume_grpc_vacuum.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/storage/*.rs` | needs verification | compact/commit/cleanup progress and readonly transitions | + +### Storage and persistence surface + +| Go file group | Rust counterpart | Status | Comparison focus | +| --- | --- | --- | --- | +| `weed/storage/store.go`, `store_state.go` | `seaweed-volume/src/storage/store.rs`, `seaweed-volume/src/server/heartbeat.rs` | partial | topology metadata, disk tags, server id, state persistence | +| `weed/storage/store_vacuum.go` | `seaweed-volume/src/storage/store.rs`, `seaweed-volume/src/storage/volume.rs` | needs verification | vacuum sequencing | +| `weed/storage/store_ec.go`, `store_ec_delete.go`, `store_ec_scrub.go` | `seaweed-volume/src/storage/store.rs`, `seaweed-volume/src/storage/erasure_coding/*.rs` | needs verification | EC lifecycle and scrub behavior | +| `weed/storage/disk_location.go`, `disk_location_ec.go` | `seaweed-volume/src/storage/disk_location.rs`, `seaweed-volume/src/storage/store.rs` | partial | directory UUIDs, tags, load rules, disk space checks | +| `weed/storage/volume.go`, `volume_loading.go` | `seaweed-volume/src/storage/volume.rs` | needs verification | load/reload/readonly/remote metadata | +| `weed/storage/volume_super_block.go` | `seaweed-volume/src/storage/super_block.rs`, `seaweed-volume/src/storage/volume.rs` | implemented | super block parity | +| `weed/storage/volume_read.go`, `volume_read_all.go` | `seaweed-volume/src/storage/volume.rs`, `seaweed-volume/src/server/handlers.rs` | needs verification | full/meta/page reads, TTL, streaming | +| `weed/storage/volume_write.go` | `seaweed-volume/src/storage/volume.rs`, `seaweed-volume/src/server/write_queue.rs` | needs verification | dedup, sync/async writes, metadata flags | +| `weed/storage/volume_vacuum.go` | `seaweed-volume/src/storage/volume.rs` | needs verification | compact and commit parity | +| `weed/storage/volume_backup.go` | `seaweed-volume/src/storage/volume.rs`, `seaweed-volume/src/server/grpc_server.rs` | needs verification | backup/search logic | +| `weed/storage/volume_checking.go` | `seaweed-volume/src/storage/volume.rs`, `seaweed-volume/src/storage/idx/mod.rs`, `seaweed-volume/src/server/grpc_server.rs` | needs verification | scrub and integrity checks | +| `weed/storage/volume_info.go`, `volume_info/volume_info.go`, `volume_tier.go` | `seaweed-volume/src/storage/volume.rs`, `seaweed-volume/src/remote_storage/*.rs` | needs verification | `.vif` format and tiered file metadata | +| `weed/storage/needle/*.go` | `seaweed-volume/src/storage/needle/*.rs` | needs verification | needle parsing, CRC, TTL, multipart metadata | +| `weed/storage/idx/*.go` | `seaweed-volume/src/storage/idx/*.rs` | needs verification | index walking and binary search | +| `weed/storage/needle_map*.go`, `needle_map/*.go` | `seaweed-volume/src/storage/needle_map.rs` | needs verification | map kind parity, persistence, memory behavior | +| `weed/storage/super_block/*.go` | `seaweed-volume/src/storage/super_block.rs` | implemented | replica placement and TTL metadata | +| `weed/storage/erasure_coding/*.go` | `seaweed-volume/src/storage/erasure_coding/*.rs` | needs verification | EC shard placement, encode/decode, journal deletes | + +### Supporting runtime surface + +| Go file | Rust counterpart | Status | Comparison focus | +| --- | --- | --- | --- | +| `weed/command/volume.go` | `seaweed-volume/src/config.rs`, `seaweed-volume/src/main.rs` | partial | flags, metrics/debug listeners, startup behavior | +| `weed/security/*.go` | `seaweed-volume/src/security.rs`, `seaweed-volume/src/main.rs` | implemented | JWT and TLS loading | +| `weed/images/*.go` | `seaweed-volume/src/images.rs`, `seaweed-volume/src/server/handlers.rs` | implemented | JPEG orientation and transforms | +| `weed/stats/*.go` | `seaweed-volume/src/metrics.rs`, `seaweed-volume/src/server/handlers.rs` | partial | metrics endpoints, push-gateway integration | + +## Verified Gaps As Of 2026-03-08 + +The startup/runtime gaps that were verified in the initial audit are now closed: + +1. Heartbeat metadata parity + Closed by `8ade1c51d` and retained in current HEAD. + +2. Dedicated metrics/debug listener parity + Closed by `fbe0e5829`. + +3. Master-provided metrics push settings + Closed by `fbe0e5829`. + +4. Slow-read tuning parity + Closed by `66e3900dc`. + +There are no remaining verified gaps from the initial startup/runtime audit. The broader line-by-line comparison batches below are still required to either confirm parity or surface new gaps. + +## Execution Status As Of 2026-03-16 + +The file-by-file comparison and verification work executed in this round was: + +1. Startup and harness alignment + Compared `weed/command/volume.go`, `test/volume_server/framework/cluster*.go`, `seaweed-volume/src/config.rs`, and `seaweed-volume/src/main.rs` to ensure the Rust server is invoked with Go-compatible flags and is rebuilt from the current source during parity runs. + +2. HTTP admin surface + Compared `weed/server/volume_server_handlers_admin.go` against `seaweed-volume/src/server/handlers.rs` with emphasis on `/status` payload shape, disk-status fields, and volume ordering. + +3. gRPC admin surface + Compared `weed/server/volume_grpc_admin.go` against `seaweed-volume/src/server/grpc_server.rs` with emphasis on `Ping`, `VolumeConfigure`, readonly/writable flows, and error wrapping. + +4. Storage/index layout + Compared Go index-entry defaults in `weed/storage/types` and `weed/storage/idx/*.go` against the Rust default feature set in `seaweed-volume/Cargo.toml` and the Rust index reader/writer paths to confirm default binaries use the same offset width. + +5. End-to-end parity verification + Re-ran the Go HTTP and gRPC integration suites with `VOLUME_SERVER_IMPL=rust` after each fix to confirm wire-level compatibility. + +### Verified mismatches closed in this round + +- Rust parity runs could reuse a stale `weed-volume` binary across test invocations, hiding source and feature changes from the Go harness. +- Rust defaulted to 5-byte index offsets, while the default Go `go build` path uses 4-byte offsets unless built with `-tags 5BytesOffset`. +- Rust `/status` omitted Go fields in both `Volumes` and `DiskStatuses`, and did not sort volumes by `Id`. +- Rust `Ping` treated an empty target as a self-ping and only performed a raw gRPC connect for filer targets; Go returns `remote_time_ns=0` for the empty request and performs a real filer `Ping` RPC. +- Rust `VolumeNeedleStatus` dropped stored TTL metadata and reported `data_size` instead of Go’s `Size` field. +- Rust multipart uploads ignored form fields such as `ts`, `ttl`, and `cm`, and also ignored part-level `Content-Encoding` and `Content-MD5`. +- Rust only treated `dl=true` and `dl=1` as truthy, while Go accepts the full `strconv.ParseBool` set such as `dl=t` and `dl=True`. + +### Verification commands + +- `VOLUME_SERVER_IMPL=rust go test -count=1 -timeout 1200s ./test/volume_server/http/...` +- `VOLUME_SERVER_IMPL=rust go test -count=1 -timeout 1200s ./test/volume_server/grpc/...` + +## Execution Plan + +### Batch 1: startup and heartbeat + +- Compare `weed/command/volume.go`, `weed/server/volume_server.go`, `weed/server/volume_grpc_client_to_master.go`, `weed/storage/store.go`, and `weed/storage/disk_location.go`. +- Close metadata and startup parity gaps that affect master registration and deployment compatibility. +- Add Rust unit tests for heartbeat payloads and config wiring. + +### Batch 2: HTTP read path + +- Compare `volume_server_handlers_read.go`, `volume_server_handlers_helper.go`, and related storage read functions line by line. +- Verify JWT, path parsing, proxy/redirect, ranges, streaming, chunk manifests, image transforms, and response-header overrides. +- Extend `test/volume_server/http/...` and Rust handler tests where parity is not covered. + +### Batch 3: HTTP write/delete path + +- Compare `volume_server_handlers_write.go` and write-related storage functions. +- Verify multipart behavior, metadata, md5, compression, unchanged writes, delete edge cases, and timestamp handling. + +### Batch 4: gRPC admin and lifecycle + +- Compare `volume_grpc_admin.go`, `volume_grpc_state.go`, and `volume_grpc_vacuum.go`. +- Verify readonly/writable flows, maintenance mode, status payloads, mount/unmount/delete/configure, and vacuum transitions. + +### Batch 5: gRPC data movement + +- Compare `volume_grpc_read_write.go`, `copy*.go`, `read_all.go`, `tail.go`, `remote.go`, and `query.go`. +- Verify stream framing, binary search, idle timeout, and remote-storage semantics. + +### Batch 6: storage internals + +- Compare all `weed/storage` volume, needle, idx, needle map, and EC files line by line. +- Focus on persistence rules, readonly semantics, TTL, recovery/scrub, backup, and memory/disk map behavior. + +## Commit Strategy + +- One commit for the audit/plan document if the document itself changes. +- One commit per logic fix. +- Every logic commit must include the smallest test addition that proves the new parity claim. diff --git a/seaweed-volume/README.md b/seaweed-volume/README.md new file mode 100644 index 000000000..4367a0722 --- /dev/null +++ b/seaweed-volume/README.md @@ -0,0 +1,140 @@ +# SeaweedFS Volume Server (Rust) + +A drop-in replacement for the [SeaweedFS](https://github.com/seaweedfs/seaweedfs) Go volume server, rewritten in Rust. It uses binary-compatible storage formats (`.dat`, `.idx`, `.vif`) and speaks the same HTTP and gRPC protocols, so it works with an unmodified Go master server. + +## Building + +Requires Rust 1.75+ (2021 edition). + +```bash +cd seaweed-volume +cargo build --release +``` + +The binary is produced at `target/release/seaweed-volume`. + +## Running + +Start a Go master server first, then point the Rust volume server at it: + +```bash +# Minimal +seaweed-volume --port 8080 --master localhost:9333 --dir /data/vol1 --max 7 + +# Multiple data directories +seaweed-volume --port 8080 --master localhost:9333 \ + --dir /mnt/ssd1,/mnt/ssd2 --max 100,100 --disk ssd + +# With datacenter/rack topology +seaweed-volume --port 8080 --master localhost:9333 --dir /data/vol1 --max 7 \ + --dataCenter dc1 --rack rack1 + +# With JWT authentication +seaweed-volume --port 8080 --master localhost:9333 --dir /data/vol1 --max 7 \ + --securityFile /etc/seaweedfs/security.toml + +# With TLS (configured in security.toml via [https.volume] and [grpc.volume] sections) +seaweed-volume --port 8080 --master localhost:9333 --dir /data/vol1 --max 7 \ + --securityFile /etc/seaweedfs/security.toml +``` + +### Common flags + +| Flag | Default | Description | +|------|---------|-------------| +| `--port` | `8080` | HTTP listen port | +| `--port.grpc` | `port+10000` | gRPC listen port | +| `--master` | `localhost:9333` | Comma-separated master server addresses | +| `--dir` | `/tmp` | Comma-separated data directories | +| `--max` | `8` | Max volumes per directory (comma-separated) | +| `--ip` | auto-detect | Server IP / identifier | +| `--ip.bind` | same as `--ip` | Bind address | +| `--dataCenter` | | Datacenter name | +| `--rack` | | Rack name | +| `--disk` | | Disk type tag: `hdd`, `ssd`, or custom | +| `--index` | `memory` | Needle map type: `memory`, `leveldb`, `leveldbMedium`, `leveldbLarge` | +| `--readMode` | `proxy` | Non-local read mode: `local`, `proxy`, `redirect` | +| `--fileSizeLimitMB` | `256` | Max upload file size | +| `--minFreeSpace` | `1` (percent) | Min free disk space before marking volumes read-only | +| `--securityFile` | | Path to `security.toml` for JWT keys and TLS certs | +| `--metricsPort` | `0` (disabled) | Prometheus metrics endpoint port | +| `--whiteList` | | Comma-separated IPs with write permission | +| `--preStopSeconds` | `10` | Graceful drain period before shutdown | +| `--compactionMBps` | `0` (unlimited) | Compaction I/O rate limit | +| `--pprof` | `false` | Enable pprof HTTP handlers | + +Set `RUST_LOG=debug` (or `trace`, `info`, `warn`) for log level control. +Set `SEAWEED_WRITE_QUEUE=1` to enable batched async write processing. + +## Features + +- **Binary compatible** -- reads and writes the same `.dat`/`.idx`/`.vif` files as the Go server; seamless migration with no data conversion. +- **HTTP + gRPC** -- full implementation of the volume server HTTP API and all gRPC RPCs including streaming operations (copy, tail, incremental copy, vacuum). +- **Master heartbeat** -- bidirectional streaming heartbeat with the Go master server; volume and EC shard registration, leader failover, graceful shutdown deregistration. +- **JWT authentication** -- signing key configuration via `security.toml` with token source precedence (query > header > cookie), file_id claims validation, and separate read/write keys. +- **TLS** -- HTTPS for the HTTP API and mTLS for gRPC, configured through `security.toml`. +- **Erasure coding** -- Reed-Solomon EC shard management: mount/unmount, read, rebuild, copy, delete, and shard-to-volume reconstruction. +- **S3 remote storage** -- `FetchAndWriteNeedle` reads from any S3-compatible backend (AWS, MinIO, Wasabi, Backblaze, etc.) and writes locally. Supports `VolumeTierMoveDatToRemote`/`FromRemote` for tiered storage. +- **Needle map backends** -- in-memory HashMap, LevelDB (via `rusty-leveldb`), or redb (pure Rust disk-backed) needle maps. +- **Image processing** -- on-the-fly resize/crop, JPEG EXIF orientation auto-fix, WebP support. +- **Streaming reads** -- large files (>1MB) are streamed via `spawn_blocking` to avoid blocking the async runtime. +- **Auto-compression** -- compressible file types (text, JSON, CSS, JS, SVG, etc.) are gzip-compressed on upload. +- **Prometheus metrics** -- counters, histograms, and gauges exported at a dedicated metrics port; optional push gateway support. +- **Graceful shutdown** -- SIGINT/SIGTERM handling with configurable `preStopSeconds` drain period. + +## Testing + +### Rust unit tests + +```bash +cd seaweed-volume +cargo test +``` + +### Go integration tests + +The Go test suite can target either the Go or Rust volume server via the `VOLUME_SERVER_IMPL` environment variable: + +```bash +# Run all HTTP + gRPC integration tests against the Rust server +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 1200s \ + ./test/volume_server/grpc/... ./test/volume_server/http/... + +# Run a single test +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 60s \ + -run "TestName" ./test/volume_server/http/... + +# Run S3 remote storage tests +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 180s \ + -run "TestFetchAndWriteNeedle" ./test/volume_server/grpc/... +``` + +## Load testing + +A load test harness is available at `test/volume_server/loadtest/`. See that directory for usage instructions and scenarios. + +## Architecture + +The server runs three listeners concurrently: + +- **HTTP** (Axum 0.7) -- admin and public routers for file upload/download, status, and stats endpoints. +- **gRPC** (Tonic 0.12) -- all `VolumeServer` RPCs from the SeaweedFS protobuf definition. +- **Metrics** (optional) -- Prometheus scrape endpoint on a separate port. + +Key source modules: + +| Path | Description | +|------|-------------| +| `src/main.rs` | Entry point, server startup, signal handling | +| `src/config.rs` | CLI parsing and configuration resolution | +| `src/server/volume_server.rs` | HTTP router setup and middleware | +| `src/server/handlers.rs` | HTTP request handlers (read, write, delete, status) | +| `src/server/grpc_server.rs` | gRPC service implementation | +| `src/server/heartbeat.rs` | Master heartbeat loop | +| `src/storage/volume.rs` | Volume read/write/delete logic | +| `src/storage/needle.rs` | Needle (file entry) serialization | +| `src/storage/store.rs` | Multi-volume store management | +| `src/security.rs` | JWT validation and IP whitelist guard | +| `src/remote_storage/` | S3 remote storage backend | + +See [DEV_PLAN.md](DEV_PLAN.md) for the full development history and feature checklist. diff --git a/seaweed-volume/build.rs b/seaweed-volume/build.rs new file mode 100644 index 000000000..08d5cb392 --- /dev/null +++ b/seaweed-volume/build.rs @@ -0,0 +1,17 @@ +fn main() -> Result<(), Box> { + let out_dir = std::path::PathBuf::from(std::env::var("OUT_DIR")?); + tonic_build::configure() + .build_server(true) + .build_client(true) + .file_descriptor_set_path(out_dir.join("seaweed_descriptor.bin")) + .compile_protos( + &[ + "proto/volume_server.proto", + "proto/master.proto", + "proto/remote.proto", + "../weed/pb/filer.proto", + ], + &["proto/", "../weed/pb/"], + )?; + Ok(()) +} diff --git a/seaweed-volume/proto/master.proto b/seaweed-volume/proto/master.proto new file mode 100644 index 000000000..8289cd233 --- /dev/null +++ b/seaweed-volume/proto/master.proto @@ -0,0 +1,474 @@ +syntax = "proto3"; + +package master_pb; + +option go_package = "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"; + +import "volume_server.proto"; + +////////////////////////////////////////////////// + +service Seaweed { + rpc SendHeartbeat (stream Heartbeat) returns (stream HeartbeatResponse) { + } + rpc KeepConnected (stream KeepConnectedRequest) returns (stream KeepConnectedResponse) { + } + rpc LookupVolume (LookupVolumeRequest) returns (LookupVolumeResponse) { + } + rpc Assign (AssignRequest) returns (AssignResponse) { + } + rpc StreamAssign (stream AssignRequest) returns (stream AssignResponse) { + } + rpc Statistics (StatisticsRequest) returns (StatisticsResponse) { + } + rpc CollectionList (CollectionListRequest) returns (CollectionListResponse) { + } + rpc CollectionDelete (CollectionDeleteRequest) returns (CollectionDeleteResponse) { + } + rpc VolumeList (VolumeListRequest) returns (VolumeListResponse) { + } + rpc LookupEcVolume (LookupEcVolumeRequest) returns (LookupEcVolumeResponse) { + } + rpc VacuumVolume (VacuumVolumeRequest) returns (VacuumVolumeResponse) { + } + rpc DisableVacuum (DisableVacuumRequest) returns (DisableVacuumResponse) { + } + rpc EnableVacuum (EnableVacuumRequest) returns (EnableVacuumResponse) { + } + rpc VolumeMarkReadonly (VolumeMarkReadonlyRequest) returns (VolumeMarkReadonlyResponse) { + } + rpc GetMasterConfiguration (GetMasterConfigurationRequest) returns (GetMasterConfigurationResponse) { + } + rpc ListClusterNodes (ListClusterNodesRequest) returns (ListClusterNodesResponse) { + } + rpc LeaseAdminToken (LeaseAdminTokenRequest) returns (LeaseAdminTokenResponse) { + } + rpc ReleaseAdminToken (ReleaseAdminTokenRequest) returns (ReleaseAdminTokenResponse) { + } + rpc Ping (PingRequest) returns (PingResponse) { + } + rpc RaftListClusterServers (RaftListClusterServersRequest) returns (RaftListClusterServersResponse) { + } + rpc RaftAddServer (RaftAddServerRequest) returns (RaftAddServerResponse) { + } + rpc RaftRemoveServer (RaftRemoveServerRequest) returns (RaftRemoveServerResponse) { + } + rpc RaftLeadershipTransfer (RaftLeadershipTransferRequest) returns (RaftLeadershipTransferResponse) { + } + rpc VolumeGrow (VolumeGrowRequest) returns (VolumeGrowResponse) { + } +} + +////////////////////////////////////////////////// + +message DiskTag { + uint32 disk_id = 1; + repeated string tags = 2; +} + +message Heartbeat { + string ip = 1; + uint32 port = 2; + string public_url = 3; + uint64 max_file_key = 5; + string data_center = 6; + string rack = 7; + uint32 admin_port = 8; + repeated VolumeInformationMessage volumes = 9; + // delta volumes + repeated VolumeShortInformationMessage new_volumes = 10; + repeated VolumeShortInformationMessage deleted_volumes = 11; + bool has_no_volumes = 12; + + // erasure coding + repeated VolumeEcShardInformationMessage ec_shards = 16; + // delta erasure coding shards + repeated VolumeEcShardInformationMessage new_ec_shards = 17; + repeated VolumeEcShardInformationMessage deleted_ec_shards = 18; + bool has_no_ec_shards = 19; + + map max_volume_counts = 4; + uint32 grpc_port = 20; + repeated string location_uuids = 21; + string id = 22; // volume server id, independent of ip:port for stable identification + + // state flags + volume_server_pb.VolumeServerState state = 23; + + repeated DiskTag disk_tags = 24; +} + +message HeartbeatResponse { + uint64 volume_size_limit = 1; + string leader = 2; + string metrics_address = 3; + uint32 metrics_interval_seconds = 4; + repeated StorageBackend storage_backends = 5; + repeated string duplicated_uuids = 6; + bool preallocate = 7; +} + +message VolumeInformationMessage { + uint32 id = 1; + uint64 size = 2; + string collection = 3; + uint64 file_count = 4; + uint64 delete_count = 5; + uint64 deleted_byte_count = 6; + bool read_only = 7; + uint32 replica_placement = 8; + uint32 version = 9; + uint32 ttl = 10; + uint32 compact_revision = 11; + int64 modified_at_second = 12; + string remote_storage_name = 13; + string remote_storage_key = 14; + string disk_type = 15; + uint32 disk_id = 16; +} + +message VolumeShortInformationMessage { + uint32 id = 1; + string collection = 3; + uint32 replica_placement = 8; + uint32 version = 9; + uint32 ttl = 10; + string disk_type = 15; + uint32 disk_id = 16; +} + +message VolumeEcShardInformationMessage { + uint32 id = 1; + string collection = 2; + uint32 ec_index_bits = 3; + string disk_type = 4; + uint64 expire_at_sec = 5; // used to record the destruction time of ec volume + uint32 disk_id = 6; + repeated int64 shard_sizes = 7; // optimized: sizes for shards in order of set bits in ec_index_bits +} + +message StorageBackend { + string type = 1; + string id = 2; + map properties = 3; +} + +message Empty { +} + +message SuperBlockExtra { + message ErasureCoding { + uint32 data = 1; + uint32 parity = 2; + repeated uint32 volume_ids = 3; + } + ErasureCoding erasure_coding = 1; +} + +message KeepConnectedRequest { + string client_type = 1; + string client_address = 3; + string version = 4; + string filer_group = 5; + string data_center = 6; + string rack = 7; +} + +message VolumeLocation { + string url = 1; + string public_url = 2; + repeated uint32 new_vids = 3; + repeated uint32 deleted_vids = 4; + string leader = 5; // optional when leader is not itself + string data_center = 6; // optional when DataCenter is in use + uint32 grpc_port = 7; + repeated uint32 new_ec_vids = 8; + repeated uint32 deleted_ec_vids = 9; +} + +message ClusterNodeUpdate { + string node_type = 1; + string address = 2; + bool is_add = 4; + string filer_group = 5; + int64 created_at_ns = 6; +} + +message KeepConnectedResponse { + VolumeLocation volume_location = 1; + ClusterNodeUpdate cluster_node_update = 2; +} + +message LookupVolumeRequest { + repeated string volume_or_file_ids = 1; + string collection = 2; // optional, a bit faster if provided. +} +message LookupVolumeResponse { + message VolumeIdLocation { + string volume_or_file_id = 1; + repeated Location locations = 2; + string error = 3; + string auth = 4; + } + repeated VolumeIdLocation volume_id_locations = 1; +} + +message Location { + string url = 1; + string public_url = 2; + uint32 grpc_port = 3; + string data_center = 4; +} + +message AssignRequest { + uint64 count = 1; + string replication = 2; + string collection = 3; + string ttl = 4; + string data_center = 5; + string rack = 6; + string data_node = 7; + uint32 memory_map_max_size_mb = 8; + uint32 writable_volume_count = 9; + string disk_type = 10; +} + +message VolumeGrowRequest { + uint32 writable_volume_count = 1; + string replication = 2; + string collection = 3; + string ttl = 4; + string data_center = 5; + string rack = 6; + string data_node = 7; + uint32 memory_map_max_size_mb = 8; + string disk_type = 9; +} + +message AssignResponse { + string fid = 1; + uint64 count = 4; + string error = 5; + string auth = 6; + repeated Location replicas = 7; + Location location = 8; +} + +message StatisticsRequest { + string replication = 1; + string collection = 2; + string ttl = 3; + string disk_type = 4; +} +message StatisticsResponse { + uint64 total_size = 4; + uint64 used_size = 5; + uint64 file_count = 6; +} + +// +// collection related +// +message Collection { + string name = 1; +} +message CollectionListRequest { + bool include_normal_volumes = 1; + bool include_ec_volumes = 2; +} +message CollectionListResponse { + repeated Collection collections = 1; +} + +message CollectionDeleteRequest { + string name = 1; +} +message CollectionDeleteResponse { +} + +// +// volume related +// +message DiskInfo { + string type = 1; + int64 volume_count = 2; + int64 max_volume_count = 3; + int64 free_volume_count = 4; + int64 active_volume_count = 5; + repeated VolumeInformationMessage volume_infos = 6; + repeated VolumeEcShardInformationMessage ec_shard_infos = 7; + int64 remote_volume_count = 8; + uint32 disk_id = 9; + repeated string tags = 10; +} +message DataNodeInfo { + string id = 1; + map diskInfos = 2; + uint32 grpc_port = 3; + string address = 4; // ip:port for connecting to the volume server +} +message RackInfo { + string id = 1; + repeated DataNodeInfo data_node_infos = 2; + map diskInfos = 3; +} +message DataCenterInfo { + string id = 1; + repeated RackInfo rack_infos = 2; + map diskInfos = 3; +} +message TopologyInfo { + string id = 1; + repeated DataCenterInfo data_center_infos = 2; + map diskInfos = 3; +} +message VolumeListRequest { +} +message VolumeListResponse { + TopologyInfo topology_info = 1; + uint64 volume_size_limit_mb = 2; +} + +message LookupEcVolumeRequest { + uint32 volume_id = 1; +} +message LookupEcVolumeResponse { + uint32 volume_id = 1; + message EcShardIdLocation { + uint32 shard_id = 1; + repeated Location locations = 2; + } + repeated EcShardIdLocation shard_id_locations = 2; +} + +message VacuumVolumeRequest { + float garbage_threshold = 1; + uint32 volume_id = 2; + string collection = 3; +} +message VacuumVolumeResponse { +} + +message DisableVacuumRequest { +} +message DisableVacuumResponse { +} + +message EnableVacuumRequest { +} +message EnableVacuumResponse { +} + +message VolumeMarkReadonlyRequest { + string ip = 1; + uint32 port = 2; + uint32 volume_id = 4; + string collection = 5; + uint32 replica_placement = 6; + uint32 version = 7; + uint32 ttl = 8; + string disk_type = 9; + bool is_readonly = 10; +} +message VolumeMarkReadonlyResponse { +} + +message GetMasterConfigurationRequest { +} +message GetMasterConfigurationResponse { + string metrics_address = 1; + uint32 metrics_interval_seconds = 2; + repeated StorageBackend storage_backends = 3; + string default_replication = 4; + string leader = 5; + uint32 volume_size_limit_m_b = 6; + bool volume_preallocate = 7; + // MIGRATION: fields 8-9 help migrate master.toml [master.maintenance] to admin script plugin. Remove after March 2027. + string maintenance_scripts = 8; + uint32 maintenance_sleep_minutes = 9; +} + +message ListClusterNodesRequest { + string client_type = 1; + string filer_group = 2; + int32 limit = 4; +} +message ListClusterNodesResponse { + message ClusterNode { + string address = 1; + string version = 2; + int64 created_at_ns = 4; + string data_center = 5; + string rack = 6; + } + repeated ClusterNode cluster_nodes = 1; +} + +message LeaseAdminTokenRequest { + int64 previous_token = 1; + int64 previous_lock_time = 2; + string lock_name = 3; + string client_name = 4; + string message = 5; +} +message LeaseAdminTokenResponse { + int64 token = 1; + int64 lock_ts_ns = 2; +} + +message ReleaseAdminTokenRequest { + int64 previous_token = 1; + int64 previous_lock_time = 2; + string lock_name = 3; +} +message ReleaseAdminTokenResponse { +} + +message PingRequest { + string target = 1; // default to ping itself + string target_type = 2; +} +message PingResponse { + int64 start_time_ns = 1; + int64 remote_time_ns = 2; + int64 stop_time_ns = 3; +} + +message RaftAddServerRequest { + string id = 1; + string address = 2; + bool voter = 3; +} +message RaftAddServerResponse { +} + +message RaftRemoveServerRequest { + string id = 1; + bool force = 2; +} +message RaftRemoveServerResponse { +} + +message RaftListClusterServersRequest { +} +message RaftListClusterServersResponse { + message ClusterServers { + string id = 1; + string address = 2; + string suffrage = 3; + bool isLeader = 4; + } + repeated ClusterServers cluster_servers = 1; +} + +message RaftLeadershipTransferRequest { + string target_id = 1; // Optional: target server ID. If empty, transfers to any eligible follower + string target_address = 2; // Optional: target server address. Required if target_id is specified +} +message RaftLeadershipTransferResponse { + string previous_leader = 1; + string new_leader = 2; +} + +message VolumeGrowResponse { +} diff --git a/seaweed-volume/proto/remote.proto b/seaweed-volume/proto/remote.proto new file mode 100644 index 000000000..9d6d81ff5 --- /dev/null +++ b/seaweed-volume/proto/remote.proto @@ -0,0 +1,76 @@ +syntax = "proto3"; + +package remote_pb; + +option go_package = "github.com/seaweedfs/seaweedfs/weed/pb/remote_pb"; +option java_package = "seaweedfs.client"; +option java_outer_classname = "FilerProto"; + +///////////////////////// +// Remote Storage related +///////////////////////// +message RemoteConf { + string type = 1; + string name = 2; + string s3_access_key = 4; + string s3_secret_key = 5; + string s3_region = 6; + string s3_endpoint = 7; + string s3_storage_class = 8; + bool s3_force_path_style = 9; + bool s3_support_tagging = 13; + bool s3_v4_signature = 11; + + string gcs_google_application_credentials = 10; + string gcs_project_id = 12; + + string azure_account_name = 15; + string azure_account_key = 16; + + string backblaze_key_id = 20; + string backblaze_application_key = 21; + string backblaze_endpoint = 22; + string backblaze_region = 23; + + string aliyun_access_key = 25; + string aliyun_secret_key = 26; + string aliyun_endpoint = 27; + string aliyun_region = 28; + + string tencent_secret_id = 30; + string tencent_secret_key = 31; + string tencent_endpoint = 32; + + string baidu_access_key = 35; + string baidu_secret_key = 36; + string baidu_endpoint = 37; + string baidu_region = 38; + + string wasabi_access_key = 40; + string wasabi_secret_key = 41; + string wasabi_endpoint = 42; + string wasabi_region = 43; + + string filebase_access_key = 60; + string filebase_secret_key = 61; + string filebase_endpoint = 62; + + string storj_access_key = 65; + string storj_secret_key = 66; + string storj_endpoint = 67; + + string contabo_access_key = 68; + string contabo_secret_key = 69; + string contabo_endpoint = 70; + string contabo_region = 71; +} + +message RemoteStorageMapping { + map mappings = 1; + string primary_bucket_storage_name = 2; +} +message RemoteStorageLocation { + string name = 1; + string bucket = 2; + string path = 3; +} diff --git a/seaweed-volume/proto/volume_server.proto b/seaweed-volume/proto/volume_server.proto new file mode 100644 index 000000000..bc5d79c69 --- /dev/null +++ b/seaweed-volume/proto/volume_server.proto @@ -0,0 +1,759 @@ +syntax = "proto3"; + +package volume_server_pb; +option go_package = "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"; + +import "remote.proto"; + +////////////////////////////////////////////////// + +// Persistent state for volume servers. +message VolumeServerState { + // whether the server is in maintenance (i.e. read-only) mode. + bool maintenance = 1; + // incremental version counter + uint32 version = 2; +} + +////////////////////////////////////////////////// + +service VolumeServer { + //Experts only: takes multiple fid parameters. This function does not propagate deletes to replicas. + rpc BatchDelete (BatchDeleteRequest) returns (BatchDeleteResponse) { + } + + rpc VacuumVolumeCheck (VacuumVolumeCheckRequest) returns (VacuumVolumeCheckResponse) { + } + rpc VacuumVolumeCompact (VacuumVolumeCompactRequest) returns (stream VacuumVolumeCompactResponse) { + } + rpc VacuumVolumeCommit (VacuumVolumeCommitRequest) returns (VacuumVolumeCommitResponse) { + } + rpc VacuumVolumeCleanup (VacuumVolumeCleanupRequest) returns (VacuumVolumeCleanupResponse) { + } + + rpc DeleteCollection (DeleteCollectionRequest) returns (DeleteCollectionResponse) { + } + rpc AllocateVolume (AllocateVolumeRequest) returns (AllocateVolumeResponse) { + } + + rpc VolumeSyncStatus (VolumeSyncStatusRequest) returns (VolumeSyncStatusResponse) { + } + rpc VolumeIncrementalCopy (VolumeIncrementalCopyRequest) returns (stream VolumeIncrementalCopyResponse) { + } + + rpc VolumeMount (VolumeMountRequest) returns (VolumeMountResponse) { + } + rpc VolumeUnmount (VolumeUnmountRequest) returns (VolumeUnmountResponse) { + } + rpc VolumeDelete (VolumeDeleteRequest) returns (VolumeDeleteResponse) { + } + rpc VolumeMarkReadonly (VolumeMarkReadonlyRequest) returns (VolumeMarkReadonlyResponse) { + } + rpc VolumeMarkWritable (VolumeMarkWritableRequest) returns (VolumeMarkWritableResponse) { + } + rpc VolumeConfigure (VolumeConfigureRequest) returns (VolumeConfigureResponse) { + } + rpc VolumeStatus (VolumeStatusRequest) returns (VolumeStatusResponse) { + } + + rpc GetState (GetStateRequest) returns (GetStateResponse) { + } + rpc SetState (SetStateRequest) returns (SetStateResponse) { + } + + // copy the .idx .dat files, and mount this volume + rpc VolumeCopy (VolumeCopyRequest) returns (stream VolumeCopyResponse) { + } + rpc ReadVolumeFileStatus (ReadVolumeFileStatusRequest) returns (ReadVolumeFileStatusResponse) { + } + rpc CopyFile (CopyFileRequest) returns (stream CopyFileResponse) { + } + rpc ReceiveFile (stream ReceiveFileRequest) returns (ReceiveFileResponse) { + } + + rpc ReadNeedleBlob (ReadNeedleBlobRequest) returns (ReadNeedleBlobResponse) { + } + rpc ReadNeedleMeta (ReadNeedleMetaRequest) returns (ReadNeedleMetaResponse) { + } + rpc WriteNeedleBlob (WriteNeedleBlobRequest) returns (WriteNeedleBlobResponse) { + } + rpc ReadAllNeedles (ReadAllNeedlesRequest) returns (stream ReadAllNeedlesResponse) { + } + + rpc VolumeTailSender (VolumeTailSenderRequest) returns (stream VolumeTailSenderResponse) { + } + rpc VolumeTailReceiver (VolumeTailReceiverRequest) returns (VolumeTailReceiverResponse) { + } + + // erasure coding + rpc VolumeEcShardsGenerate (VolumeEcShardsGenerateRequest) returns (VolumeEcShardsGenerateResponse) { + } + rpc VolumeEcShardsRebuild (VolumeEcShardsRebuildRequest) returns (VolumeEcShardsRebuildResponse) { + } + rpc VolumeEcShardsCopy (VolumeEcShardsCopyRequest) returns (VolumeEcShardsCopyResponse) { + } + rpc VolumeEcShardsDelete (VolumeEcShardsDeleteRequest) returns (VolumeEcShardsDeleteResponse) { + } + rpc VolumeEcShardsMount (VolumeEcShardsMountRequest) returns (VolumeEcShardsMountResponse) { + } + rpc VolumeEcShardsUnmount (VolumeEcShardsUnmountRequest) returns (VolumeEcShardsUnmountResponse) { + } + rpc VolumeEcShardRead (VolumeEcShardReadRequest) returns (stream VolumeEcShardReadResponse) { + } + rpc VolumeEcBlobDelete (VolumeEcBlobDeleteRequest) returns (VolumeEcBlobDeleteResponse) { + } + rpc VolumeEcShardsToVolume (VolumeEcShardsToVolumeRequest) returns (VolumeEcShardsToVolumeResponse) { + } + rpc VolumeEcShardsInfo (VolumeEcShardsInfoRequest) returns (VolumeEcShardsInfoResponse) { + } + + // tiered storage + rpc VolumeTierMoveDatToRemote (VolumeTierMoveDatToRemoteRequest) returns (stream VolumeTierMoveDatToRemoteResponse) { + } + rpc VolumeTierMoveDatFromRemote (VolumeTierMoveDatFromRemoteRequest) returns (stream VolumeTierMoveDatFromRemoteResponse) { + } + + rpc VolumeServerStatus (VolumeServerStatusRequest) returns (VolumeServerStatusResponse) { + } + rpc VolumeServerLeave (VolumeServerLeaveRequest) returns (VolumeServerLeaveResponse) { + } + + // remote storage + rpc FetchAndWriteNeedle (FetchAndWriteNeedleRequest) returns (FetchAndWriteNeedleResponse) { + } + + // scrubbing + rpc ScrubVolume (ScrubVolumeRequest) returns (ScrubVolumeResponse) { + } + rpc ScrubEcVolume (ScrubEcVolumeRequest) returns (ScrubEcVolumeResponse) { + } + + // query + rpc Query (QueryRequest) returns (stream QueriedStripe) { + } + + rpc VolumeNeedleStatus (VolumeNeedleStatusRequest) returns (VolumeNeedleStatusResponse) { + } + + rpc Ping (PingRequest) returns (PingResponse) { + } + +} + +////////////////////////////////////////////////// + +message BatchDeleteRequest { + repeated string file_ids = 1; + bool skip_cookie_check = 2; +} + +message BatchDeleteResponse { + repeated DeleteResult results = 1; +} +message DeleteResult { + string file_id = 1; + int32 status = 2; + string error = 3; + uint32 size = 4; + uint32 version = 5; +} + +message Empty { +} + +message VacuumVolumeCheckRequest { + uint32 volume_id = 1; +} +message VacuumVolumeCheckResponse { + double garbage_ratio = 1; +} + +message VacuumVolumeCompactRequest { + uint32 volume_id = 1; + int64 preallocate = 2; +} +message VacuumVolumeCompactResponse { + int64 processed_bytes = 1; + float load_avg_1m = 2; +} + +message VacuumVolumeCommitRequest { + uint32 volume_id = 1; +} +message VacuumVolumeCommitResponse { + bool is_read_only = 1; + uint64 volume_size = 2; +} + +message VacuumVolumeCleanupRequest { + uint32 volume_id = 1; +} +message VacuumVolumeCleanupResponse { +} + +message DeleteCollectionRequest { + string collection = 1; +} +message DeleteCollectionResponse { +} + +message AllocateVolumeRequest { + uint32 volume_id = 1; + string collection = 2; + int64 preallocate = 3; + string replication = 4; + string ttl = 5; + uint32 memory_map_max_size_mb = 6; + string disk_type = 7; + uint32 version = 8; +} +message AllocateVolumeResponse { +} + +message VolumeSyncStatusRequest { + uint32 volume_id = 1; +} +message VolumeSyncStatusResponse { + uint32 volume_id = 1; + string collection = 2; + string replication = 4; + string ttl = 5; + uint64 tail_offset = 6; + uint32 compact_revision = 7; + uint64 idx_file_size = 8; + uint32 version = 9; +} + +message VolumeIncrementalCopyRequest { + uint32 volume_id = 1; + uint64 since_ns = 2; +} +message VolumeIncrementalCopyResponse { + bytes file_content = 1; +} + +message VolumeMountRequest { + uint32 volume_id = 1; +} +message VolumeMountResponse { +} + +message VolumeUnmountRequest { + uint32 volume_id = 1; +} +message VolumeUnmountResponse { +} + +message VolumeDeleteRequest { + uint32 volume_id = 1; + bool only_empty = 2; +} +message VolumeDeleteResponse { +} + +message VolumeMarkReadonlyRequest { + uint32 volume_id = 1; + bool persist = 2; +} +message VolumeMarkReadonlyResponse { +} + +message VolumeMarkWritableRequest { + uint32 volume_id = 1; +} +message VolumeMarkWritableResponse { +} + +message VolumeConfigureRequest { + uint32 volume_id = 1; + string replication = 2; +} +message VolumeConfigureResponse { + string error = 1; +} + +message VolumeStatusRequest { + uint32 volume_id = 1; +} +message VolumeStatusResponse { + bool is_read_only = 1; + uint64 volume_size = 2; + uint64 file_count = 3; + uint64 file_deleted_count = 4; +} + +message GetStateRequest { +} +message GetStateResponse { + VolumeServerState state = 1; +} + +message SetStateRequest { + // SetState updates *all* volume server flags at once. Retrieve state with GetState(), + // modify individual flags as required, then call this RPC to update. + VolumeServerState state = 1; +} +message SetStateResponse { + VolumeServerState state = 1; +} + +message VolumeCopyRequest { + uint32 volume_id = 1; + string collection = 2; + string replication = 3; + string ttl = 4; + string source_data_node = 5; + string disk_type = 6; + int64 io_byte_per_second = 7; +} +message VolumeCopyResponse { + uint64 last_append_at_ns = 1; + int64 processed_bytes = 2; +} + +message CopyFileRequest { + uint32 volume_id = 1; + string ext = 2; + uint32 compaction_revision = 3; + uint64 stop_offset = 4; + string collection = 5; + bool is_ec_volume = 6; + bool ignore_source_file_not_found = 7; +} +message CopyFileResponse { + bytes file_content = 1; + int64 modified_ts_ns = 2; +} + +message ReceiveFileRequest { + oneof data { + ReceiveFileInfo info = 1; + bytes file_content = 2; + } +} + +message ReceiveFileInfo { + uint32 volume_id = 1; + string ext = 2; + string collection = 3; + bool is_ec_volume = 4; + uint32 shard_id = 5; + uint64 file_size = 6; +} + +message ReceiveFileResponse { + uint64 bytes_written = 1; + string error = 2; +} + +message ReadNeedleBlobRequest { + uint32 volume_id = 1; + int64 offset = 3; // actual offset + int32 size = 4; +} +message ReadNeedleBlobResponse { + bytes needle_blob = 1; +} + +message ReadNeedleMetaRequest { + uint32 volume_id = 1; + uint64 needle_id = 2; + int64 offset = 3; // actual offset + int32 size = 4; +} +message ReadNeedleMetaResponse { + uint32 cookie = 1; + uint64 last_modified = 2; + uint32 crc = 3; + string ttl = 4; + uint64 append_at_ns = 5; +} + +message WriteNeedleBlobRequest { + uint32 volume_id = 1; + uint64 needle_id = 2; + int32 size = 3; + bytes needle_blob = 4; +} +message WriteNeedleBlobResponse { +} + +message ReadAllNeedlesRequest { + repeated uint32 volume_ids = 1; +} +message ReadAllNeedlesResponse { + uint32 volume_id = 1; + uint64 needle_id = 2; + uint32 cookie = 3; + bytes needle_blob = 5; + bool needle_blob_compressed = 6; + uint64 last_modified = 7; + uint32 crc = 8; + bytes name = 9; + bytes mime = 10; +} + +message VolumeTailSenderRequest { + uint32 volume_id = 1; + uint64 since_ns = 2; + uint32 idle_timeout_seconds = 3; +} +message VolumeTailSenderResponse { + bytes needle_header = 1; + bytes needle_body = 2; + bool is_last_chunk = 3; + uint32 version = 4; +} + +message VolumeTailReceiverRequest { + uint32 volume_id = 1; + uint64 since_ns = 2; + uint32 idle_timeout_seconds = 3; + string source_volume_server = 4; +} +message VolumeTailReceiverResponse { +} + +message VolumeEcShardsGenerateRequest { + uint32 volume_id = 1; + string collection = 2; +} +message VolumeEcShardsGenerateResponse { +} + +message VolumeEcShardsRebuildRequest { + uint32 volume_id = 1; + string collection = 2; +} +message VolumeEcShardsRebuildResponse { + repeated uint32 rebuilt_shard_ids = 1; +} + +message VolumeEcShardsCopyRequest { + uint32 volume_id = 1; + string collection = 2; + repeated uint32 shard_ids = 3; + bool copy_ecx_file = 4; + string source_data_node = 5; + bool copy_ecj_file = 6; + bool copy_vif_file = 7; + uint32 disk_id = 8; // Target disk ID for storing EC shards +} +message VolumeEcShardsCopyResponse { +} + +message VolumeEcShardsDeleteRequest { + uint32 volume_id = 1; + string collection = 2; + repeated uint32 shard_ids = 3; +} +message VolumeEcShardsDeleteResponse { +} + +message VolumeEcShardsMountRequest { + uint32 volume_id = 1; + string collection = 2; + repeated uint32 shard_ids = 3; +} +message VolumeEcShardsMountResponse { +} + +message VolumeEcShardsUnmountRequest { + uint32 volume_id = 1; + repeated uint32 shard_ids = 3; +} +message VolumeEcShardsUnmountResponse { +} + +message VolumeEcShardReadRequest { + uint32 volume_id = 1; + uint32 shard_id = 2; + int64 offset = 3; + int64 size = 4; + uint64 file_key = 5; +} +message VolumeEcShardReadResponse { + bytes data = 1; + bool is_deleted = 2; +} + +message VolumeEcBlobDeleteRequest { + uint32 volume_id = 1; + string collection = 2; + uint64 file_key = 3; + uint32 version = 4; +} +message VolumeEcBlobDeleteResponse { +} + +message VolumeEcShardsToVolumeRequest { + uint32 volume_id = 1; + string collection = 2; +} +message VolumeEcShardsToVolumeResponse { +} + +message VolumeEcShardsInfoRequest { + uint32 volume_id = 1; +} +message VolumeEcShardsInfoResponse { + repeated EcShardInfo ec_shard_infos = 1; + uint64 volume_size = 2; + uint64 file_count = 3; + uint64 file_deleted_count = 4; +} + +message EcShardInfo { + uint32 shard_id = 1; + int64 size = 2; + string collection = 3; + uint32 volume_id = 4; +} + +message ReadVolumeFileStatusRequest { + uint32 volume_id = 1; +} +message ReadVolumeFileStatusResponse { + uint32 volume_id = 1; + uint64 idx_file_timestamp_seconds = 2; + uint64 idx_file_size = 3; + uint64 dat_file_timestamp_seconds = 4; + uint64 dat_file_size = 5; + uint64 file_count = 6; + uint32 compaction_revision = 7; + string collection = 8; + string disk_type = 9; + VolumeInfo volume_info = 10; + uint32 version = 11; +} + +message DiskStatus { + string dir = 1; + uint64 all = 2; + uint64 used = 3; + uint64 free = 4; + float percent_free = 5; + float percent_used = 6; + string disk_type = 7; +} + +message MemStatus { + int32 goroutines = 1; + uint64 all = 2; + uint64 used = 3; + uint64 free = 4; + uint64 self = 5; + uint64 heap = 6; + uint64 stack = 7; +} + +// tired storage on volume servers +message RemoteFile { + string backend_type = 1; + string backend_id = 2; + string key = 3; + uint64 offset = 4; + uint64 file_size = 5; + uint64 modified_time = 6; + string extension = 7; +} +message VolumeInfo { + repeated RemoteFile files = 1; + uint32 version = 2; + string replication = 3; + uint32 bytes_offset = 4; + int64 dat_file_size = 5; // store the original dat file size + uint64 expire_at_sec = 6; // expiration time of ec volume + bool read_only = 7; + EcShardConfig ec_shard_config = 8; // EC shard configuration (optional, null = use default 10+4) +} + +// EcShardConfig specifies erasure coding shard configuration +message EcShardConfig { + uint32 data_shards = 1; // Number of data shards (e.g., 10) + uint32 parity_shards = 2; // Number of parity shards (e.g., 4) +} +message OldVersionVolumeInfo { + repeated RemoteFile files = 1; + uint32 version = 2; + string replication = 3; + uint32 BytesOffset = 4; + int64 dat_file_size = 5; // store the original dat file size + uint64 DestroyTime = 6; // expiration time of ec volume + bool read_only = 7; +} + +// tiered storage +message VolumeTierMoveDatToRemoteRequest { + uint32 volume_id = 1; + string collection = 2; + string destination_backend_name = 3; + bool keep_local_dat_file = 4; +} +message VolumeTierMoveDatToRemoteResponse { + int64 processed = 1; + float processedPercentage = 2; +} + +message VolumeTierMoveDatFromRemoteRequest { + uint32 volume_id = 1; + string collection = 2; + bool keep_remote_dat_file = 3; +} +message VolumeTierMoveDatFromRemoteResponse { + int64 processed = 1; + float processedPercentage = 2; +} + +message VolumeServerStatusRequest { + +} +message VolumeServerStatusResponse { + repeated DiskStatus disk_statuses = 1; + MemStatus memory_status = 2; + string version = 3; + string data_center = 4; + string rack = 5; + VolumeServerState state = 6; +} + +message VolumeServerLeaveRequest { +} +message VolumeServerLeaveResponse { +} + +// remote storage +message FetchAndWriteNeedleRequest { + uint32 volume_id = 1; + uint64 needle_id = 2; + uint32 cookie = 3; + int64 offset = 4; + int64 size = 5; + message Replica { + string url = 1; + string public_url = 2; + int32 grpc_port = 3; + } + repeated Replica replicas = 6; + string auth = 7; + // remote conf + remote_pb.RemoteConf remote_conf = 15; + remote_pb.RemoteStorageLocation remote_location = 16; +} +message FetchAndWriteNeedleResponse { + string e_tag = 1; +} + +enum VolumeScrubMode { + UNKNOWN = 0; + INDEX = 1; + FULL = 2; + LOCAL = 3; +} + +message ScrubVolumeRequest { + VolumeScrubMode mode = 1; + // optional list of volume IDs to scrub. if empty, all volumes for the server are scrubbed. + repeated uint32 volume_ids = 2; + bool mark_broken_volumes_readonly = 3; +} +message ScrubVolumeResponse { + uint64 total_volumes = 1; + uint64 total_files = 2; + repeated uint32 broken_volume_ids = 3; + repeated string details = 4; +} + +message ScrubEcVolumeRequest { + VolumeScrubMode mode = 1; + // optional list of volume IDs to scrub. if empty, all EC volumes for the server are scrubbed. + repeated uint32 volume_ids = 2; +} +message ScrubEcVolumeResponse { + uint64 total_volumes = 1; + uint64 total_files = 2; + repeated uint32 broken_volume_ids = 3; + repeated EcShardInfo broken_shard_infos = 4; + repeated string details = 5; +} + +// select on volume servers +message QueryRequest { + repeated string selections = 1; + repeated string from_file_ids = 2; + message Filter { + string field = 1; + string operand = 2; + string value = 3; + } + Filter filter = 3; + + message InputSerialization { + // NONE | GZIP | BZIP2 + string compression_type = 1; + message CSVInput { + string file_header_info = 1; // Valid values: NONE | USE | IGNORE + string record_delimiter = 2; // Default: \n + string field_delimiter = 3; // Default: , + string quote_character = 4; // Default: " + string quote_escape_character = 5; // Default: " + string comments = 6; // Default: # + // If true, records might contain record delimiters within quote characters + bool allow_quoted_record_delimiter = 7; // default False. + } + message JSONInput { + string type = 1; // Valid values: DOCUMENT | LINES + } + message ParquetInput { + } + + CSVInput csv_input = 2; + JSONInput json_input = 3; + ParquetInput parquet_input = 4; + } + InputSerialization input_serialization = 4; + + message OutputSerialization { + message CSVOutput { + string quote_fields = 1; // Valid values: ALWAYS | ASNEEDED + string record_delimiter = 2; // Default: \n + string field_delimiter = 3; // Default: , + string quote_character = 4; // Default: " + string quote_escape_character = 5; // Default: " + } + message JSONOutput { + string record_delimiter = 1; + } + + CSVOutput csv_output = 2; + JSONOutput json_output = 3; + } + + OutputSerialization output_serialization = 5; +} +message QueriedStripe { + bytes records = 1; +} + +message VolumeNeedleStatusRequest { + uint32 volume_id = 1; + uint64 needle_id = 2; +} +message VolumeNeedleStatusResponse { + uint64 needle_id = 1; + uint32 cookie = 2; + uint32 size = 3; + uint64 last_modified = 4; + uint32 crc = 5; + string ttl = 6; +} + +message PingRequest { + string target = 1; // default to ping itself + string target_type = 2; +} +message PingResponse { + int64 start_time_ns = 1; + int64 remote_time_ns = 2; + int64 stop_time_ns = 3; +} diff --git a/seaweed-volume/src/config.rs b/seaweed-volume/src/config.rs new file mode 100644 index 000000000..ce50b1374 --- /dev/null +++ b/seaweed-volume/src/config.rs @@ -0,0 +1,1697 @@ +use clap::Parser; +use std::net::UdpSocket; +use std::path::{Path, PathBuf}; + +use crate::security::tls::TlsPolicy; + +/// SeaweedFS Volume Server (Rust implementation) +/// +/// Start a volume server to provide storage spaces. +#[derive(Parser, Debug)] +#[command(name = "weed-volume", version, about)] +pub struct Cli { + /// HTTP listen port + #[arg(long = "port", default_value_t = 8080)] + pub port: u16, + + /// gRPC listen port. If 0, defaults to port + 10000. + #[arg(long = "port.grpc", default_value_t = 0)] + pub port_grpc: u16, + + /// Port opened to public. If 0, defaults to same as --port. + #[arg(long = "port.public", default_value_t = 0)] + pub port_public: u16, + + /// IP or server name, also used as identifier. + /// If empty, auto-detected. + #[arg(long = "ip", default_value = "")] + pub ip: String, + + /// Volume server ID. If empty, defaults to ip:port. + #[arg(long = "id", default_value = "")] + pub id: String, + + /// Publicly accessible address. + #[arg(long = "publicUrl", default_value = "")] + pub public_url: String, + + /// IP address to bind to. If empty, defaults to same as --ip. + #[arg(long = "ip.bind", default_value = "")] + pub bind_ip: String, + + /// Comma-separated master server addresses. + #[arg(long = "master", default_value = "localhost:9333")] + pub master: String, + + /// Comma-separated master servers (deprecated, use --master instead). + #[arg(long = "mserver", default_value = "")] + pub mserver: String, + + /// Number of seconds between stop sending heartbeats and stopping the volume server. + #[arg(long = "preStopSeconds", default_value_t = 10)] + pub pre_stop_seconds: u32, + + /// Connection idle seconds. + #[arg(long = "idleTimeout", default_value_t = 30)] + pub idle_timeout: u32, + + /// Current volume server's data center name. + #[arg(long = "dataCenter", default_value = "")] + pub data_center: String, + + /// Current volume server's rack name. + #[arg(long = "rack", default_value = "")] + pub rack: String, + + /// Choose [memory|leveldb|leveldbMedium|leveldbLarge] mode for memory~performance balance. + #[arg(long = "index", default_value = "memory")] + pub index: String, + + /// [hdd|ssd|] hard drive or solid state drive or any tag. + #[arg(long = "disk", default_value = "")] + pub disk: String, + + /// Comma-separated tag groups per data dir; each group uses ':' (e.g. fast:ssd,archive). + #[arg(long = "tags", default_value = "")] + pub tags: String, + + /// Adjust jpg orientation when uploading. + #[arg(long = "images.fix.orientation", default_value_t = false)] + pub fix_jpg_orientation: bool, + + /// [local|proxy|redirect] how to deal with non-local volume. + #[arg(long = "readMode", default_value = "proxy")] + pub read_mode: String, + + /// CPU profile output file. + #[arg(long = "cpuprofile", default_value = "")] + pub cpu_profile: String, + + /// Memory profile output file. + #[arg(long = "memprofile", default_value = "")] + pub mem_profile: String, + + /// Limit background compaction or copying speed in mega bytes per second. + #[arg(long = "compactionMBps", default_value_t = 0)] + pub compaction_mb_per_second: u32, + + /// Limit maintenance (replication/balance) IO rate in MB/s. 0 means no limit. + #[arg(long = "maintenanceMBps", default_value_t = 0)] + pub maintenance_mb_per_second: u32, + + /// Limit file size to avoid out of memory. + #[arg(long = "fileSizeLimitMB", default_value_t = 256)] + pub file_size_limit_mb: u32, + + /// Limit total concurrent upload size in MB, 0 means unlimited. + #[arg(long = "concurrentUploadLimitMB", default_value_t = 0)] + pub concurrent_upload_limit_mb: u32, + + /// Limit total concurrent download size in MB, 0 means unlimited. + #[arg(long = "concurrentDownloadLimitMB", default_value_t = 0)] + pub concurrent_download_limit_mb: u32, + + /// Enable pprof-equivalent HTTP handlers. Precludes --memprofile and --cpuprofile. + #[arg(long = "pprof", default_value_t = false)] + pub pprof: bool, + + /// Prometheus metrics listen port. + #[arg(long = "metricsPort", default_value_t = 0)] + pub metrics_port: u16, + + /// Metrics listen IP. If empty, defaults to same as --ip.bind. + #[arg(long = "metricsIp", default_value = "")] + pub metrics_ip: String, + + /// Directories to store data files. dir[,dir]... + /// If empty, defaults to the platform temp directory (Go's os.TempDir()). + #[arg(long = "dir", default_value = "")] + pub dir: String, + + /// Directory to store .idx files. + #[arg(long = "dir.idx", default_value = "")] + pub dir_idx: String, + + /// Maximum numbers of volumes, count[,count]... + /// If set to zero, the limit will be auto configured as free disk space divided by volume size. + #[arg(long = "max", default_value = "8")] + pub max: String, + + /// Comma separated IP addresses having write permission. No limit if empty. + #[arg(long = "whiteList", default_value = "")] + pub white_list: String, + + /// Minimum free disk space (default to 1%). Low disk space will mark all volumes as ReadOnly. + /// Deprecated: use --minFreeSpace instead. + #[arg(long = "minFreeSpacePercent", default_value = "1")] + pub min_free_space_percent: String, + + /// Min free disk space (value<=100 as percentage like 1, other as human readable bytes, like 10GiB). + /// Low disk space will mark all volumes as ReadOnly. + #[arg(long = "minFreeSpace", default_value = "")] + pub min_free_space: String, + + /// Inflight upload data wait timeout of volume servers. + #[arg(long = "inflightUploadDataTimeout", default_value = "60s")] + pub inflight_upload_data_timeout: String, + + /// Inflight download data wait timeout of volume servers. + #[arg(long = "inflightDownloadDataTimeout", default_value = "60s")] + pub inflight_download_data_timeout: String, + + /// if true, prevents slow reads from blocking other requests, + /// but large file read P99 latency will increase. + #[arg(long = "hasSlowRead", default_value_t = true)] + pub has_slow_read: bool, + + /// larger values can optimize query performance but will increase memory usage. + /// Use with hasSlowRead normally. + #[arg(long = "readBufferSizeMB", default_value_t = 4)] + pub read_buffer_size_mb: u32, + + /// Alive time for leveldb (default to 0). If leveldb of volume is not accessed in + /// ldbTimeout hours, it will be offloaded to reduce opened files and memory consumption. + #[arg(long = "index.leveldbTimeout", default_value_t = 0)] + pub ldb_timeout: i64, + + /// Serves runtime profiling data on the port specified by --debug.port. + #[arg(long = "debug", default_value_t = false)] + pub debug: bool, + + /// HTTP port for debugging. + #[arg(long = "debug.port", default_value_t = 6060)] + pub debug_port: u16, + + /// Path to security.toml configuration file for JWT signing keys. + #[arg(long = "securityFile", default_value = "")] + pub security_file: String, + + /// A file of command line options, each line in optionName=optionValue format. + #[arg(long = "options", default_value = "")] + pub options: String, +} + +/// Resolved configuration after applying defaults and validation. +#[derive(Debug)] +pub struct VolumeServerConfig { + pub port: u16, + pub grpc_port: u16, + pub public_port: u16, + pub ip: String, + pub bind_ip: String, + pub public_url: String, + pub id: String, + pub masters: Vec, + pub pre_stop_seconds: u32, + pub idle_timeout: u32, + pub data_center: String, + pub rack: String, + pub index_type: NeedleMapKind, + pub disk_type: String, + pub folders: Vec, + pub folder_max_limits: Vec, + pub folder_tags: Vec>, + pub min_free_spaces: Vec, + pub disk_types: Vec, + pub idx_folder: String, + pub white_list: Vec, + pub fix_jpg_orientation: bool, + pub read_mode: ReadMode, + pub cpu_profile: String, + pub mem_profile: String, + pub compaction_byte_per_second: i64, + pub maintenance_byte_per_second: i64, + pub file_size_limit_bytes: i64, + pub concurrent_upload_limit: i64, + pub concurrent_download_limit: i64, + pub inflight_upload_data_timeout: std::time::Duration, + pub inflight_download_data_timeout: std::time::Duration, + pub has_slow_read: bool, + pub read_buffer_size_mb: u32, + pub ldb_timeout: i64, + pub pprof: bool, + pub metrics_port: u16, + pub metrics_ip: String, + pub debug: bool, + pub debug_port: u16, + pub ui_enabled: bool, + pub jwt_signing_key: Vec, + pub jwt_signing_expires_seconds: i64, + pub jwt_read_signing_key: Vec, + pub jwt_read_signing_expires_seconds: i64, + pub https_cert_file: String, + pub https_key_file: String, + pub https_ca_file: String, + pub https_client_enabled: bool, + pub https_client_cert_file: String, + pub https_client_key_file: String, + pub https_client_ca_file: String, + pub grpc_cert_file: String, + pub grpc_key_file: String, + pub grpc_ca_file: String, + pub grpc_allowed_wildcard_domain: String, + pub grpc_volume_allowed_common_names: Vec, + pub tls_policy: TlsPolicy, + /// Enable batched write queue for improved throughput under load. + pub enable_write_queue: bool, + /// Path to security.toml — stored for SIGHUP reload. + pub security_file: String, +} + +pub use crate::storage::needle_map::NeedleMapKind; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ReadMode { + Local, + Proxy, + Redirect, +} + +#[derive(Debug, Clone)] +pub enum MinFreeSpace { + Percent(f64), + Bytes(u64), +} + +/// Convert single-dash long options to double-dash for clap compatibility. +/// Go's `flag` package uses `-port`, clap expects `--port`. +/// This allows both `-port 8080` and `--port 8080` to work. +fn normalize_args_vec(args: Vec) -> Vec { + let mut args = args; + // Skip args[0] (binary name). + let mut i = 1; + while i < args.len() { + let arg = &args[i]; + // Stop processing after "--" + if arg == "--" { + break; + } + // Already double-dash or not a flag: leave as-is + if arg.starts_with("--") || !arg.starts_with('-') { + i += 1; + continue; + } + // Single char flags like -h, -V: leave as-is + let without_dash = &arg[1..]; + // Check if it's a single-dash long option: more than 1 char and not a negative number + if without_dash.len() > 1 && !without_dash.starts_with(|c: char| c.is_ascii_digit()) { + // Handle -key=value format + if let Some(eq_pos) = without_dash.find('=') { + let key = &without_dash[..eq_pos]; + if key.len() > 1 { + args[i] = format!("--{}", without_dash); + } + } else { + args[i] = format!("-{}", arg); + } + } + i += 1; + } + args +} + +/// Parse CLI arguments and resolve all defaults — mirroring Go's `runVolume()` + `startVolumeServer()`. +/// +/// Supports `-options ` to load defaults from a file (same format as Go's fla9). +/// CLI arguments take precedence over file values. +pub fn parse_cli() -> VolumeServerConfig { + let args: Vec = std::env::args().collect(); + let normalized = normalize_args_vec(args); + let merged = merge_options_file(normalized); + let cli = Cli::parse_from(merged); + resolve_config(cli) +} + +/// Find `-options`/`--options` in args, parse the referenced file, and inject +/// file-based defaults for any flags not already set on the command line. +/// +/// File format (matching Go's fla9.ParseFile): +/// - One option per line: `key=value`, `key value`, or `key:value` +/// - Lines starting with `#` are comments; blank lines are ignored +/// - Leading `-` on key names is stripped +/// - CLI arguments take precedence over file values +fn merge_options_file(args: Vec) -> Vec { + // Find the options file path from the args + let options_path = find_options_arg(&args); + if options_path.is_empty() { + return args; + } + + let content = match std::fs::read_to_string(&options_path) { + Ok(c) => c, + Err(e) => { + eprintln!( + "WARNING: could not read options file {}: {}", + options_path, e + ); + return args; + } + }; + + // Collect which flags are already explicitly set on the command line. + let mut cli_flags: std::collections::HashSet = std::collections::HashSet::new(); + let mut i = 1; // skip binary name + while i < args.len() { + let arg = &args[i]; + if arg == "--" { + break; + } + if arg.starts_with("--") { + let key = if let Some(eq) = arg.find('=') { + arg[2..eq].to_string() + } else { + arg[2..].to_string() + }; + cli_flags.insert(key); + } else if arg.starts_with('-') && arg.len() > 2 { + // Single-dash long option (already normalized to -- at this point, + // but handle both for safety) + let without_dash = &arg[1..]; + let key = if let Some(eq) = without_dash.find('=') { + without_dash[..eq].to_string() + } else { + without_dash.to_string() + }; + cli_flags.insert(key); + } + i += 1; + } + + // Parse file and append missing options + let mut extra_args: Vec = Vec::new(); + for line in content.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with('#') { + continue; + } + + // Split on first `=`, ` `, or `:` + let (name, value) = + if let Some(pos) = trimmed.find(|c: char| c == '=' || c == ' ' || c == ':') { + ( + trimmed[..pos].trim().to_string(), + trimmed[pos + 1..].trim().to_string(), + ) + } else { + (trimmed.to_string(), String::new()) + }; + + // Strip leading dashes from name + let name = name.trim_start_matches('-').to_string(); + if name.is_empty() || name == "options" { + continue; + } + + // Skip if already set on CLI + if cli_flags.contains(&name) { + continue; + } + + extra_args.push(format!("--{}", name)); + if !value.is_empty() { + extra_args.push(value); + } + } + + let mut merged = args; + merged.extend(extra_args); + merged +} + +/// Extract the options file path from args (looks for --options or -options). +fn find_options_arg(args: &[String]) -> String { + for i in 1..args.len() { + if args[i] == "--options" || args[i] == "-options" { + if i + 1 < args.len() { + return args[i + 1].clone(); + } + } + if let Some(rest) = args[i].strip_prefix("--options=") { + return rest.to_string(); + } + if let Some(rest) = args[i].strip_prefix("-options=") { + return rest.to_string(); + } + } + String::new() +} + +/// Parse a duration string like "60s", "5m", "1h" into a std::time::Duration. +fn parse_duration(s: &str) -> std::time::Duration { + let s = s.trim(); + if s.is_empty() { + return std::time::Duration::from_secs(60); + } + if let Some(secs) = s.strip_suffix('s') { + if let Ok(v) = secs.parse::() { + return std::time::Duration::from_secs(v); + } + } + if let Some(mins) = s.strip_suffix('m') { + if let Ok(v) = mins.parse::() { + return std::time::Duration::from_secs(v * 60); + } + } + if let Some(hours) = s.strip_suffix('h') { + if let Ok(v) = hours.parse::() { + return std::time::Duration::from_secs(v * 3600); + } + } + // Fallback: try parsing as raw seconds + if let Ok(v) = s.parse::() { + return std::time::Duration::from_secs(v); + } + std::time::Duration::from_secs(60) +} + +/// Parse minFreeSpace / minFreeSpacePercent into MinFreeSpace values. +/// Mirrors Go's `util.MustParseMinFreeSpace()`. +fn parse_min_free_spaces(min_free_space: &str, min_free_space_percent: &str) -> Vec { + // If --minFreeSpace is provided, use it (takes precedence). + let source = if !min_free_space.is_empty() { + min_free_space + } else { + min_free_space_percent + }; + + source + .split(',') + .map(|s| { + let s = s.trim(); + // Try parsing as a percentage (value <= 100) + if let Ok(v) = s.parse::() { + if v <= 100.0 { + return MinFreeSpace::Percent(v); + } + // Treat as bytes if > 100 + return MinFreeSpace::Bytes(v as u64); + } + // Try parsing human-readable bytes: e.g. "10GiB", "500MiB", "1TiB" + let s_upper = s.to_uppercase(); + if let Some(rest) = s_upper.strip_suffix("TIB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1024.0 * 1024.0 * 1024.0 * 1024.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("GIB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1024.0 * 1024.0 * 1024.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("MIB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1024.0 * 1024.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("KIB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1024.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("TB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1_000_000_000_000.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("GB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1_000_000_000.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("MB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1_000_000.0) as u64); + } + } + // Default: 1% + MinFreeSpace::Percent(1.0) + }) + .collect() +} + +/// Parse comma-separated tag groups like "fast:ssd,archive" into per-folder tag vectors. +/// Mirrors Go's `parseVolumeTags()`. +fn parse_volume_tags(tags_arg: &str, folder_count: usize) -> Vec> { + if folder_count == 0 { + return vec![]; + } + let tags_arg = tags_arg.trim(); + let tag_entries: Vec<&str> = if tags_arg.is_empty() { + vec![] + } else { + tags_arg.split(',').collect() + }; + + let mut folder_tags: Vec> = vec![vec![]; folder_count]; + + if tag_entries.len() == 1 && !tag_entries[0].is_empty() { + // Single entry: replicate to all folders + let normalized: Vec = tag_entries[0] + .split(':') + .map(|t| t.trim().to_lowercase()) + .filter(|t| !t.is_empty()) + .collect(); + for tags in folder_tags.iter_mut() { + *tags = normalized.clone(); + } + } else { + for (i, tags) in folder_tags.iter_mut().enumerate() { + if i < tag_entries.len() { + *tags = tag_entries[i] + .split(':') + .map(|t| t.trim().to_lowercase()) + .filter(|t| !t.is_empty()) + .collect(); + } + } + } + + folder_tags +} + +fn resolve_config(cli: Cli) -> VolumeServerConfig { + // Backward compatibility: --mserver overrides --master + let master_string = if !cli.mserver.is_empty() { + &cli.mserver + } else { + &cli.master + }; + let masters: Vec = master_string + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + + // Parse folders + let dir_value = if cli.dir.trim().is_empty() { + default_volume_dir() + } else { + cli.dir + }; + let folders: Vec = dir_value + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + let folder_count = folders.len(); + + // Parse max volume counts + let mut folder_max_limits: Vec = cli + .max + .split(',') + .map(|s| { + s.trim().parse::().unwrap_or_else(|_| { + panic!("The max specified in --max is not a valid number: {}", s) + }) + }) + .collect(); + // Replicate single value to all folders + if folder_max_limits.len() == 1 && folder_count > 1 { + let v = folder_max_limits[0]; + folder_max_limits.resize(folder_count, v); + } + if folders.len() != folder_max_limits.len() { + panic!( + "{} directories by --dir, but only {} max is set by --max", + folders.len(), + folder_max_limits.len() + ); + } + + // Parse min free spaces + let mut min_free_spaces = + parse_min_free_spaces(&cli.min_free_space, &cli.min_free_space_percent); + if min_free_spaces.len() == 1 && folder_count > 1 { + let v = min_free_spaces[0].clone(); + min_free_spaces.resize(folder_count, v); + } + if folders.len() != min_free_spaces.len() { + panic!( + "{} directories by --dir, but only {} minFreeSpace values", + folders.len(), + min_free_spaces.len() + ); + } + + // Parse disk types + let mut disk_types: Vec = cli.disk.split(',').map(|s| s.trim().to_string()).collect(); + if disk_types.len() == 1 && folder_count > 1 { + let v = disk_types[0].clone(); + disk_types.resize(folder_count, v); + } + if folders.len() != disk_types.len() { + panic!( + "{} directories by --dir, but only {} disk types by --disk", + folders.len(), + disk_types.len() + ); + } + + // Parse tags + let folder_tags = parse_volume_tags(&cli.tags, folder_count); + + // Resolve IP + let ip = if cli.ip.is_empty() { + detect_host_address() + } else { + cli.ip + }; + + // Resolve bind IP + let bind_ip = if cli.bind_ip.is_empty() { + ip.clone() + } else { + cli.bind_ip + }; + + // Resolve public port + let public_port = if cli.port_public == 0 { + cli.port + } else { + cli.port_public + }; + + // Resolve gRPC port + let grpc_port = if cli.port_grpc == 0 { + 10000 + cli.port + } else { + cli.port_grpc + }; + + // Resolve public URL + let public_url = if cli.public_url.is_empty() { + format!("{}:{}", ip, public_port) + } else { + cli.public_url + }; + + // Resolve volume server ID + let id = if cli.id.is_empty() { + format!("{}:{}", ip, cli.port) + } else { + cli.id + }; + + // Resolve metrics IP + let metrics_ip = if !cli.metrics_ip.is_empty() { + cli.metrics_ip + } else if !bind_ip.is_empty() { + bind_ip.clone() + } else { + ip.clone() + }; + + // Parse index type + let index_type = match cli.index.as_str() { + "memory" => NeedleMapKind::InMemory, + "leveldb" => NeedleMapKind::LevelDb, + "leveldbMedium" => NeedleMapKind::LevelDbMedium, + "leveldbLarge" => NeedleMapKind::LevelDbLarge, + other => panic!( + "Unknown index type: {}. Use memory|leveldb|leveldbMedium|leveldbLarge", + other + ), + }; + + // Parse read mode + let read_mode = match cli.read_mode.as_str() { + "local" => ReadMode::Local, + "proxy" => ReadMode::Proxy, + "redirect" => ReadMode::Redirect, + other => panic!("Unknown readMode: {}. Use local|proxy|redirect", other), + }; + + // Parse security config from TOML file + let sec = parse_security_config(&cli.security_file); + + // Parse whitelist: merge CLI --whiteList with guard.white_list from security.toml + let mut white_list: Vec = cli + .white_list + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + white_list.extend(sec.guard_white_list.iter().cloned()); + + // Parse durations + let inflight_upload_data_timeout = parse_duration(&cli.inflight_upload_data_timeout); + let inflight_download_data_timeout = parse_duration(&cli.inflight_download_data_timeout); + + VolumeServerConfig { + port: cli.port, + grpc_port, + public_port, + ip, + bind_ip, + public_url, + id, + masters, + pre_stop_seconds: cli.pre_stop_seconds, + idle_timeout: cli.idle_timeout, + data_center: cli.data_center, + rack: cli.rack, + index_type, + disk_type: cli.disk, + folders, + folder_max_limits, + folder_tags, + min_free_spaces, + disk_types, + idx_folder: cli.dir_idx, + white_list, + fix_jpg_orientation: cli.fix_jpg_orientation, + read_mode, + cpu_profile: cli.cpu_profile, + mem_profile: cli.mem_profile, + compaction_byte_per_second: cli.compaction_mb_per_second as i64 * 1024 * 1024, + maintenance_byte_per_second: cli.maintenance_mb_per_second as i64 * 1024 * 1024, + file_size_limit_bytes: cli.file_size_limit_mb as i64 * 1024 * 1024, + concurrent_upload_limit: cli.concurrent_upload_limit_mb as i64 * 1024 * 1024, + concurrent_download_limit: cli.concurrent_download_limit_mb as i64 * 1024 * 1024, + inflight_upload_data_timeout, + inflight_download_data_timeout, + has_slow_read: cli.has_slow_read, + read_buffer_size_mb: cli.read_buffer_size_mb, + ldb_timeout: cli.ldb_timeout, + pprof: cli.pprof, + metrics_port: cli.metrics_port, + metrics_ip, + debug: cli.debug, + debug_port: cli.debug_port, + ui_enabled: sec.jwt_signing_key.is_empty() || sec.access_ui, + jwt_signing_key: sec.jwt_signing_key, + jwt_signing_expires_seconds: sec.jwt_signing_expires, + jwt_read_signing_key: sec.jwt_read_signing_key, + jwt_read_signing_expires_seconds: sec.jwt_read_signing_expires, + https_cert_file: sec.https_cert_file, + https_key_file: sec.https_key_file, + https_ca_file: sec.https_ca_file, + https_client_enabled: sec.https_client_enabled, + https_client_cert_file: sec.https_client_cert_file, + https_client_key_file: sec.https_client_key_file, + https_client_ca_file: sec.https_client_ca_file, + grpc_cert_file: sec.grpc_cert_file, + grpc_key_file: sec.grpc_key_file, + grpc_ca_file: sec.grpc_ca_file, + grpc_allowed_wildcard_domain: sec.grpc_allowed_wildcard_domain, + grpc_volume_allowed_common_names: sec.grpc_volume_allowed_common_names, + tls_policy: sec.tls_policy, + enable_write_queue: std::env::var("SEAWEED_WRITE_QUEUE") + .map(|v| v == "1" || v == "true") + .unwrap_or(false), + security_file: cli.security_file, + } +} + +fn default_volume_dir() -> String { + std::env::temp_dir().to_string_lossy().into_owned() +} + +/// Parsed security configuration from security.toml. +#[derive(Debug, Default)] +pub struct SecurityConfig { + pub jwt_signing_key: Vec, + pub jwt_signing_expires: i64, + pub jwt_read_signing_key: Vec, + pub jwt_read_signing_expires: i64, + pub https_cert_file: String, + pub https_key_file: String, + pub https_ca_file: String, + pub https_client_enabled: bool, + pub https_client_cert_file: String, + pub https_client_key_file: String, + pub https_client_ca_file: String, + pub grpc_cert_file: String, + pub grpc_key_file: String, + pub grpc_ca_file: String, + pub grpc_allowed_wildcard_domain: String, + pub grpc_volume_allowed_common_names: Vec, + pub tls_policy: TlsPolicy, + pub access_ui: bool, + /// IPs from [guard] white_list in security.toml + pub guard_white_list: Vec, +} + +const SECURITY_CONFIG_FILE_NAME: &str = "security.toml"; + +/// Parse a security.toml file to extract JWT signing keys and TLS configuration. +/// Format: +/// ```toml +/// [jwt.signing] +/// key = "secret" +/// expires_after_seconds = 60 +/// +/// [jwt.signing.read] +/// key = "read-secret" +/// expires_after_seconds = 60 +/// +/// [https.volume] +/// cert = "/path/to/cert.pem" +/// key = "/path/to/key.pem" +/// ca = "/path/to/ca.pem" +/// +/// [tls] +/// min_version = "TLS 1.2" +/// max_version = "TLS 1.3" +/// cipher_suites = "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" +/// +/// [https.client] +/// enabled = true +/// cert = "/path/to/cert.pem" +/// key = "/path/to/key.pem" +/// ca = "/path/to/ca.pem" +/// +/// [grpc] +/// ca = "/path/to/ca.pem" +/// allowed_wildcard_domain = ".example.com" +/// +/// [grpc.volume] +/// cert = "/path/to/cert.pem" +/// key = "/path/to/key.pem" +/// allowed_commonNames = "volume-a.internal,volume-b.internal" +/// ``` +pub fn parse_security_config(path: &str) -> SecurityConfig { + let Some(config_path) = resolve_security_config_path(path) else { + let mut cfg = SecurityConfig::default(); + apply_env_overrides(&mut cfg); + return cfg; + }; + + let content = match std::fs::read_to_string(&config_path) { + Ok(c) => c, + Err(_) => { + let mut cfg = SecurityConfig::default(); + apply_env_overrides(&mut cfg); + return cfg; + } + }; + + let mut cfg = SecurityConfig::default(); + + #[derive(PartialEq)] + enum Section { + None, + JwtSigning, + JwtSigningRead, + HttpsClient, + Grpc, + HttpsVolume, + GrpcVolume, + Tls, + Guard, + Access, + } + + let mut section = Section::None; + + for line in content.lines() { + let trimmed = line.trim(); + if trimmed.starts_with('#') || trimmed.is_empty() { + continue; + } + if trimmed == "[jwt.signing.read]" { + section = Section::JwtSigningRead; + continue; + } + if trimmed == "[jwt.signing]" { + section = Section::JwtSigning; + continue; + } + if trimmed == "[https.client]" { + section = Section::HttpsClient; + continue; + } + if trimmed == "[grpc]" { + section = Section::Grpc; + continue; + } + if trimmed == "[https.volume]" { + section = Section::HttpsVolume; + continue; + } + if trimmed == "[grpc.volume]" { + section = Section::GrpcVolume; + continue; + } + if trimmed == "[tls]" { + section = Section::Tls; + continue; + } + if trimmed == "[guard]" { + section = Section::Guard; + continue; + } + if trimmed == "[access]" { + section = Section::Access; + continue; + } + if trimmed.starts_with('[') { + section = Section::None; + continue; + } + + if let Some((key, value)) = trimmed.split_once('=') { + let key = key.trim(); + let value = value.trim().trim_matches('"'); + match section { + Section::JwtSigningRead => match key { + "key" => cfg.jwt_read_signing_key = value.as_bytes().to_vec(), + "expires_after_seconds" => { + cfg.jwt_read_signing_expires = value.parse().unwrap_or(60) + } + _ => {} + }, + Section::JwtSigning => match key { + "key" => cfg.jwt_signing_key = value.as_bytes().to_vec(), + "expires_after_seconds" => cfg.jwt_signing_expires = value.parse().unwrap_or(10), + _ => {} + }, + Section::HttpsClient => match key { + "enabled" => cfg.https_client_enabled = value.parse().unwrap_or(false), + "cert" => cfg.https_client_cert_file = value.to_string(), + "key" => cfg.https_client_key_file = value.to_string(), + "ca" => cfg.https_client_ca_file = value.to_string(), + _ => {} + }, + Section::Grpc => match key { + "ca" => cfg.grpc_ca_file = value.to_string(), + "allowed_wildcard_domain" => { + cfg.grpc_allowed_wildcard_domain = value.to_string() + } + _ => {} + }, + Section::HttpsVolume => match key { + "cert" => cfg.https_cert_file = value.to_string(), + "key" => cfg.https_key_file = value.to_string(), + "ca" => cfg.https_ca_file = value.to_string(), + _ => {} + }, + Section::GrpcVolume => match key { + "cert" => cfg.grpc_cert_file = value.to_string(), + "key" => cfg.grpc_key_file = value.to_string(), + // Go only reads CA from [grpc], not [grpc.volume] + "allowed_commonNames" => { + cfg.grpc_volume_allowed_common_names = + value.split(',').map(|name| name.to_string()).collect(); + } + _ => {} + }, + Section::Tls => match key { + "min_version" => cfg.tls_policy.min_version = value.to_string(), + "max_version" => cfg.tls_policy.max_version = value.to_string(), + "cipher_suites" => cfg.tls_policy.cipher_suites = value.to_string(), + _ => {} + }, + Section::Guard => match key { + "white_list" => { + cfg.guard_white_list = value + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + } + _ => {} + }, + Section::Access => match key { + "ui" => cfg.access_ui = value.parse().unwrap_or(false), + _ => {} + }, + Section::None => {} + } + } + } + + // Match Go's v.SetDefault: when a signing key is present but + // expires_after_seconds was never specified, apply Go's defaults. + if !cfg.jwt_signing_key.is_empty() && cfg.jwt_signing_expires == 0 { + cfg.jwt_signing_expires = 10; + } + if !cfg.jwt_read_signing_key.is_empty() && cfg.jwt_read_signing_expires == 0 { + cfg.jwt_read_signing_expires = 60; + } + + // Override with WEED_ environment variables (matches Go's Viper convention: + // prefix WEED_, uppercase, replace . with _). + // e.g. WEED_JWT_SIGNING_KEY overrides [jwt.signing] key + apply_env_overrides(&mut cfg); + + cfg +} + +fn resolve_security_config_path(path: &str) -> Option { + if !path.is_empty() { + return Some(PathBuf::from(path)); + } + + default_security_config_candidates( + std::env::current_dir().ok().as_deref(), + home_dir_from_env().as_deref(), + ) + .into_iter() + .find(|candidate| candidate.is_file()) +} + +fn default_security_config_candidates( + current_dir: Option<&Path>, + home_dir: Option<&Path>, +) -> Vec { + let mut candidates = Vec::new(); + if let Some(dir) = current_dir { + candidates.push(dir.join(SECURITY_CONFIG_FILE_NAME)); + } + if let Some(home) = home_dir { + candidates.push(home.join(".seaweedfs").join(SECURITY_CONFIG_FILE_NAME)); + } + candidates.push(PathBuf::from("/usr/local/etc/seaweedfs").join(SECURITY_CONFIG_FILE_NAME)); + candidates.push(PathBuf::from("/etc/seaweedfs").join(SECURITY_CONFIG_FILE_NAME)); + candidates +} + +fn home_dir_from_env() -> Option { + std::env::var_os("HOME") + .filter(|v| !v.is_empty()) + .map(PathBuf::from) + .or_else(|| { + std::env::var_os("USERPROFILE") + .filter(|v| !v.is_empty()) + .map(PathBuf::from) + }) +} + +/// Apply WEED_ environment variable overrides to a SecurityConfig. +/// Matches Go's Viper convention: WEED_ prefix, uppercase, dots replaced with underscores. +fn apply_env_overrides(cfg: &mut SecurityConfig) { + if let Ok(v) = std::env::var("WEED_JWT_SIGNING_KEY") { + cfg.jwt_signing_key = v.into_bytes(); + } + if let Ok(v) = std::env::var("WEED_JWT_SIGNING_EXPIRES_AFTER_SECONDS") { + cfg.jwt_signing_expires = v.parse().unwrap_or(cfg.jwt_signing_expires); + } + if let Ok(v) = std::env::var("WEED_JWT_SIGNING_READ_KEY") { + cfg.jwt_read_signing_key = v.into_bytes(); + } + if let Ok(v) = std::env::var("WEED_JWT_SIGNING_READ_EXPIRES_AFTER_SECONDS") { + cfg.jwt_read_signing_expires = v.parse().unwrap_or(cfg.jwt_read_signing_expires); + } + if let Ok(v) = std::env::var("WEED_HTTPS_VOLUME_CERT") { + cfg.https_cert_file = v; + } + if let Ok(v) = std::env::var("WEED_HTTPS_VOLUME_KEY") { + cfg.https_key_file = v; + } + if let Ok(v) = std::env::var("WEED_HTTPS_VOLUME_CA") { + cfg.https_ca_file = v; + } + if let Ok(v) = std::env::var("WEED_HTTPS_CLIENT_ENABLED") { + cfg.https_client_enabled = v == "true" || v == "1"; + } + if let Ok(v) = std::env::var("WEED_HTTPS_CLIENT_CERT") { + cfg.https_client_cert_file = v; + } + if let Ok(v) = std::env::var("WEED_HTTPS_CLIENT_KEY") { + cfg.https_client_key_file = v; + } + if let Ok(v) = std::env::var("WEED_HTTPS_CLIENT_CA") { + cfg.https_client_ca_file = v; + } + if let Ok(v) = std::env::var("WEED_GRPC_VOLUME_CERT") { + cfg.grpc_cert_file = v; + } + if let Ok(v) = std::env::var("WEED_GRPC_VOLUME_KEY") { + cfg.grpc_key_file = v; + } + if let Ok(v) = std::env::var("WEED_GRPC_CA") { + cfg.grpc_ca_file = v; + } else if let Ok(v) = std::env::var("WEED_GRPC_VOLUME_CA") { + cfg.grpc_ca_file = v; + } + if let Ok(v) = std::env::var("WEED_GRPC_ALLOWED_WILDCARD_DOMAIN") { + cfg.grpc_allowed_wildcard_domain = v; + } + if let Ok(v) = std::env::var("WEED_GRPC_VOLUME_ALLOWED_COMMONNAMES") { + cfg.grpc_volume_allowed_common_names = v.split(',').map(|name| name.to_string()).collect(); + } + if let Ok(v) = std::env::var("WEED_TLS_MIN_VERSION") { + cfg.tls_policy.min_version = v; + } + if let Ok(v) = std::env::var("WEED_TLS_MAX_VERSION") { + cfg.tls_policy.max_version = v; + } + if let Ok(v) = std::env::var("WEED_TLS_CIPHER_SUITES") { + cfg.tls_policy.cipher_suites = v; + } + if let Ok(v) = std::env::var("WEED_GUARD_WHITE_LIST") { + cfg.guard_white_list = v + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + } + if let Ok(v) = std::env::var("WEED_ACCESS_UI") { + cfg.access_ui = v == "true" || v == "1"; + } +} + +/// Detect the host's IP address. +/// Mirrors Go's `util.DetectedHostAddress()`. +fn detect_host_address() -> String { + // Connect to a remote address to determine the local outbound IP + if let Ok(socket) = UdpSocket::bind("0.0.0.0:0") { + if socket.connect("8.8.8.8:80").is_ok() { + if let Ok(addr) = socket.local_addr() { + return addr.ip().to_string(); + } + } + } + "localhost".to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ffi::OsString; + use std::sync::{Mutex, MutexGuard, OnceLock}; + + fn process_state_lock() -> MutexGuard<'static, ()> { + static LOCK: OnceLock> = OnceLock::new(); + LOCK.get_or_init(|| Mutex::new(())).lock().unwrap() + } + + fn with_temp_env_var(key: &str, value: Option<&str>, f: F) { + let previous = std::env::var_os(key); + match value { + Some(v) => std::env::set_var(key, v), + None => std::env::remove_var(key), + } + f(); + restore_env_var(key, previous); + } + + fn restore_env_var(key: &str, value: Option) { + if let Some(value) = value { + std::env::set_var(key, value); + } else { + std::env::remove_var(key); + } + } + + fn with_temp_current_dir(dir: &Path, f: F) { + let previous = std::env::current_dir().unwrap(); + std::env::set_current_dir(dir).unwrap(); + f(); + std::env::set_current_dir(previous).unwrap(); + } + + fn with_cleared_security_env(f: F) { + const KEYS: &[&str] = &[ + "WEED_JWT_SIGNING_KEY", + "WEED_JWT_SIGNING_EXPIRES_AFTER_SECONDS", + "WEED_JWT_SIGNING_READ_KEY", + "WEED_JWT_SIGNING_READ_EXPIRES_AFTER_SECONDS", + "WEED_HTTPS_VOLUME_CERT", + "WEED_HTTPS_VOLUME_KEY", + "WEED_HTTPS_VOLUME_CA", + "WEED_HTTPS_CLIENT_ENABLED", + "WEED_HTTPS_CLIENT_CERT", + "WEED_HTTPS_CLIENT_KEY", + "WEED_HTTPS_CLIENT_CA", + "WEED_GRPC_VOLUME_CERT", + "WEED_GRPC_VOLUME_KEY", + "WEED_GRPC_CA", + "WEED_GRPC_VOLUME_CA", + "WEED_GRPC_ALLOWED_WILDCARD_DOMAIN", + "WEED_GRPC_VOLUME_ALLOWED_COMMONNAMES", + "WEED_TLS_MIN_VERSION", + "WEED_TLS_MAX_VERSION", + "WEED_TLS_CIPHER_SUITES", + "WEED_GUARD_WHITE_LIST", + "WEED_ACCESS_UI", + ]; + + let previous: Vec<(&str, Option)> = KEYS + .iter() + .map(|key| (*key, std::env::var_os(key))) + .collect(); + + for key in KEYS { + std::env::remove_var(key); + } + + f(); + + for (key, value) in previous { + restore_env_var(key, value); + } + } + + #[test] + fn test_parse_duration() { + assert_eq!(parse_duration("60s"), std::time::Duration::from_secs(60)); + assert_eq!(parse_duration("5m"), std::time::Duration::from_secs(300)); + assert_eq!(parse_duration("1h"), std::time::Duration::from_secs(3600)); + assert_eq!(parse_duration("30"), std::time::Duration::from_secs(30)); + assert_eq!(parse_duration(""), std::time::Duration::from_secs(60)); + } + + #[test] + fn test_parse_min_free_spaces_percent() { + let result = parse_min_free_spaces("", "1"); + assert_eq!(result.len(), 1); + match &result[0] { + MinFreeSpace::Percent(v) => assert!((v - 1.0).abs() < f64::EPSILON), + _ => panic!("Expected Percent"), + } + } + + #[test] + fn test_parse_min_free_spaces_bytes() { + let result = parse_min_free_spaces("10GiB", ""); + assert_eq!(result.len(), 1); + match &result[0] { + MinFreeSpace::Bytes(v) => assert_eq!(*v, 10 * 1024 * 1024 * 1024), + _ => panic!("Expected Bytes"), + } + } + + #[test] + fn test_parse_volume_tags_single() { + let tags = parse_volume_tags("fast:ssd", 3); + assert_eq!(tags.len(), 3); + assert_eq!(tags[0], vec!["fast", "ssd"]); + assert_eq!(tags[1], vec!["fast", "ssd"]); + assert_eq!(tags[2], vec!["fast", "ssd"]); + } + + #[test] + fn test_parse_volume_tags_multi() { + let tags = parse_volume_tags("fast:ssd,archive", 3); + assert_eq!(tags.len(), 3); + assert_eq!(tags[0], vec!["fast", "ssd"]); + assert_eq!(tags[1], vec!["archive"]); + assert_eq!(tags[2], Vec::::new()); + } + + #[test] + fn test_parse_volume_tags_empty() { + let tags = parse_volume_tags("", 2); + assert_eq!(tags.len(), 2); + assert_eq!(tags[0], Vec::::new()); + assert_eq!(tags[1], Vec::::new()); + } + + #[test] + fn test_normalize_args_single_dash_to_double() { + let args = vec![ + "bin".into(), + "-port".into(), + "8080".into(), + "-ip.bind".into(), + "127.0.0.1".into(), + "-dir".into(), + "/data".into(), + ]; + let norm = normalize_args_vec(args); + assert_eq!( + norm, + vec![ + "bin", + "--port", + "8080", + "--ip.bind", + "127.0.0.1", + "--dir", + "/data", + ] + ); + } + + #[test] + fn test_normalize_args_double_dash_unchanged() { + let args = vec![ + "bin".into(), + "--port".into(), + "8080".into(), + "--master".into(), + "localhost:9333".into(), + ]; + let norm = normalize_args_vec(args); + assert_eq!( + norm, + vec!["bin", "--port", "8080", "--master", "localhost:9333",] + ); + } + + #[test] + fn test_normalize_args_single_char_flags_unchanged() { + let args = vec!["bin".into(), "-h".into(), "-V".into()]; + let norm = normalize_args_vec(args); + assert_eq!(norm, vec!["bin", "-h", "-V"]); + } + + #[test] + fn test_normalize_args_equals_format() { + let args = vec!["bin".into(), "-port=8080".into(), "-ip.bind=0.0.0.0".into()]; + let norm = normalize_args_vec(args); + assert_eq!(norm, vec!["bin", "--port=8080", "--ip.bind=0.0.0.0"]); + } + + #[test] + fn test_normalize_args_stop_at_double_dash() { + let args = vec![ + "bin".into(), + "-port".into(), + "8080".into(), + "--".into(), + "-notaflag".into(), + ]; + let norm = normalize_args_vec(args); + assert_eq!(norm, vec!["bin", "--port", "8080", "--", "-notaflag"]); + } + + #[test] + fn test_resolve_config_defaults_dir_to_platform_temp_dir() { + let cfg = resolve_config(Cli::parse_from(["bin"])); + assert_eq!(cfg.folders, vec![default_volume_dir()]); + } + + #[test] + fn test_parse_security_config_access_ui() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[jwt.signing] +key = "secret" + +[access] +ui = true +"#, + ) + .unwrap(); + + with_cleared_security_env(|| { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert_eq!(cfg.jwt_signing_key, b"secret"); + assert!(cfg.access_ui); + }); + } + + #[test] + fn test_parse_security_config_discovers_current_directory_default() { + let _guard = process_state_lock(); + let tmp = tempfile::TempDir::new().unwrap(); + std::fs::write( + tmp.path().join(SECURITY_CONFIG_FILE_NAME), + r#" +[jwt.signing] +key = "cwd-secret" +"#, + ) + .unwrap(); + + with_temp_current_dir(tmp.path(), || { + with_temp_env_var("WEED_JWT_SIGNING_KEY", None, || { + let cfg = parse_security_config(""); + assert_eq!(cfg.jwt_signing_key, b"cwd-secret"); + }); + }); + } + + #[test] + fn test_parse_security_config_discovers_home_default() { + let _guard = process_state_lock(); + let current_dir = tempfile::TempDir::new().unwrap(); + let home_dir = tempfile::TempDir::new().unwrap(); + let seaweed_home = home_dir.path().join(".seaweedfs"); + std::fs::create_dir_all(&seaweed_home).unwrap(); + std::fs::write( + seaweed_home.join(SECURITY_CONFIG_FILE_NAME), + r#" +[jwt.signing] +key = "home-secret" +"#, + ) + .unwrap(); + + with_temp_current_dir(current_dir.path(), || { + with_temp_env_var("WEED_JWT_SIGNING_KEY", None, || { + with_temp_env_var("HOME", Some(home_dir.path().to_str().unwrap()), || { + let cfg = parse_security_config(""); + assert_eq!(cfg.jwt_signing_key, b"home-secret"); + }); + }); + }); + } + + #[test] + fn test_parse_security_config_uses_grpc_root_ca() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[grpc] +ca = "/etc/seaweedfs/grpc-ca.pem" + +[grpc.volume] +cert = "/etc/seaweedfs/volume-cert.pem" +key = "/etc/seaweedfs/volume-key.pem" +"#, + ) + .unwrap(); + + with_cleared_security_env(|| { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert_eq!(cfg.grpc_ca_file, "/etc/seaweedfs/grpc-ca.pem"); + assert_eq!(cfg.grpc_cert_file, "/etc/seaweedfs/volume-cert.pem"); + assert_eq!(cfg.grpc_key_file, "/etc/seaweedfs/volume-key.pem"); + }); + } + + #[test] + fn test_parse_security_config_uses_grpc_peer_name_policy() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[grpc] +allowed_wildcard_domain = ".example.com" + +[grpc.volume] +allowed_commonNames = "volume-a.internal,volume-b.internal" +"#, + ) + .unwrap(); + + with_cleared_security_env(|| { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert_eq!(cfg.grpc_allowed_wildcard_domain, ".example.com"); + assert_eq!( + cfg.grpc_volume_allowed_common_names, + vec![ + String::from("volume-a.internal"), + String::from("volume-b.internal") + ] + ); + }); + } + + #[test] + fn test_parse_security_config_uses_https_client_settings() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[https.client] +enabled = true +cert = "/etc/seaweedfs/client-cert.pem" +key = "/etc/seaweedfs/client-key.pem" +ca = "/etc/seaweedfs/client-ca.pem" +"#, + ) + .unwrap(); + + with_cleared_security_env(|| { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert!(cfg.https_client_enabled); + assert_eq!(cfg.https_client_cert_file, "/etc/seaweedfs/client-cert.pem"); + assert_eq!(cfg.https_client_key_file, "/etc/seaweedfs/client-key.pem"); + assert_eq!(cfg.https_client_ca_file, "/etc/seaweedfs/client-ca.pem"); + }); + } + + #[test] + fn test_parse_security_config_uses_tls_policy_settings() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[tls] +min_version = "TLS 1.2" +max_version = "TLS 1.3" +cipher_suites = "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" +"#, + ) + .unwrap(); + + with_cleared_security_env(|| { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert_eq!(cfg.tls_policy.min_version, "TLS 1.2"); + assert_eq!(cfg.tls_policy.max_version, "TLS 1.3"); + assert_eq!( + cfg.tls_policy.cipher_suites, + "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" + ); + }); + } + + #[test] + fn test_merge_options_file_basic() { + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write(tmp.path(), "port=9999\ndir=/data\nmaster=localhost:9333\n").unwrap(); + + let args = vec![ + "bin".into(), + "--options".into(), + tmp.path().to_str().unwrap().into(), + ]; + let merged = merge_options_file(args); + // Should contain the original args plus the file-based ones + assert!(merged.contains(&"--port".to_string())); + assert!(merged.contains(&"9999".to_string())); + assert!(merged.contains(&"--dir".to_string())); + assert!(merged.contains(&"/data".to_string())); + } + + #[test] + fn test_merge_options_file_cli_precedence() { + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write(tmp.path(), "port=9999\ndir=/data\n").unwrap(); + + let args = vec![ + "bin".into(), + "--port".into(), + "8080".into(), + "--options".into(), + tmp.path().to_str().unwrap().into(), + ]; + let merged = merge_options_file(args); + // port should NOT be duplicated from file since CLI already set it + let port_count = merged.iter().filter(|a| *a == "--port").count(); + assert_eq!( + port_count, 1, + "CLI port should take precedence, file port skipped" + ); + // dir should be added from file + assert!(merged.contains(&"--dir".to_string())); + } + + #[test] + fn test_merge_options_file_comments_and_blanks() { + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + "# this is a comment\n\nport=9999\n# another comment\ndir=/data\n", + ) + .unwrap(); + + let args = vec![ + "bin".into(), + "--options".into(), + tmp.path().to_str().unwrap().into(), + ]; + let merged = merge_options_file(args); + assert!(merged.contains(&"--port".to_string())); + assert!(merged.contains(&"--dir".to_string())); + } + + #[test] + fn test_merge_options_file_with_dashes_in_key() { + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write(tmp.path(), "-port=9999\n--dir=/data\nip.bind=0.0.0.0\n").unwrap(); + + let args = vec![ + "bin".into(), + "--options".into(), + tmp.path().to_str().unwrap().into(), + ]; + let merged = merge_options_file(args); + assert!(merged.contains(&"--port".to_string())); + assert!(merged.contains(&"--dir".to_string())); + assert!(merged.contains(&"--ip.bind".to_string())); + } + + #[test] + fn test_find_options_arg() { + assert_eq!( + find_options_arg(&["bin".into(), "--options".into(), "/tmp/opts".into()]), + "/tmp/opts" + ); + assert_eq!( + find_options_arg(&["bin".into(), "-options".into(), "/tmp/opts".into()]), + "/tmp/opts" + ); + assert_eq!( + find_options_arg(&["bin".into(), "--options=/tmp/opts".into()]), + "/tmp/opts" + ); + assert_eq!( + find_options_arg(&["bin".into(), "--port".into(), "8080".into()]), + "" + ); + } + + #[test] + fn test_env_override_jwt_signing_key() { + let _guard = process_state_lock(); + with_temp_env_var("WEED_JWT_SIGNING_KEY", Some("env-secret"), || { + let cfg = parse_security_config(""); + assert_eq!(cfg.jwt_signing_key, b"env-secret"); + }); + } + + #[test] + fn test_env_override_takes_precedence_over_file() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[jwt.signing] +key = "file-secret" +"#, + ) + .unwrap(); + + with_temp_env_var("WEED_JWT_SIGNING_KEY", Some("env-secret"), || { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert_eq!(cfg.jwt_signing_key, b"env-secret"); + }); + } + + #[test] + fn test_env_override_guard_white_list() { + let _guard = process_state_lock(); + with_temp_env_var( + "WEED_GUARD_WHITE_LIST", + Some("10.0.0.0/8, 192.168.1.0/24"), + || { + let cfg = parse_security_config(""); + assert_eq!(cfg.guard_white_list, vec!["10.0.0.0/8", "192.168.1.0/24"]); + }, + ); + } + + #[test] + fn test_env_override_access_ui() { + let _guard = process_state_lock(); + with_temp_env_var("WEED_ACCESS_UI", Some("true"), || { + let cfg = parse_security_config(""); + assert!(cfg.access_ui); + }); + } +} diff --git a/seaweed-volume/src/images.rs b/seaweed-volume/src/images.rs new file mode 100644 index 000000000..9ad7ca71c --- /dev/null +++ b/seaweed-volume/src/images.rs @@ -0,0 +1,275 @@ +//! JPEG EXIF orientation auto-fix, matching Go's `FixJpgOrientation`. +//! +//! Reads the EXIF orientation tag from JPEG data and rotates/flips the image +//! to normalize it to orientation 1 (top-left). If EXIF parsing fails or +//! orientation is already normal, returns the original data unchanged. + +use std::io::Cursor; + +use image::{DynamicImage, GenericImageView, ImageFormat, RgbaImage}; + +/// EXIF orientation tag values. +/// See: +const TOP_LEFT_SIDE: u32 = 1; +const TOP_RIGHT_SIDE: u32 = 2; +const BOTTOM_RIGHT_SIDE: u32 = 3; +const BOTTOM_LEFT_SIDE: u32 = 4; +const LEFT_SIDE_TOP: u32 = 5; +const RIGHT_SIDE_TOP: u32 = 6; +const RIGHT_SIDE_BOTTOM: u32 = 7; +const LEFT_SIDE_BOTTOM: u32 = 8; + +/// Fix JPEG orientation based on EXIF data. +/// +/// Reads the EXIF orientation tag and applies the appropriate rotation/flip +/// to normalize the image to orientation 1 (top-left). Re-encodes as JPEG. +/// +/// Returns the original data unchanged if: +/// - EXIF data cannot be parsed +/// - No orientation tag is present +/// - Orientation is already 1 (normal) +/// - Image decoding or re-encoding fails +pub fn fix_jpg_orientation(data: &[u8]) -> Vec { + // Parse EXIF data + let orientation = match read_exif_orientation(data) { + Some(o) => o, + None => return data.to_vec(), + }; + + // Orientation 1 means normal — no transformation needed + if orientation == TOP_LEFT_SIDE { + return data.to_vec(); + } + + // Determine rotation angle and flip mode + let (angle, flip_horizontal) = match orientation { + TOP_RIGHT_SIDE => (0, true), + BOTTOM_RIGHT_SIDE => (180, false), + BOTTOM_LEFT_SIDE => (180, true), + LEFT_SIDE_TOP => (-90, true), + RIGHT_SIDE_TOP => (-90, false), + RIGHT_SIDE_BOTTOM => (90, true), + LEFT_SIDE_BOTTOM => (90, false), + _ => return data.to_vec(), + }; + + // Decode the image + let src_image = match image::load_from_memory_with_format(data, ImageFormat::Jpeg) { + Ok(img) => img, + Err(_) => return data.to_vec(), + }; + + // Apply rotation then flip (matching Go's flip(rotate(img, angle), flipMode)) + let transformed = flip_horizontal_if(rotate(src_image, angle), flip_horizontal); + + // Re-encode as JPEG + let mut buf = Cursor::new(Vec::new()); + match transformed.write_to(&mut buf, ImageFormat::Jpeg) { + Ok(_) => buf.into_inner(), + Err(_) => data.to_vec(), + } +} + +/// Read the EXIF orientation tag from JPEG data. +/// Returns None if EXIF cannot be parsed or orientation tag is not present. +fn read_exif_orientation(data: &[u8]) -> Option { + let exif_reader = exif::Reader::new(); + let mut cursor = Cursor::new(data); + let exif_data = exif_reader.read_from_container(&mut cursor).ok()?; + + let orientation_field = exif_data.get_field(exif::Tag::Orientation, exif::In::PRIMARY)?; + match orientation_field.value { + exif::Value::Short(ref v) if !v.is_empty() => Some(v[0] as u32), + _ => orientation_field.value.get_uint(0), + } +} + +/// Rotate an image by the given angle (counter-clockwise, in degrees). +/// Matches Go's rotate function. +fn rotate(img: DynamicImage, angle: i32) -> DynamicImage { + let (width, height) = img.dimensions(); + + match angle { + 90 => { + // 90 degrees counter-clockwise + let new_w = height; + let new_h = width; + let mut out = RgbaImage::new(new_w, new_h); + for y in 0..new_h { + for x in 0..new_w { + out.put_pixel(x, y, img.get_pixel(new_h - 1 - y, x)); + } + } + DynamicImage::ImageRgba8(out) + } + -90 => { + // 90 degrees clockwise (or 270 counter-clockwise) + let new_w = height; + let new_h = width; + let mut out = RgbaImage::new(new_w, new_h); + for y in 0..new_h { + for x in 0..new_w { + out.put_pixel(x, y, img.get_pixel(y, new_w - 1 - x)); + } + } + DynamicImage::ImageRgba8(out) + } + 180 | -180 => { + let mut out = RgbaImage::new(width, height); + for y in 0..height { + for x in 0..width { + out.put_pixel(x, y, img.get_pixel(width - 1 - x, height - 1 - y)); + } + } + DynamicImage::ImageRgba8(out) + } + _ => img, + } +} + +/// Flip the image horizontally if requested. +/// In Go, flipMode 2 == FlipHorizontal. We simplify since only horizontal flip is used. +fn flip_horizontal_if(img: DynamicImage, do_flip: bool) -> DynamicImage { + if !do_flip { + return img; + } + let (width, height) = img.dimensions(); + let mut out = RgbaImage::new(width, height); + for y in 0..height { + for x in 0..width { + out.put_pixel(x, y, img.get_pixel(width - 1 - x, y)); + } + } + DynamicImage::ImageRgba8(out) +} + +/// Returns true if the given MIME type or file path extension indicates a JPEG file. +pub fn is_jpeg(mime_type: &str, path: &str) -> bool { + if mime_type == "image/jpeg" { + return true; + } + let lower = path.to_lowercase(); + lower.ends_with(".jpg") || lower.ends_with(".jpeg") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_non_jpeg_data_returned_unchanged() { + let data = b"not a jpeg file at all"; + let result = fix_jpg_orientation(data); + assert_eq!(result, data); + } + + #[test] + fn test_jpeg_without_exif_returned_unchanged() { + // Create a minimal JPEG without EXIF data + let img = DynamicImage::ImageRgba8(RgbaImage::new(2, 2)); + let mut buf = Cursor::new(Vec::new()); + img.write_to(&mut buf, ImageFormat::Jpeg).unwrap(); + let jpeg_data = buf.into_inner(); + + let result = fix_jpg_orientation(&jpeg_data); + // Should return data unchanged (no EXIF orientation tag) + // Just verify it's still valid JPEG + assert!(!result.is_empty()); + assert_eq!(&result[0..2], &[0xFF, 0xD8]); // JPEG magic bytes + } + + #[test] + fn test_is_jpeg() { + assert!(is_jpeg("image/jpeg", "")); + assert!(is_jpeg("", "/3,abc.jpg")); + assert!(is_jpeg("", "/3,abc.JPEG")); + assert!(is_jpeg("application/octet-stream", "/3,abc.JPG")); + assert!(!is_jpeg("image/png", "/3,abc.png")); + assert!(!is_jpeg("", "/3,abc.png")); + } + + #[test] + fn test_rotate_180() { + // Create a 2x2 image with distinct pixel colors + let mut img = RgbaImage::new(2, 2); + img.put_pixel(0, 0, image::Rgba([255, 0, 0, 255])); // red top-left + img.put_pixel(1, 0, image::Rgba([0, 255, 0, 255])); // green top-right + img.put_pixel(0, 1, image::Rgba([0, 0, 255, 255])); // blue bottom-left + img.put_pixel(1, 1, image::Rgba([255, 255, 0, 255])); // yellow bottom-right + let dynamic = DynamicImage::ImageRgba8(img); + + let rotated = rotate(dynamic, 180); + let (w, h) = rotated.dimensions(); + assert_eq!((w, h), (2, 2)); + // After 180 rotation: top-left should be yellow, top-right should be blue + assert_eq!(rotated.get_pixel(0, 0), image::Rgba([255, 255, 0, 255])); + assert_eq!(rotated.get_pixel(1, 0), image::Rgba([0, 0, 255, 255])); + assert_eq!(rotated.get_pixel(0, 1), image::Rgba([0, 255, 0, 255])); + assert_eq!(rotated.get_pixel(1, 1), image::Rgba([255, 0, 0, 255])); + } + + #[test] + fn test_rotate_90_ccw() { + // Create 3x2 image (width=3, height=2) + let mut img = RgbaImage::new(3, 2); + img.put_pixel(0, 0, image::Rgba([1, 0, 0, 255])); + img.put_pixel(1, 0, image::Rgba([2, 0, 0, 255])); + img.put_pixel(2, 0, image::Rgba([3, 0, 0, 255])); + img.put_pixel(0, 1, image::Rgba([4, 0, 0, 255])); + img.put_pixel(1, 1, image::Rgba([5, 0, 0, 255])); + img.put_pixel(2, 1, image::Rgba([6, 0, 0, 255])); + let dynamic = DynamicImage::ImageRgba8(img); + + let rotated = rotate(dynamic, 90); + let (w, h) = rotated.dimensions(); + // 90 CCW: width=3,height=2 -> new_w=2, new_h=3 + assert_eq!((w, h), (2, 3)); + // Top-right (2,0) should move to top-left (0,0) in CCW 90 + assert_eq!(rotated.get_pixel(0, 0)[0], 3); + assert_eq!(rotated.get_pixel(1, 0)[0], 6); + } + + #[test] + fn test_rotate_neg90_cw() { + // Create 3x2 image + let mut img = RgbaImage::new(3, 2); + img.put_pixel(0, 0, image::Rgba([1, 0, 0, 255])); + img.put_pixel(1, 0, image::Rgba([2, 0, 0, 255])); + img.put_pixel(2, 0, image::Rgba([3, 0, 0, 255])); + img.put_pixel(0, 1, image::Rgba([4, 0, 0, 255])); + img.put_pixel(1, 1, image::Rgba([5, 0, 0, 255])); + img.put_pixel(2, 1, image::Rgba([6, 0, 0, 255])); + let dynamic = DynamicImage::ImageRgba8(img); + + let rotated = rotate(dynamic, -90); + let (w, h) = rotated.dimensions(); + assert_eq!((w, h), (2, 3)); + // -90 (CW 90): top-left (0,0) should go to top-right + assert_eq!(rotated.get_pixel(0, 0)[0], 4); + assert_eq!(rotated.get_pixel(1, 0)[0], 1); + } + + #[test] + fn test_flip_horizontal() { + let mut img = RgbaImage::new(2, 1); + img.put_pixel(0, 0, image::Rgba([10, 0, 0, 255])); + img.put_pixel(1, 0, image::Rgba([20, 0, 0, 255])); + let dynamic = DynamicImage::ImageRgba8(img); + + let flipped = flip_horizontal_if(dynamic, true); + assert_eq!(flipped.get_pixel(0, 0)[0], 20); + assert_eq!(flipped.get_pixel(1, 0)[0], 10); + } + + #[test] + fn test_flip_horizontal_noop() { + let mut img = RgbaImage::new(2, 1); + img.put_pixel(0, 0, image::Rgba([10, 0, 0, 255])); + img.put_pixel(1, 0, image::Rgba([20, 0, 0, 255])); + let dynamic = DynamicImage::ImageRgba8(img); + + let not_flipped = flip_horizontal_if(dynamic, false); + assert_eq!(not_flipped.get_pixel(0, 0)[0], 10); + assert_eq!(not_flipped.get_pixel(1, 0)[0], 20); + } +} diff --git a/seaweed-volume/src/lib.rs b/seaweed-volume/src/lib.rs new file mode 100644 index 000000000..c295c983d --- /dev/null +++ b/seaweed-volume/src/lib.rs @@ -0,0 +1,27 @@ +pub mod config; +pub mod images; +pub mod metrics; +pub mod remote_storage; +pub mod security; +pub mod server; +pub mod storage; +pub mod version; + +/// Generated protobuf modules. +pub mod pb { + pub const FILE_DESCRIPTOR_SET: &[u8] = + tonic::include_file_descriptor_set!("seaweed_descriptor"); + + pub mod remote_pb { + tonic::include_proto!("remote_pb"); + } + pub mod volume_server_pb { + tonic::include_proto!("volume_server_pb"); + } + pub mod master_pb { + tonic::include_proto!("master_pb"); + } + pub mod filer_pb { + tonic::include_proto!("filer_pb"); + } +} diff --git a/seaweed-volume/src/main.rs b/seaweed-volume/src/main.rs new file mode 100644 index 000000000..a398dbf66 --- /dev/null +++ b/seaweed-volume/src/main.rs @@ -0,0 +1,1051 @@ +use std::sync::{Arc, RwLock}; + +use tracing::{error, info, warn}; + +use seaweed_volume::config::{self, VolumeServerConfig}; +use seaweed_volume::metrics; +use seaweed_volume::pb::volume_server_pb::volume_server_server::VolumeServerServer; +use seaweed_volume::security::tls::{ + build_rustls_server_config, build_rustls_server_config_with_grpc_client_auth, + GrpcClientAuthPolicy, TlsPolicy, +}; +use seaweed_volume::security::{Guard, SigningKey}; +use seaweed_volume::server::debug::build_debug_router; +use seaweed_volume::server::grpc_client::load_outgoing_grpc_tls; +use seaweed_volume::server::grpc_server::VolumeGrpcService; +use seaweed_volume::server::profiling::CpuProfileSession; +use seaweed_volume::server::request_id::GrpcRequestIdLayer; +use seaweed_volume::server::volume_server::{ + build_metrics_router, RuntimeMetricsConfig, VolumeServerState, +}; +use seaweed_volume::server::write_queue::WriteQueue; +use seaweed_volume::storage::store::Store; +use seaweed_volume::storage::types::DiskType; + +use tokio_rustls::TlsAcceptor; + +const GRPC_MAX_MESSAGE_SIZE: usize = 1 << 30; +const GRPC_KEEPALIVE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(60); +const GRPC_KEEPALIVE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); +const GRPC_INITIAL_WINDOW_SIZE: u32 = 16 * 1024 * 1024; +const GRPC_MAX_HEADER_LIST_SIZE: u32 = 8 * 1024 * 1024; +const GRPC_MAX_CONCURRENT_STREAMS: u32 = 1000; + +fn main() { + // Initialize tracing + tracing_subscriber::fmt() + .with_env_filter( + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")), + ) + .init(); + + let config = config::parse_cli(); + seaweed_volume::server::server_stats::init_process_start(); + let cpu_profile = match CpuProfileSession::start(&config) { + Ok(session) => session, + Err(e) => { + error!("{}", e); + std::process::exit(1); + } + }; + info!( + "SeaweedFS Volume Server (Rust) v{}", + seaweed_volume::version::full_version() + ); + + // Register Prometheus metrics + metrics::register_metrics(); + + // Build the tokio runtime and run the async entry point + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("Failed to build tokio runtime"); + + if let Err(e) = rt.block_on(run(config, cpu_profile)) { + error!("Volume server failed: {}", e); + std::process::exit(1); + } +} + +fn build_outgoing_http_client( + config: &VolumeServerConfig, +) -> Result<(reqwest::Client, String), Box> { + let scheme = if config.https_client_enabled { + "https" + } else { + "http" + }; + if !config.https_client_enabled { + return Ok((reqwest::Client::new(), scheme.to_string())); + } + + let mut builder = reqwest::Client::builder(); + if !config.https_client_ca_file.is_empty() { + let ca_pem = std::fs::read(&config.https_client_ca_file).map_err(|e| { + format!( + "Failed to read HTTPS client CA file '{}': {}", + config.https_client_ca_file, e + ) + })?; + let cert = reqwest::Certificate::from_pem(&ca_pem).map_err(|e| { + format!( + "Failed to parse HTTPS client CA PEM '{}': {}", + config.https_client_ca_file, e + ) + })?; + builder = builder.add_root_certificate(cert); + } + + match ( + config.https_client_cert_file.is_empty(), + config.https_client_key_file.is_empty(), + ) { + (true, true) => {} + (false, false) => { + let cert_pem = std::fs::read(&config.https_client_cert_file).map_err(|e| { + format!( + "Failed to read HTTPS client cert file '{}': {}", + config.https_client_cert_file, e + ) + })?; + let key_pem = std::fs::read(&config.https_client_key_file).map_err(|e| { + format!( + "Failed to read HTTPS client key file '{}': {}", + config.https_client_key_file, e + ) + })?; + let mut identity_pem = cert_pem; + if !identity_pem.ends_with(b"\n") { + identity_pem.push(b'\n'); + } + identity_pem.extend_from_slice(&key_pem); + let identity = reqwest::Identity::from_pem(&identity_pem).map_err(|e| { + format!( + "Failed to parse HTTPS client identity '{}'+ '{}': {}", + config.https_client_cert_file, config.https_client_key_file, e + ) + })?; + builder = builder.identity(identity); + } + _ => { + return Err(format!( + "HTTPS client requires both cert and key, got cert='{}' key='{}'", + config.https_client_cert_file, config.https_client_key_file + ) + .into()); + } + } + + Ok((builder.build()?, scheme.to_string())) +} + +fn tls_policy_is_configured(policy: &TlsPolicy) -> bool { + !policy.min_version.is_empty() + || !policy.max_version.is_empty() + || !policy.cipher_suites.is_empty() +} + +fn effective_http_tls_policy(ca_path: &str, configured_policy: &TlsPolicy) -> TlsPolicy { + if ca_path.is_empty() { + TlsPolicy::default() + } else { + configured_policy.clone() + } +} + +fn build_grpc_server_tls_acceptor( + cert_path: &str, + key_path: &str, + ca_path: &str, + tls_policy: &TlsPolicy, + allowed_wildcard_domain: &str, + allowed_common_names: &[String], +) -> Option { + if cert_path.is_empty() || key_path.is_empty() || ca_path.is_empty() { + return None; + } + let client_auth_policy = GrpcClientAuthPolicy { + allowed_common_names: allowed_common_names.to_vec(), + allowed_wildcard_domain: allowed_wildcard_domain.to_string(), + }; + let mut server_config = match build_rustls_server_config_with_grpc_client_auth( + cert_path, + key_path, + ca_path, + tls_policy, + &client_auth_policy, + ) { + Ok(server_config) => server_config, + Err(e) => { + warn!("Failed to build gRPC TLS config: {}", e); + return None; + } + }; + server_config.alpn_protocols = vec![b"h2".to_vec()]; + Some(TlsAcceptor::from(Arc::new(server_config))) +} + +fn build_http_server_tls_acceptor( + config: &VolumeServerConfig, +) -> Result, Box> { + if config.https_cert_file.is_empty() || config.https_key_file.is_empty() { + return Ok(None); + } + + let effective_policy = effective_http_tls_policy(&config.https_ca_file, &config.tls_policy); + let tls_config = match build_rustls_server_config( + &config.https_cert_file, + &config.https_key_file, + &config.https_ca_file, + &effective_policy, + ) { + Ok(tls_config) => tls_config, + Err(e) + if !config.https_ca_file.is_empty() && tls_policy_is_configured(&config.tls_policy) => + { + warn!( + "Failed to apply HTTP TLS policy '{}', falling back to default rustls policy", + e + ); + build_rustls_server_config( + &config.https_cert_file, + &config.https_key_file, + &config.https_ca_file, + &TlsPolicy::default(), + )? + } + Err(e) => return Err(e.into()), + }; + + Ok(Some(TlsAcceptor::from(Arc::new(tls_config)))) +} + +fn build_grpc_server_builder() -> tonic::transport::Server { + tonic::transport::Server::builder() + .http2_keepalive_interval(Some(GRPC_KEEPALIVE_INTERVAL)) + .http2_keepalive_timeout(Some(GRPC_KEEPALIVE_TIMEOUT)) + .max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS)) + .initial_stream_window_size(Some(GRPC_INITIAL_WINDOW_SIZE)) + .initial_connection_window_size(Some(GRPC_INITIAL_WINDOW_SIZE)) + .http2_max_header_list_size(Some(GRPC_MAX_HEADER_LIST_SIZE)) +} + +fn build_volume_grpc_service( + grpc_service: VolumeGrpcService, +) -> VolumeServerServer { + VolumeServerServer::new(grpc_service) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE) +} + +fn apply_idle_timeout( + stream: S, + idle_timeout: std::time::Duration, +) -> std::pin::Pin>> +where + S: tokio::io::AsyncRead + tokio::io::AsyncWrite, +{ + let mut stream = tokio_io_timeout::TimeoutStream::new(stream); + if !idle_timeout.is_zero() { + stream.set_read_timeout(Some(idle_timeout)); + stream.set_write_timeout(Some(idle_timeout)); + } + Box::pin(stream) +} + +async fn run( + config: VolumeServerConfig, + cpu_profile: Option, +) -> Result<(), Box> { + // Initialize the store + let mut store = Store::new(config.index_type); + store.id = config.id.clone(); + store.ip = config.ip.clone(); + store.port = config.port; + store.grpc_port = config.grpc_port; + store.public_url = config.public_url.clone(); + store.data_center = config.data_center.clone(); + store.rack = config.rack.clone(); + + // Build shared state + let guard = Guard::new( + &config.white_list, + SigningKey(config.jwt_signing_key.clone()), + config.jwt_signing_expires_seconds, + SigningKey(config.jwt_read_signing_key.clone()), + config.jwt_read_signing_expires_seconds, + ); + let master_url = config.masters.first().cloned().unwrap_or_default(); + let self_url = format!("{}:{}", config.ip, config.port); + let (http_client, outgoing_http_scheme) = build_outgoing_http_client(&config)?; + let outgoing_grpc_tls = load_outgoing_grpc_tls(&config)?; + + let security_file = config.security_file.clone(); + let cli_white_list = config.white_list.clone(); + + let state = Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(guard), + is_stopping: RwLock::new(false), + maintenance: std::sync::atomic::AtomicBool::new(false), + state_version: std::sync::atomic::AtomicU32::new(0), + concurrent_upload_limit: config.concurrent_upload_limit, + concurrent_download_limit: config.concurrent_download_limit, + inflight_upload_data_timeout: config.inflight_upload_data_timeout, + inflight_download_data_timeout: config.inflight_download_data_timeout, + inflight_upload_bytes: std::sync::atomic::AtomicI64::new(0), + inflight_download_bytes: std::sync::atomic::AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: config.data_center.clone(), + rack: config.rack.clone(), + file_size_limit_bytes: config.file_size_limit_bytes, + maintenance_byte_per_second: config.maintenance_byte_per_second, + // Go sets isHeartbeating: true unconditionally at startup + is_heartbeating: std::sync::atomic::AtomicBool::new(true), + has_master: !config.masters.is_empty(), + pre_stop_seconds: config.pre_stop_seconds, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new( + seaweed_volume::remote_storage::s3_tier::S3TierRegistry::new(), + ), + read_mode: config.read_mode, + master_url, + master_urls: config.masters.clone(), + self_url, + http_client, + outgoing_http_scheme, + outgoing_grpc_tls, + metrics_runtime: std::sync::RwLock::new(RuntimeMetricsConfig::default()), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: config.fix_jpg_orientation, + has_slow_read: config.has_slow_read, + read_buffer_size_bytes: (config.read_buffer_size_mb.max(1) as usize) * 1024 * 1024, + security_file, + cli_white_list, + state_file_path: if config.folders.is_empty() { + String::new() + } else { + std::path::Path::new(&config.folders[0]) + .join("state.pb") + .to_string_lossy() + .into_owned() + }, + }); + + // Load persisted state from disk if it exists (matches Go's State.Load on startup) + if let Some(saved) = + seaweed_volume::server::grpc_server::load_state_file(&state.state_file_path) + { + state + .maintenance + .store(saved.maintenance, std::sync::atomic::Ordering::Relaxed); + state + .state_version + .store(saved.version, std::sync::atomic::Ordering::Relaxed); + } + + if !config.masters.is_empty() { + let hb_config = seaweed_volume::server::heartbeat::HeartbeatConfig { + ip: config.ip.clone(), + port: config.port, + grpc_port: config.grpc_port, + public_url: config.public_url.clone(), + data_center: config.data_center.clone(), + rack: config.rack.clone(), + master_addresses: config.masters.clone(), + pulse_seconds: 5, + }; + seaweed_volume::server::heartbeat::prime_master_configuration(&hb_config, &state).await; + } + + { + let mut store = state.store.write().unwrap(); + for (i, dir) in config.folders.iter().enumerate() { + let idx_dir = if config.idx_folder.is_empty() { + dir.as_str() + } else { + config.idx_folder.as_str() + }; + let max_volumes = config.folder_max_limits[i]; + let disk_type = DiskType::from_string(&config.disk_types[i]); + let tags = config.folder_tags.get(i).cloned().unwrap_or_default(); + + info!( + "Adding storage location: {} (max_volumes={}, disk_type={:?})", + dir, max_volumes, disk_type + ); + let min_free_space = config.min_free_spaces[i].clone(); + store + .add_location(dir, idx_dir, max_volumes, disk_type, min_free_space, tags) + .map_err(|e| format!("Failed to add storage location {}: {}", dir, e))?; + } + } + + // Initialize the batched write queue if enabled + if config.enable_write_queue { + info!("Batched write queue enabled"); + let wq = WriteQueue::new(state.clone(), 128); + let _ = state.write_queue.set(wq); + } + + // Set initial metric gauges for concurrent limits and max volumes + metrics::CONCURRENT_UPLOAD_LIMIT.set(state.concurrent_upload_limit); + metrics::CONCURRENT_DOWNLOAD_LIMIT.set(state.concurrent_download_limit); + { + let store = state.store.read().unwrap(); + let mut max_vols: i64 = 0; + for loc in &store.locations { + max_vols += loc + .max_volume_count + .load(std::sync::atomic::Ordering::Relaxed) as i64; + } + metrics::MAX_VOLUMES.set(max_vols); + } + + // Run initial disk space check + { + let store = state.store.read().unwrap(); + for loc in &store.locations { + loc.check_disk_space(); + } + } + + // Spawn background disk space monitor (checks every 60 seconds) + { + let monitor_state = state.clone(); + tokio::spawn(async move { + let mut interval = tokio::time::interval(std::time::Duration::from_secs(60)); + interval.tick().await; // skip the first immediate tick + loop { + interval.tick().await; + let store = monitor_state.store.read().unwrap(); + for loc in &store.locations { + loc.check_disk_space(); + } + } + }); + } + + // Build HTTP routers + let mut admin_router = seaweed_volume::server::volume_server::build_admin_router_with_ui( + state.clone(), + config.ui_enabled, + ); + if config.pprof { + admin_router = admin_router.merge(build_debug_router()); + } + let admin_addr = format!("{}:{}", config.bind_ip, config.port); + + let public_port = config.public_port; + let needs_public = public_port != config.port; + let http_idle_timeout = std::time::Duration::from_secs(config.idle_timeout as u64); + + let grpc_addr = format!("{}:{}", config.bind_ip, config.grpc_port); + let grpc_tls_acceptor = build_grpc_server_tls_acceptor( + &config.grpc_cert_file, + &config.grpc_key_file, + &config.grpc_ca_file, + &config.tls_policy, + &config.grpc_allowed_wildcard_domain, + &config.grpc_volume_allowed_common_names, + ); + + info!("Starting HTTP server on {}", admin_addr); + info!("Starting gRPC server on {}", grpc_addr); + if needs_public { + info!( + "Starting public HTTP server on {}:{}", + config.bind_ip, public_port + ); + } + + // Set up graceful shutdown via SIGINT/SIGTERM using broadcast channel + let (shutdown_tx, _) = tokio::sync::broadcast::channel::<()>(1); + + let state_shutdown = state.clone(); + let shutdown_tx_clone = shutdown_tx.clone(); + tokio::spawn(async move { + let ctrl_c = tokio::signal::ctrl_c(); + #[cfg(unix)] + { + let mut sigterm = + tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) + .expect("Failed to install SIGTERM handler"); + tokio::select! { + _ = ctrl_c => { info!("Received SIGINT, shutting down..."); } + _ = sigterm.recv() => { info!("Received SIGTERM, shutting down..."); } + } + } + #[cfg(not(unix))] + { + ctrl_c.await.ok(); + info!("Received shutdown signal..."); + } + *state_shutdown.is_stopping.write().unwrap() = true; + // Wake heartbeat loop immediately so it sends deregister heartbeat + // before the pre_stop delay (matches Go: StopHeartbeat() closes stopChan + // before sleeping preStopSeconds) + state_shutdown.volume_state_notify.notify_one(); + + // Graceful drain: wait pre_stop_seconds before shutting down servers + let pre_stop = state_shutdown.pre_stop_seconds; + if pre_stop > 0 { + info!("Pre-stop: waiting {} seconds before shutdown...", pre_stop); + tokio::time::sleep(std::time::Duration::from_secs(pre_stop as u64)).await; + } + + let _ = shutdown_tx_clone.send(()); + }); + + // Set up SIGHUP handler for config reload (mirrors Go's grace.OnReload) + #[cfg(unix)] + { + let state_reload = state.clone(); + tokio::spawn(async move { + let mut sighup = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::hangup()) + .expect("Failed to install SIGHUP handler"); + loop { + sighup.recv().await; + info!("Received SIGHUP, reloading..."); + + // 1. Load new volumes from disk (Go's LoadNewVolumes) + { + info!("Loading new volume ids..."); + let mut store = state_reload.store.write().unwrap(); + store.load_new_volumes(); + } + + // 2. Reload security config (Go's Reload) + { + info!("Reloading security config..."); + let sec = config::parse_security_config(&state_reload.security_file); + let mut whitelist = state_reload.cli_white_list.clone(); + whitelist.extend(sec.guard_white_list.iter().cloned()); + let mut guard = state_reload.guard.write().unwrap(); + guard.update_whitelist(&whitelist); + } + + // Trigger heartbeat to report new volumes + state_reload.volume_state_notify.notify_one(); + info!("SIGHUP reload complete"); + } + }); + } + + // Build optional TLS acceptor for HTTPS + let https_tls_acceptor = + if !config.https_cert_file.is_empty() && !config.https_key_file.is_empty() { + info!( + "TLS enabled for HTTP server (cert={}, key={})", + config.https_cert_file, config.https_key_file + ); + build_http_server_tls_acceptor(&config)? + } else { + None + }; + + // Spawn all servers concurrently + let admin_listener = tokio::net::TcpListener::bind(&admin_addr) + .await + .unwrap_or_else(|e| panic!("Failed to bind HTTP to {}: {}", admin_addr, e)); + let scheme = if https_tls_acceptor.is_some() { + "HTTPS" + } else { + "HTTP" + }; + info!("{} server listening on {}", scheme, admin_addr); + + let http_handle = if let Some(tls_acceptor) = https_tls_acceptor.clone() { + let mut shutdown_rx = shutdown_tx.subscribe(); + tokio::spawn(async move { + serve_https( + admin_listener, + admin_router, + tls_acceptor, + http_idle_timeout, + async move { + let _ = shutdown_rx.recv().await; + }, + ) + .await; + }) + } else { + let mut shutdown_rx = shutdown_tx.subscribe(); + tokio::spawn(async move { + serve_http( + admin_listener, + admin_router, + http_idle_timeout, + async move { + let _ = shutdown_rx.recv().await; + }, + ) + .await; + }) + }; + + let grpc_handle = { + let grpc_state = state.clone(); + let grpc_addr = grpc_addr.clone(); + let grpc_tls_acceptor = grpc_tls_acceptor.clone(); + let mut shutdown_rx = shutdown_tx.subscribe(); + tokio::spawn(async move { + let addr = grpc_addr.parse().expect("Invalid gRPC address"); + let grpc_service = VolumeGrpcService { + state: grpc_state.clone(), + }; + if let Some(tls_acceptor) = grpc_tls_acceptor { + let listener = tokio::net::TcpListener::bind(&grpc_addr) + .await + .unwrap_or_else(|e| panic!("Failed to bind gRPC to {}: {}", grpc_addr, e)); + let incoming = grpc_tls_incoming(listener, tls_acceptor); + let reflection_v1 = tonic_reflection::server::Builder::configure() + .register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET) + .build_v1() + .expect("Failed to build gRPC reflection v1 service"); + let reflection_v1alpha = tonic_reflection::server::Builder::configure() + .register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET) + .build_v1alpha() + .expect("Failed to build gRPC reflection v1alpha service"); + info!("gRPC server listening on {} (TLS enabled)", addr); + if let Err(e) = build_grpc_server_builder() + .layer(GrpcRequestIdLayer) + .add_service(reflection_v1) + .add_service(reflection_v1alpha) + .add_service(build_volume_grpc_service(grpc_service)) + .serve_with_incoming_shutdown(incoming, async move { + let _ = shutdown_rx.recv().await; + }) + .await + { + error!("gRPC server error: {}", e); + } + } else { + let reflection_v1 = tonic_reflection::server::Builder::configure() + .register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET) + .build_v1() + .expect("Failed to build gRPC reflection v1 service"); + let reflection_v1alpha = tonic_reflection::server::Builder::configure() + .register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET) + .build_v1alpha() + .expect("Failed to build gRPC reflection v1alpha service"); + info!("gRPC server listening on {}", addr); + if let Err(e) = build_grpc_server_builder() + .layer(GrpcRequestIdLayer) + .add_service(reflection_v1) + .add_service(reflection_v1alpha) + .add_service(build_volume_grpc_service(grpc_service)) + .serve_with_shutdown(addr, async move { + let _ = shutdown_rx.recv().await; + }) + .await + { + error!("gRPC server error: {}", e); + } + } + }) + }; + + // Spawn heartbeat to master (if master addresses are configured) + let heartbeat_handle = { + let master_addrs = config.masters.clone(); + if !master_addrs.is_empty() { + let hb_config = seaweed_volume::server::heartbeat::HeartbeatConfig { + ip: config.ip.clone(), + port: config.port, + grpc_port: config.grpc_port, + public_url: config.public_url.clone(), + data_center: config.data_center.clone(), + rack: config.rack.clone(), + master_addresses: master_addrs.clone(), + pulse_seconds: 5, + }; + let hb_shutdown = shutdown_tx.subscribe(); + let hb_state = state.clone(); + info!("Will send heartbeats to master: {:?}", master_addrs); + Some(tokio::spawn(async move { + seaweed_volume::server::heartbeat::run_heartbeat_with_state( + hb_config, + hb_state, + hb_shutdown, + ) + .await; + })) + } else { + None + } + }; + + let public_handle = if needs_public { + let public_router = + seaweed_volume::server::volume_server::build_public_router(state.clone()); + let public_addr = format!("{}:{}", config.bind_ip, public_port); + let listener = tokio::net::TcpListener::bind(&public_addr) + .await + .unwrap_or_else(|e| panic!("Failed to bind public HTTP to {}: {}", public_addr, e)); + info!("Public HTTP server listening on {}", public_addr); + let mut shutdown_rx = shutdown_tx.subscribe(); + Some(tokio::spawn(async move { + serve_http(listener, public_router, http_idle_timeout, async move { + let _ = shutdown_rx.recv().await; + }) + .await; + })) + } else { + None + }; + + let metrics_handle = if config.metrics_port > 0 { + let metrics_router = build_metrics_router(); + let metrics_addr = format!("{}:{}", config.metrics_ip, config.metrics_port); + info!("Metrics server listening on {}", metrics_addr); + let listener = tokio::net::TcpListener::bind(&metrics_addr) + .await + .unwrap_or_else(|e| panic!("Failed to bind metrics HTTP to {}: {}", metrics_addr, e)); + let mut shutdown_rx = shutdown_tx.subscribe(); + Some(tokio::spawn(async move { + if let Err(e) = axum::serve(listener, metrics_router) + .with_graceful_shutdown(async move { + let _ = shutdown_rx.recv().await; + }) + .await + { + error!("Metrics HTTP server error: {}", e); + } + })) + } else { + None + }; + + let debug_handle = if config.debug { + let debug_addr = format!("0.0.0.0:{}", config.debug_port); + info!("Debug pprof server listening on {}", debug_addr); + let listener = tokio::net::TcpListener::bind(&debug_addr) + .await + .unwrap_or_else(|e| panic!("Failed to bind debug HTTP to {}: {}", debug_addr, e)); + let debug_router = build_debug_router(); + let mut shutdown_rx = shutdown_tx.subscribe(); + Some(tokio::spawn(async move { + if let Err(e) = axum::serve(listener, debug_router) + .with_graceful_shutdown(async move { + let _ = shutdown_rx.recv().await; + }) + .await + { + error!("Debug HTTP server error: {}", e); + } + })) + } else { + None + }; + + let metrics_push_handle = { + let push_state = state.clone(); + let push_instance = format!("{}:{}", config.ip, config.port); + let push_shutdown = shutdown_tx.subscribe(); + Some(tokio::spawn(async move { + run_metrics_push_loop(push_state, push_instance, push_shutdown).await; + })) + }; + + // Wait for all servers + let _ = http_handle.await; + let _ = grpc_handle.await; + if let Some(h) = public_handle { + let _ = h.await; + } + if let Some(h) = metrics_handle { + let _ = h.await; + } + if let Some(h) = debug_handle { + let _ = h.await; + } + if let Some(h) = heartbeat_handle { + let _ = h.await; + } + if let Some(h) = metrics_push_handle { + let _ = h.await; + } + + // Close all volumes (flush and release file handles) matching Go's Shutdown() + state.store.write().unwrap().close(); + + if let Some(cpu_profile) = cpu_profile { + cpu_profile.finish().map_err(std::io::Error::other)?; + } + + info!("Volume server stopped."); + Ok(()) +} + +async fn run_metrics_push_loop( + state: Arc, + instance: String, + mut shutdown_rx: tokio::sync::broadcast::Receiver<()>, +) { + loop { + let push_cfg = { state.metrics_runtime.read().unwrap().push_gateway.clone() }; + + if push_cfg.address.is_empty() || push_cfg.interval_seconds == 0 { + tokio::select! { + _ = state.metrics_notify.notified() => continue, + _ = shutdown_rx.recv() => return, + } + } + + if let Err(e) = metrics::push_metrics_once( + &state.http_client, + &push_cfg.address, + "volumeServer", + &instance, + ) + .await + { + info!("could not push metrics to {}: {}", push_cfg.address, e); + } + + let interval = std::time::Duration::from_secs(push_cfg.interval_seconds.max(1) as u64); + tokio::select! { + _ = tokio::time::sleep(interval) => {} + _ = state.metrics_notify.notified() => {} + _ = shutdown_rx.recv() => return, + } + } +} + +fn grpc_tls_incoming( + listener: tokio::net::TcpListener, + tls_acceptor: TlsAcceptor, +) -> impl tokio_stream::Stream< + Item = Result, std::io::Error>, +> { + async_stream::stream! { + loop { + match listener.accept().await { + Ok((tcp_stream, remote_addr)) => match tls_acceptor.accept(tcp_stream).await { + Ok(tls_stream) => yield Ok(tls_stream), + Err(e) => { + tracing::debug!("gRPC TLS handshake failed from {}: {}", remote_addr, e); + } + }, + Err(e) => { + yield Err(e); + break; + } + } + } + } +} + +/// Serve an axum Router over TLS using tokio-rustls. +/// Accepts TCP connections, performs TLS handshake, then serves HTTP over the encrypted stream. +async fn serve_http( + tcp_listener: tokio::net::TcpListener, + app: axum::Router, + idle_timeout: std::time::Duration, + shutdown_signal: F, +) where + F: std::future::Future + Send + 'static, +{ + use hyper_util::rt::{TokioExecutor, TokioIo}; + use hyper_util::server::conn::auto::Builder as HttpBuilder; + use hyper_util::service::TowerToHyperService; + use tower::Service; + + let mut make_svc = app.into_make_service_with_connect_info::(); + + tokio::pin!(shutdown_signal); + + loop { + tokio::select! { + _ = &mut shutdown_signal => { + info!("HTTP server shutting down"); + break; + } + result = tcp_listener.accept() => { + match result { + Ok((tcp_stream, remote_addr)) => { + let tower_svc = make_svc.call(remote_addr).await.expect("infallible"); + let hyper_svc = TowerToHyperService::new(tower_svc); + tokio::spawn(async move { + let io = TokioIo::new(apply_idle_timeout(tcp_stream, idle_timeout)); + let builder = HttpBuilder::new(TokioExecutor::new()); + if let Err(e) = builder.serve_connection(io, hyper_svc).await { + tracing::debug!("HTTP connection error: {}", e); + } + }); + } + Err(e) => { + error!("Failed to accept TCP connection: {}", e); + } + } + } + } + } +} + +async fn serve_https( + tcp_listener: tokio::net::TcpListener, + app: axum::Router, + tls_acceptor: TlsAcceptor, + idle_timeout: std::time::Duration, + shutdown_signal: F, +) where + F: std::future::Future + Send + 'static, +{ + use hyper_util::rt::{TokioExecutor, TokioIo}; + use hyper_util::server::conn::auto::Builder as HttpBuilder; + use hyper_util::service::TowerToHyperService; + use tower::Service; + + let mut make_svc = app.into_make_service_with_connect_info::(); + + tokio::pin!(shutdown_signal); + + loop { + tokio::select! { + _ = &mut shutdown_signal => { + info!("HTTPS server shutting down"); + break; + } + result = tcp_listener.accept() => { + match result { + Ok((tcp_stream, remote_addr)) => { + let tls_acceptor = tls_acceptor.clone(); + let tower_svc = make_svc.call(remote_addr).await.expect("infallible"); + let hyper_svc = TowerToHyperService::new(tower_svc); + tokio::spawn(async move { + match tls_acceptor.accept(tcp_stream).await { + Ok(tls_stream) => { + let io = TokioIo::new(apply_idle_timeout(tls_stream, idle_timeout)); + let builder = HttpBuilder::new(TokioExecutor::new()); + if let Err(e) = builder.serve_connection(io, hyper_svc).await { + tracing::debug!("HTTPS connection error: {}", e); + } + } + Err(e) => { + tracing::debug!("TLS handshake failed: {}", e); + } + } + }); + } + Err(e) => { + error!("Failed to accept TCP connection: {}", e); + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::{ + build_grpc_server_tls_acceptor, effective_http_tls_policy, tls_policy_is_configured, + }; + use seaweed_volume::security::tls::TlsPolicy; + + fn write_pem(dir: &tempfile::TempDir, name: &str, body: &str) -> String { + let path = dir.path().join(name); + std::fs::write(&path, body).unwrap(); + path.to_string_lossy().into_owned() + } + + #[test] + fn test_grpc_server_tls_requires_ca() { + let dir = tempfile::tempdir().unwrap(); + let cert = write_pem( + &dir, + "server.crt", + "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n", + ); + let key = write_pem( + &dir, + "server.key", + "-----BEGIN PRIVATE KEY-----\nZmFrZQ==\n-----END PRIVATE KEY-----\n", + ); + + assert!( + build_grpc_server_tls_acceptor(&cert, &key, "", &TlsPolicy::default(), "", &[]) + .is_none() + ); + } + + #[test] + fn test_grpc_server_tls_returns_none_when_files_are_missing() { + assert!(build_grpc_server_tls_acceptor( + "/missing/server.crt", + "/missing/server.key", + "/missing/ca.crt", + &TlsPolicy::default(), + "", + &[], + ) + .is_none()); + } + + #[test] + fn test_grpc_server_tls_disables_on_unsupported_tls_policy() { + let dir = tempfile::tempdir().unwrap(); + let cert = write_pem( + &dir, + "server.crt", + "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n", + ); + let key = write_pem( + &dir, + "server.key", + "-----BEGIN PRIVATE KEY-----\nZmFrZQ==\n-----END PRIVATE KEY-----\n", + ); + let ca = write_pem( + &dir, + "ca.crt", + "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n", + ); + + assert!(build_grpc_server_tls_acceptor( + &cert, + &key, + &ca, + &TlsPolicy { + min_version: "TLS 1.0".to_string(), + max_version: "TLS 1.1".to_string(), + cipher_suites: String::new(), + }, + "", + &[], + ) + .is_none()); + } + + #[test] + fn test_effective_http_tls_policy_ignores_tls_policy_without_ca() { + let configured = TlsPolicy { + min_version: "TLS 1.3".to_string(), + max_version: "TLS 1.3".to_string(), + cipher_suites: "TLS_AES_128_GCM_SHA256".to_string(), + }; + assert_eq!( + effective_http_tls_policy("", &configured), + TlsPolicy::default() + ); + assert_eq!( + effective_http_tls_policy("/etc/seaweedfs/http-ca.pem", &configured), + configured + ); + } + + #[test] + fn test_tls_policy_is_configured_detects_non_empty_fields() { + assert!(!tls_policy_is_configured(&TlsPolicy::default())); + assert!(tls_policy_is_configured(&TlsPolicy { + min_version: "TLS 1.2".to_string(), + max_version: String::new(), + cipher_suites: String::new(), + })); + } +} diff --git a/seaweed-volume/src/metrics.rs b/seaweed-volume/src/metrics.rs new file mode 100644 index 000000000..572786949 --- /dev/null +++ b/seaweed-volume/src/metrics.rs @@ -0,0 +1,448 @@ +//! Prometheus metrics for the volume server. +//! +//! Mirrors the Go SeaweedFS volume server metrics. + +use prometheus::{ + self, Encoder, GaugeVec, HistogramOpts, HistogramVec, IntCounterVec, IntGauge, IntGaugeVec, + Opts, Registry, TextEncoder, +}; +use std::sync::Once; + +use crate::version; + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct PushGatewayConfig { + pub address: String, + pub interval_seconds: u32, +} + +lazy_static::lazy_static! { + pub static ref REGISTRY: Registry = Registry::new(); + + // ---- Request metrics (Go: VolumeServerRequestCounter, VolumeServerRequestHistogram) ---- + + /// Request counter with labels `type` (HTTP method) and `code` (HTTP status). + pub static ref REQUEST_COUNTER: IntCounterVec = IntCounterVec::new( + Opts::new("SeaweedFS_volumeServer_request_total", "Volume server requests"), + &["type", "code"], + ).expect("metric can be created"); + + /// Request duration histogram with label `type` (HTTP method). + pub static ref REQUEST_DURATION: HistogramVec = HistogramVec::new( + HistogramOpts::new( + "SeaweedFS_volumeServer_request_seconds", + "Volume server request duration in seconds", + ).buckets(exponential_buckets(0.0001, 2.0, 24)), + &["type"], + ).expect("metric can be created"); + + // ---- Handler counters (Go: VolumeServerHandlerCounter) ---- + + /// Handler-level operation counter with label `type`. + pub static ref HANDLER_COUNTER: IntCounterVec = IntCounterVec::new( + Opts::new("SeaweedFS_volumeServer_handler_total", "Volume server handler counters"), + &["type"], + ).expect("metric can be created"); + + // ---- Vacuuming metrics (Go: VolumeServerVacuuming*) ---- + + /// Vacuuming compact counter with label `success` (true/false). + pub static ref VACUUMING_COMPACT_COUNTER: IntCounterVec = IntCounterVec::new( + Opts::new("SeaweedFS_volumeServer_vacuuming_compact_count", "Counter of volume vacuuming Compact counter"), + &["success"], + ).expect("metric can be created"); + + /// Vacuuming commit counter with label `success` (true/false). + pub static ref VACUUMING_COMMIT_COUNTER: IntCounterVec = IntCounterVec::new( + Opts::new("SeaweedFS_volumeServer_vacuuming_commit_count", "Counter of volume vacuuming commit counter"), + &["success"], + ).expect("metric can be created"); + + /// Vacuuming duration histogram with label `type` (compact/commit). + pub static ref VACUUMING_HISTOGRAM: HistogramVec = HistogramVec::new( + HistogramOpts::new( + "SeaweedFS_volumeServer_vacuuming_seconds", + "Volume vacuuming duration in seconds", + ).buckets(exponential_buckets(0.0001, 2.0, 24)), + &["type"], + ).expect("metric can be created"); + + // ---- Volume gauges (Go: VolumeServerVolumeGauge, VolumeServerReadOnlyVolumeGauge) ---- + + /// Volumes per collection and type (volume/ec_shards). + pub static ref VOLUME_GAUGE: GaugeVec = GaugeVec::new( + Opts::new("SeaweedFS_volumeServer_volumes", "Number of volumes"), + &["collection", "type"], + ).expect("metric can be created"); + + /// Read-only volumes per collection and type. + pub static ref READ_ONLY_VOLUME_GAUGE: GaugeVec = GaugeVec::new( + Opts::new("SeaweedFS_volumeServer_read_only_volumes", "Number of read-only volumes."), + &["collection", "type"], + ).expect("metric can be created"); + + /// Maximum number of volumes this server can hold. + pub static ref MAX_VOLUMES: IntGauge = IntGauge::new( + "SeaweedFS_volumeServer_max_volumes", + "Maximum number of volumes", + ).expect("metric can be created"); + + // ---- Disk size gauges (Go: VolumeServerDiskSizeGauge) ---- + + /// Actual disk size used by volumes per collection and type (normal/deleted_bytes/ec). + pub static ref DISK_SIZE_GAUGE: GaugeVec = GaugeVec::new( + Opts::new("SeaweedFS_volumeServer_total_disk_size", "Actual disk size used by volumes"), + &["collection", "type"], + ).expect("metric can be created"); + + // ---- Resource gauges (Go: VolumeServerResourceGauge) ---- + + /// Disk resource usage per directory and type (all/used/free/avail). + pub static ref RESOURCE_GAUGE: GaugeVec = GaugeVec::new( + Opts::new("SeaweedFS_volumeServer_resource", "Server resource usage"), + &["name", "type"], + ).expect("metric can be created"); + + // ---- In-flight gauges (Go: VolumeServerInFlightRequestsGauge, InFlightDownload/UploadSize) ---- + + /// In-flight requests per HTTP method. + pub static ref INFLIGHT_REQUESTS_GAUGE: IntGaugeVec = IntGaugeVec::new( + Opts::new("SeaweedFS_volumeServer_in_flight_requests", "Current number of in-flight requests being handled by volume server."), + &["type"], + ).expect("metric can be created"); + + /// Concurrent download limit in bytes. + pub static ref CONCURRENT_DOWNLOAD_LIMIT: IntGauge = IntGauge::new( + "SeaweedFS_volumeServer_concurrent_download_limit", + "Limit for total concurrent download size in bytes", + ).expect("metric can be created"); + + /// Concurrent upload limit in bytes. + pub static ref CONCURRENT_UPLOAD_LIMIT: IntGauge = IntGauge::new( + "SeaweedFS_volumeServer_concurrent_upload_limit", + "Limit for total concurrent upload size in bytes", + ).expect("metric can be created"); + + /// Current in-flight download bytes. + pub static ref INFLIGHT_DOWNLOAD_SIZE: IntGauge = IntGauge::new( + "SeaweedFS_volumeServer_in_flight_download_size", + "In flight total download size.", + ).expect("metric can be created"); + + /// Current in-flight upload bytes. + pub static ref INFLIGHT_UPLOAD_SIZE: IntGauge = IntGauge::new( + "SeaweedFS_volumeServer_in_flight_upload_size", + "In flight total upload size.", + ).expect("metric can be created"); + + // ---- Legacy aliases for backward compat with existing code ---- + + /// Total number of volumes on this server (flat gauge). + pub static ref VOLUMES_TOTAL: IntGauge = IntGauge::new( + "volume_server_volumes_total", + "Total number of volumes", + ).expect("metric can be created"); + + /// Disk size in bytes per directory. + pub static ref DISK_SIZE_BYTES: IntGaugeVec = IntGaugeVec::new( + Opts::new("volume_server_disk_size_bytes", "Disk size in bytes"), + &["dir"], + ).expect("metric can be created"); + + /// Disk free bytes per directory. + pub static ref DISK_FREE_BYTES: IntGaugeVec = IntGaugeVec::new( + Opts::new("volume_server_disk_free_bytes", "Disk free space in bytes"), + &["dir"], + ).expect("metric can be created"); + + /// Current number of in-flight requests (flat gauge). + pub static ref INFLIGHT_REQUESTS: IntGauge = IntGauge::new( + "volume_server_inflight_requests", + "Current number of in-flight requests", + ).expect("metric can be created"); + + /// Total number of files stored across all volumes. + pub static ref VOLUME_FILE_COUNT: IntGauge = IntGauge::new( + "volume_server_volume_file_count", + "Total number of files stored across all volumes", + ).expect("metric can be created"); + + // ---- Build info (Go: BuildInfo) ---- + + /// Build information gauge, always set to 1. Matches Go: + /// Namespace="SeaweedFS", Subsystem="build", Name="info", + /// labels: version, commit, sizelimit, goos, goarch. + pub static ref BUILD_INFO: GaugeVec = GaugeVec::new( + Opts::new("SeaweedFS_build_info", "A metric with a constant '1' value labeled by version, commit, sizelimit, goos, and goarch from which SeaweedFS was built."), + &["version", "commit", "sizelimit", "goos", "goarch"], + ).expect("metric can be created"); +} + +/// Generate exponential bucket boundaries for histograms. +fn exponential_buckets(start: f64, factor: f64, count: usize) -> Vec { + let mut buckets = Vec::with_capacity(count); + let mut val = start; + for _ in 0..count { + buckets.push(val); + val *= factor; + } + buckets +} + +// Handler counter type constants (matches Go's metrics_names.go). +pub const WRITE_TO_LOCAL_DISK: &str = "writeToLocalDisk"; +pub const WRITE_TO_REPLICAS: &str = "writeToReplicas"; +pub const DOWNLOAD_LIMIT_COND: &str = "downloadLimitCondition"; +pub const UPLOAD_LIMIT_COND: &str = "uploadLimitCondition"; +pub const READ_PROXY_REQ: &str = "readProxyRequest"; +pub const READ_REDIRECT_REQ: &str = "readRedirectRequest"; +pub const EMPTY_READ_PROXY_LOC: &str = "emptyReadProxyLocaction"; +pub const FAILED_READ_PROXY_REQ: &str = "failedReadProxyRequest"; + +// Error metric name constants. +pub const ERROR_SIZE_MISMATCH_OFFSET_SIZE: &str = "errorSizeMismatchOffsetSize"; +pub const ERROR_SIZE_MISMATCH: &str = "errorSizeMismatch"; +pub const ERROR_CRC: &str = "errorCRC"; +pub const ERROR_INDEX_OUT_OF_RANGE: &str = "errorIndexOutOfRange"; +pub const ERROR_GET_NOT_FOUND: &str = "errorGetNotFound"; +pub const ERROR_GET_INTERNAL: &str = "errorGetInternal"; +pub const ERROR_WRITE_TO_LOCAL_DISK: &str = "errorWriteToLocalDisk"; +pub const ERROR_UNMARSHAL_PAIRS: &str = "errorUnmarshalPairs"; +pub const ERROR_WRITE_TO_REPLICAS: &str = "errorWriteToReplicas"; + +// Go volume heartbeat metric label values. +pub const READ_ONLY_LABEL_IS_READ_ONLY: &str = "IsReadOnly"; +pub const READ_ONLY_LABEL_NO_WRITE_OR_DELETE: &str = "noWriteOrDelete"; +pub const READ_ONLY_LABEL_NO_WRITE_CAN_DELETE: &str = "noWriteCanDelete"; +pub const READ_ONLY_LABEL_IS_DISK_SPACE_LOW: &str = "isDiskSpaceLow"; +pub const DISK_SIZE_LABEL_NORMAL: &str = "normal"; +pub const DISK_SIZE_LABEL_DELETED_BYTES: &str = "deleted_bytes"; +pub const DISK_SIZE_LABEL_EC: &str = "ec"; + +static REGISTER_METRICS: Once = Once::new(); + +/// Register all metrics with the custom registry. +/// Call this once at startup. +pub fn register_metrics() { + REGISTER_METRICS.call_once(|| { + let metrics: Vec> = vec![ + // New Go-compatible metrics + Box::new(REQUEST_COUNTER.clone()), + Box::new(REQUEST_DURATION.clone()), + Box::new(HANDLER_COUNTER.clone()), + Box::new(VACUUMING_COMPACT_COUNTER.clone()), + Box::new(VACUUMING_COMMIT_COUNTER.clone()), + Box::new(VACUUMING_HISTOGRAM.clone()), + Box::new(VOLUME_GAUGE.clone()), + Box::new(READ_ONLY_VOLUME_GAUGE.clone()), + Box::new(MAX_VOLUMES.clone()), + Box::new(DISK_SIZE_GAUGE.clone()), + Box::new(RESOURCE_GAUGE.clone()), + Box::new(INFLIGHT_REQUESTS_GAUGE.clone()), + Box::new(CONCURRENT_DOWNLOAD_LIMIT.clone()), + Box::new(CONCURRENT_UPLOAD_LIMIT.clone()), + Box::new(INFLIGHT_DOWNLOAD_SIZE.clone()), + Box::new(INFLIGHT_UPLOAD_SIZE.clone()), + // Legacy metrics + Box::new(VOLUMES_TOTAL.clone()), + Box::new(DISK_SIZE_BYTES.clone()), + Box::new(DISK_FREE_BYTES.clone()), + Box::new(INFLIGHT_REQUESTS.clone()), + Box::new(VOLUME_FILE_COUNT.clone()), + // Build info + Box::new(BUILD_INFO.clone()), + ]; + for m in metrics { + REGISTRY.register(m).expect("metric registered"); + } + + // Set build info gauge to 1 with version/commit/sizelimit/os/arch labels (matches Go). + BUILD_INFO + .with_label_values(&[ + version::version(), + version::commit(), + version::size_limit(), + std::env::consts::OS, + std::env::consts::ARCH, + ]) + .set(1.0); + }); +} + +/// Gather all metrics and encode them in Prometheus text exposition format. +pub fn gather_metrics() -> String { + let encoder = TextEncoder::new(); + let metric_families = REGISTRY.gather(); + let mut buffer = Vec::new(); + encoder + .encode(&metric_families, &mut buffer) + .expect("encoding metrics"); + String::from_utf8(buffer).expect("metrics are valid UTF-8") +} + +pub fn delete_collection_metrics(collection: &str) { + // Mirrors Go's DeletePartialMatch(prometheus.Labels{"collection": collection}) + // which removes ALL metric entries matching the collection label, regardless + // of other label values (like "type"). We gather the metric families to discover + // all type values dynamically, matching Go's partial-match behavior. + delete_partial_match_collection(&VOLUME_GAUGE, collection); + delete_partial_match_collection(&READ_ONLY_VOLUME_GAUGE, collection); + delete_partial_match_collection(&DISK_SIZE_GAUGE, collection); +} + +/// Remove all metric entries from a GaugeVec where the "collection" label matches. +/// This emulates Go's `DeletePartialMatch(prometheus.Labels{"collection": collection})`. +fn delete_partial_match_collection(gauge: &GaugeVec, collection: &str) { + use prometheus::core::Collector; + let families = gauge.collect(); + for family in &families { + for metric in family.get_metric() { + let labels = metric.get_label(); + let mut matches_collection = false; + let mut type_value = None; + for label in labels { + if label.get_name() == "collection" && label.get_value() == collection { + matches_collection = true; + } + if label.get_name() == "type" { + type_value = Some(label.get_value().to_string()); + } + } + if matches_collection { + if let Some(ref tv) = type_value { + let _ = gauge.remove_label_values(&[collection, tv]); + } + } + } + } +} + +pub fn build_pushgateway_url(address: &str, job: &str, instance: &str) -> String { + let base = if address.starts_with("http://") || address.starts_with("https://") { + address.to_string() + } else { + format!("http://{}", address) + }; + let base = base.trim_end_matches('/'); + format!("{}/metrics/job/{}/instance/{}", base, job, instance) +} + +pub async fn push_metrics_once( + client: &reqwest::Client, + address: &str, + job: &str, + instance: &str, +) -> Result<(), String> { + let url = build_pushgateway_url(address, job, instance); + let response = client + .put(&url) + .header( + reqwest::header::CONTENT_TYPE, + "text/plain; version=0.0.4; charset=utf-8", + ) + .body(gather_metrics()) + .send() + .await + .map_err(|e| format!("push metrics request failed: {}", e))?; + + if response.status().is_success() { + Ok(()) + } else { + Err(format!( + "push metrics failed with status {}", + response.status() + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use axum::{routing::put, Router}; + use std::sync::{Arc, Mutex}; + + #[test] + fn test_gather_metrics_returns_text() { + register_metrics(); + REQUEST_COUNTER.with_label_values(&["GET", "200"]).inc(); + let output = gather_metrics(); + assert!(output.contains("SeaweedFS_volumeServer_request_total")); + } + + #[test] + fn test_build_pushgateway_url() { + assert_eq!( + build_pushgateway_url("localhost:9091", "volumeServer", "test-instance"), + "http://localhost:9091/metrics/job/volumeServer/instance/test-instance" + ); + assert_eq!( + build_pushgateway_url("https://push.example", "volumeServer", "node-a"), + "https://push.example/metrics/job/volumeServer/instance/node-a" + ); + } + + #[tokio::test] + async fn test_push_metrics_once() { + register_metrics(); + + let captured = Arc::new(Mutex::new(None::)); + let captured_clone = captured.clone(); + + let app = Router::new().route( + "/metrics/job/volumeServer/instance/test-instance", + put(move |body: String| { + let captured = captured_clone.clone(); + async move { + *captured.lock().unwrap() = Some(body); + "ok" + } + }), + ); + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let server = tokio::spawn(async move { + axum::serve(listener, app).await.unwrap(); + }); + + let client = reqwest::Client::new(); + push_metrics_once( + &client, + &format!("127.0.0.1:{}", addr.port()), + "volumeServer", + "test-instance", + ) + .await + .unwrap(); + + let body = captured.lock().unwrap().clone().unwrap(); + assert!(body.contains("SeaweedFS_volumeServer_request_total")); + + server.abort(); + } + + #[test] + fn test_delete_collection_metrics_removes_collection_labelsets() { + register_metrics(); + + VOLUME_GAUGE.with_label_values(&["pics", "volume"]).set(2.0); + VOLUME_GAUGE.with_label_values(&["pics", "ec_shards"]).set(3.0); + READ_ONLY_VOLUME_GAUGE + .with_label_values(&["pics", "volume"]) + .set(1.0); + DISK_SIZE_GAUGE + .with_label_values(&["pics", "normal"]) + .set(10.0); + DISK_SIZE_GAUGE + .with_label_values(&["pics", "deleted_bytes"]) + .set(4.0); + + delete_collection_metrics("pics"); + + let output = gather_metrics(); + assert!(!output.contains("collection=\"pics\",type=\"volume\"")); + assert!(!output.contains("collection=\"pics\",type=\"ec_shards\"")); + assert!(!output.contains("collection=\"pics\",type=\"normal\"")); + assert!(!output.contains("collection=\"pics\",type=\"deleted_bytes\"")); + } +} diff --git a/seaweed-volume/src/remote_storage/mod.rs b/seaweed-volume/src/remote_storage/mod.rs new file mode 100644 index 000000000..599333ede --- /dev/null +++ b/seaweed-volume/src/remote_storage/mod.rs @@ -0,0 +1,157 @@ +//! Remote storage backends for tiered storage support. +//! +//! Provides a trait-based abstraction over cloud storage providers (S3, GCS, Azure, etc.) +//! and a registry to create clients from protobuf RemoteConf messages. + +pub mod s3; +pub mod s3_tier; + +use crate::pb::remote_pb::{RemoteConf, RemoteStorageLocation}; + +/// Error type for remote storage operations. +#[derive(Debug, thiserror::Error)] +pub enum RemoteStorageError { + #[error("remote storage type {0} not found")] + TypeNotFound(String), + #[error("remote object not found: {0}")] + ObjectNotFound(String), + #[error("remote storage error: {0}")] + Other(String), + #[error("io error: {0}")] + Io(#[from] std::io::Error), +} + +/// Metadata about a remote file entry. +#[derive(Debug, Clone)] +pub struct RemoteEntry { + pub size: i64, + pub last_modified_at: i64, // Unix seconds + pub e_tag: String, + pub storage_name: String, +} + +/// Trait for remote storage clients. Matches Go's RemoteStorageClient interface. +#[async_trait::async_trait] +pub trait RemoteStorageClient: Send + Sync { + /// Read (part of) a file from remote storage. + async fn read_file( + &self, + loc: &RemoteStorageLocation, + offset: i64, + size: i64, + ) -> Result, RemoteStorageError>; + + /// Write a file to remote storage. + async fn write_file( + &self, + loc: &RemoteStorageLocation, + data: &[u8], + ) -> Result; + + /// Get metadata for a file in remote storage. + async fn stat_file( + &self, + loc: &RemoteStorageLocation, + ) -> Result; + + /// Delete a file from remote storage. + async fn delete_file(&self, loc: &RemoteStorageLocation) -> Result<(), RemoteStorageError>; + + /// List all buckets. + async fn list_buckets(&self) -> Result, RemoteStorageError>; + + /// The RemoteConf used to create this client. + fn remote_conf(&self) -> &RemoteConf; +} + +/// Create a new remote storage client from a RemoteConf. +pub fn make_remote_storage_client( + conf: &RemoteConf, +) -> Result, RemoteStorageError> { + match conf.r#type.as_str() { + // All S3-compatible backends use the same client with different credentials + "s3" | "wasabi" | "backblaze" | "aliyun" | "tencent" | "baidu" | "filebase" | "storj" + | "contabo" => { + let (access_key, secret_key, endpoint, region) = extract_s3_credentials(conf); + Ok(Box::new(s3::S3RemoteStorageClient::new( + conf.clone(), + &access_key, + &secret_key, + ®ion, + &endpoint, + conf.s3_force_path_style, + ))) + } + other => Err(RemoteStorageError::TypeNotFound(other.to_string())), + } +} + +/// Extract S3-compatible credentials from a RemoteConf based on its type. +fn extract_s3_credentials(conf: &RemoteConf) -> (String, String, String, String) { + match conf.r#type.as_str() { + "s3" => ( + conf.s3_access_key.clone(), + conf.s3_secret_key.clone(), + conf.s3_endpoint.clone(), + if conf.s3_region.is_empty() { + "us-east-1".to_string() + } else { + conf.s3_region.clone() + }, + ), + "wasabi" => ( + conf.wasabi_access_key.clone(), + conf.wasabi_secret_key.clone(), + conf.wasabi_endpoint.clone(), + conf.wasabi_region.clone(), + ), + "backblaze" => ( + conf.backblaze_key_id.clone(), + conf.backblaze_application_key.clone(), + conf.backblaze_endpoint.clone(), + conf.backblaze_region.clone(), + ), + "aliyun" => ( + conf.aliyun_access_key.clone(), + conf.aliyun_secret_key.clone(), + conf.aliyun_endpoint.clone(), + conf.aliyun_region.clone(), + ), + "tencent" => ( + conf.tencent_secret_id.clone(), + conf.tencent_secret_key.clone(), + conf.tencent_endpoint.clone(), + String::new(), + ), + "baidu" => ( + conf.baidu_access_key.clone(), + conf.baidu_secret_key.clone(), + conf.baidu_endpoint.clone(), + conf.baidu_region.clone(), + ), + "filebase" => ( + conf.filebase_access_key.clone(), + conf.filebase_secret_key.clone(), + conf.filebase_endpoint.clone(), + String::new(), + ), + "storj" => ( + conf.storj_access_key.clone(), + conf.storj_secret_key.clone(), + conf.storj_endpoint.clone(), + String::new(), + ), + "contabo" => ( + conf.contabo_access_key.clone(), + conf.contabo_secret_key.clone(), + conf.contabo_endpoint.clone(), + conf.contabo_region.clone(), + ), + _ => ( + conf.s3_access_key.clone(), + conf.s3_secret_key.clone(), + conf.s3_endpoint.clone(), + conf.s3_region.clone(), + ), + } +} diff --git a/seaweed-volume/src/remote_storage/s3.rs b/seaweed-volume/src/remote_storage/s3.rs new file mode 100644 index 000000000..bac5485ae --- /dev/null +++ b/seaweed-volume/src/remote_storage/s3.rs @@ -0,0 +1,186 @@ +//! S3-compatible remote storage client. +//! +//! Works with AWS S3, MinIO, SeaweedFS S3, and all S3-compatible providers. + +use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region}; +use aws_sdk_s3::primitives::ByteStream; +use aws_sdk_s3::Client; + +use super::{RemoteEntry, RemoteStorageClient, RemoteStorageError}; +use crate::pb::remote_pb::{RemoteConf, RemoteStorageLocation}; + +/// S3-compatible remote storage client. +pub struct S3RemoteStorageClient { + client: Client, + conf: RemoteConf, +} + +impl S3RemoteStorageClient { + /// Create a new S3 client from credentials and endpoint configuration. + pub fn new( + conf: RemoteConf, + access_key: &str, + secret_key: &str, + region: &str, + endpoint: &str, + force_path_style: bool, + ) -> Self { + let region = if region.is_empty() { + "us-east-1" + } else { + region + }; + + let credentials = Credentials::new( + access_key, + secret_key, + None, // session token + None, // expiry + "seaweedfs-volume", + ); + + let mut s3_config = aws_sdk_s3::Config::builder() + .behavior_version(BehaviorVersion::latest()) + .region(Region::new(region.to_string())) + .credentials_provider(credentials) + .force_path_style(force_path_style); + + if !endpoint.is_empty() { + s3_config = s3_config.endpoint_url(endpoint); + } + + let client = Client::from_conf(s3_config.build()); + + S3RemoteStorageClient { client, conf } + } +} + +#[async_trait::async_trait] +impl RemoteStorageClient for S3RemoteStorageClient { + async fn read_file( + &self, + loc: &RemoteStorageLocation, + offset: i64, + size: i64, + ) -> Result, RemoteStorageError> { + let key = loc.path.trim_start_matches('/'); + + let mut req = self.client.get_object().bucket(&loc.bucket).key(key); + + // Set byte range if specified + if size > 0 { + let end = offset + size - 1; + req = req.range(format!("bytes={}-{}", offset, end)); + } else if offset > 0 { + req = req.range(format!("bytes={}-", offset)); + } + + let resp = req.send().await.map_err(|e| { + let msg = format!("{}", e); + if msg.contains("NoSuchKey") || msg.contains("404") { + RemoteStorageError::ObjectNotFound(format!("{}/{}", loc.bucket, key)) + } else { + RemoteStorageError::Other(format!("s3 get object: {}", e)) + } + })?; + + let data = resp + .body + .collect() + .await + .map_err(|e| RemoteStorageError::Other(format!("s3 read body: {}", e)))?; + + Ok(data.into_bytes().to_vec()) + } + + async fn write_file( + &self, + loc: &RemoteStorageLocation, + data: &[u8], + ) -> Result { + let key = loc.path.trim_start_matches('/'); + + let resp = self + .client + .put_object() + .bucket(&loc.bucket) + .key(key) + .body(ByteStream::from(data.to_vec())) + .send() + .await + .map_err(|e| RemoteStorageError::Other(format!("s3 put object: {}", e)))?; + + Ok(RemoteEntry { + size: data.len() as i64, + last_modified_at: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() as i64, + e_tag: resp.e_tag().unwrap_or_default().to_string(), + storage_name: loc.name.clone(), + }) + } + + async fn stat_file( + &self, + loc: &RemoteStorageLocation, + ) -> Result { + let key = loc.path.trim_start_matches('/'); + + let resp = self + .client + .head_object() + .bucket(&loc.bucket) + .key(key) + .send() + .await + .map_err(|e| { + let msg = format!("{}", e); + if msg.contains("404") || msg.contains("NotFound") { + RemoteStorageError::ObjectNotFound(format!("{}/{}", loc.bucket, key)) + } else { + RemoteStorageError::Other(format!("s3 head object: {}", e)) + } + })?; + + Ok(RemoteEntry { + size: resp.content_length().unwrap_or(0), + last_modified_at: resp.last_modified().map(|t| t.secs()).unwrap_or(0), + e_tag: resp.e_tag().unwrap_or_default().to_string(), + storage_name: loc.name.clone(), + }) + } + + async fn delete_file(&self, loc: &RemoteStorageLocation) -> Result<(), RemoteStorageError> { + let key = loc.path.trim_start_matches('/'); + + self.client + .delete_object() + .bucket(&loc.bucket) + .key(key) + .send() + .await + .map_err(|e| RemoteStorageError::Other(format!("s3 delete object: {}", e)))?; + + Ok(()) + } + + async fn list_buckets(&self) -> Result, RemoteStorageError> { + let resp = self + .client + .list_buckets() + .send() + .await + .map_err(|e| RemoteStorageError::Other(format!("s3 list buckets: {}", e)))?; + + Ok(resp + .buckets() + .iter() + .filter_map(|b| b.name().map(String::from)) + .collect()) + } + + fn remote_conf(&self) -> &RemoteConf { + &self.conf + } +} diff --git a/seaweed-volume/src/remote_storage/s3_tier.rs b/seaweed-volume/src/remote_storage/s3_tier.rs new file mode 100644 index 000000000..be88adcf8 --- /dev/null +++ b/seaweed-volume/src/remote_storage/s3_tier.rs @@ -0,0 +1,514 @@ +//! S3-compatible tiered storage backend for volume .dat file upload/download. +//! +//! Provides multipart upload and concurrent download with progress callbacks, +//! matching the Go SeaweedFS S3 backend behavior. + +use std::collections::HashMap; +use std::future::Future; +use std::sync::{Arc, OnceLock, RwLock}; + +use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region}; +use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart}; +use aws_sdk_s3::Client; +use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio::sync::Semaphore; + +/// Concurrency limit for multipart upload/download (matches Go's s3manager). +const CONCURRENCY: usize = 5; + +/// Configuration for an S3 tier backend. +#[derive(Debug, Clone)] +pub struct S3TierConfig { + pub access_key: String, + pub secret_key: String, + pub region: String, + pub bucket: String, + pub endpoint: String, + pub storage_class: String, + pub force_path_style: bool, +} + +/// S3 tier backend for uploading/downloading volume .dat files. +pub struct S3TierBackend { + client: Client, + pub bucket: String, + pub storage_class: String, +} + +impl S3TierBackend { + /// Create a new S3 tier backend from configuration. + pub fn new(config: &S3TierConfig) -> Self { + let region = if config.region.is_empty() { + "us-east-1" + } else { + &config.region + }; + + let credentials = Credentials::new( + &config.access_key, + &config.secret_key, + None, + None, + "seaweedfs-volume-tier", + ); + + let mut s3_config = aws_sdk_s3::Config::builder() + .behavior_version(BehaviorVersion::latest()) + .region(Region::new(region.to_string())) + .credentials_provider(credentials) + .force_path_style(config.force_path_style); + + if !config.endpoint.is_empty() { + s3_config = s3_config.endpoint_url(&config.endpoint); + } + + let client = Client::from_conf(s3_config.build()); + + S3TierBackend { + client, + bucket: config.bucket.clone(), + storage_class: if config.storage_class.is_empty() { + "STANDARD_IA".to_string() + } else { + config.storage_class.clone() + }, + } + } + + /// Upload a local file to S3 using multipart upload with concurrent parts + /// and progress reporting. + /// + /// Returns (s3_key, file_size) on success. + /// The progress callback receives (bytes_uploaded, percentage). + /// Uses 64MB part size and 5 concurrent uploads (matches Go s3manager). + pub async fn upload_file( + &self, + file_path: &str, + progress_fn: F, + ) -> Result<(String, u64), String> + where + F: FnMut(i64, f32) + Send + Sync + 'static, + { + let key = uuid::Uuid::new_v4().to_string(); + + let metadata = tokio::fs::metadata(file_path) + .await + .map_err(|e| format!("failed to stat file {}: {}", file_path, e))?; + let file_size = metadata.len(); + + // Calculate part size: start at 64MB, scale up for very large files (matches Go) + let mut part_size: u64 = 64 * 1024 * 1024; + while part_size * 1000 < file_size { + part_size *= 4; + } + + // Initiate multipart upload + let create_resp = self + .client + .create_multipart_upload() + .bucket(&self.bucket) + .key(&key) + .storage_class( + self.storage_class + .parse() + .unwrap_or(aws_sdk_s3::types::StorageClass::StandardIa), + ) + .send() + .await + .map_err(|e| format!("failed to create multipart upload: {}", e))?; + + let upload_id = create_resp + .upload_id() + .ok_or_else(|| "no upload_id in multipart upload response".to_string())? + .to_string(); + + // Build list of (part_number, offset, size) for all parts + let mut parts_plan: Vec<(i32, u64, usize)> = Vec::new(); + let mut offset: u64 = 0; + let mut part_number: i32 = 1; + while offset < file_size { + let remaining = file_size - offset; + let this_part_size = std::cmp::min(part_size, remaining) as usize; + parts_plan.push((part_number, offset, this_part_size)); + offset += this_part_size as u64; + part_number += 1; + } + + // Upload parts concurrently with a semaphore limiting to CONCURRENCY + let semaphore = Arc::new(Semaphore::new(CONCURRENCY)); + let client = &self.client; + let bucket = &self.bucket; + let file_path_owned = file_path.to_string(); + let progress = Arc::new(std::sync::Mutex::new((0u64, progress_fn))); + + let mut handles = Vec::with_capacity(parts_plan.len()); + for (pn, off, size) in parts_plan { + let sem = semaphore.clone(); + let client = client.clone(); + let bucket = bucket.clone(); + let key = key.clone(); + let upload_id = upload_id.clone(); + let fp = file_path_owned.clone(); + let progress = progress.clone(); + + handles.push(tokio::spawn(async move { + let _permit = sem + .acquire() + .await + .map_err(|e| format!("semaphore error: {}", e))?; + + // Read this part's data from the file at the correct offset + let mut file = tokio::fs::File::open(&fp) + .await + .map_err(|e| format!("failed to open file {}: {}", fp, e))?; + file.seek(std::io::SeekFrom::Start(off)) + .await + .map_err(|e| format!("failed to seek to offset {}: {}", off, e))?; + let mut buf = vec![0u8; size]; + file.read_exact(&mut buf) + .await + .map_err(|e| format!("failed to read file at offset {}: {}", off, e))?; + + let upload_part_resp = client + .upload_part() + .bucket(&bucket) + .key(&key) + .upload_id(&upload_id) + .part_number(pn) + .body(buf.into()) + .send() + .await + .map_err(|e| { + format!("failed to upload part {} at offset {}: {}", pn, off, e) + })?; + + let e_tag = upload_part_resp.e_tag().unwrap_or_default().to_string(); + + // Report progress + { + let mut guard = progress.lock().unwrap(); + guard.0 += size as u64; + let uploaded = guard.0; + let pct = if file_size > 0 { + (uploaded as f32 * 100.0) / file_size as f32 + } else { + 100.0 + }; + (guard.1)(uploaded as i64, pct); + } + + Ok::<_, String>( + CompletedPart::builder() + .e_tag(e_tag) + .part_number(pn) + .build(), + ) + })); + } + + // Collect results, preserving part order + let mut completed_parts = Vec::with_capacity(handles.len()); + for handle in handles { + let part = handle + .await + .map_err(|e| format!("upload task panicked: {}", e))??; + completed_parts.push(part); + } + + // Complete multipart upload + let completed_upload = CompletedMultipartUpload::builder() + .set_parts(Some(completed_parts)) + .build(); + + self.client + .complete_multipart_upload() + .bucket(&self.bucket) + .key(&key) + .upload_id(&upload_id) + .multipart_upload(completed_upload) + .send() + .await + .map_err(|e| format!("failed to complete multipart upload: {}", e))?; + + Ok((key, file_size)) + } + + /// Download a file from S3 to a local path with concurrent range requests + /// and progress reporting. + /// + /// Returns the file size on success. + /// Uses 64MB part size and 5 concurrent downloads (matches Go s3manager). + pub async fn download_file( + &self, + dest_path: &str, + key: &str, + progress_fn: F, + ) -> Result + where + F: FnMut(i64, f32) + Send + Sync + 'static, + { + // Get file size first + let head_resp = self + .client + .head_object() + .bucket(&self.bucket) + .key(key) + .send() + .await + .map_err(|e| format!("failed to head object {}: {}", key, e))?; + + let file_size = head_resp.content_length().unwrap_or(0) as u64; + + // Pre-allocate file to full size so concurrent WriteAt-style writes work + { + let file = tokio::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(dest_path) + .await + .map_err(|e| format!("failed to open dest file {}: {}", dest_path, e))?; + file.set_len(file_size) + .await + .map_err(|e| format!("failed to set file length: {}", e))?; + } + + let part_size: u64 = 64 * 1024 * 1024; + + // Build list of (offset, size) for all parts + let mut parts_plan: Vec<(u64, u64)> = Vec::new(); + let mut offset: u64 = 0; + while offset < file_size { + let remaining = file_size - offset; + let this_part_size = std::cmp::min(part_size, remaining); + parts_plan.push((offset, this_part_size)); + offset += this_part_size; + } + + // Download parts concurrently with a semaphore limiting to CONCURRENCY + let semaphore = Arc::new(Semaphore::new(CONCURRENCY)); + let client = &self.client; + let bucket = &self.bucket; + let dest_path_owned = dest_path.to_string(); + let key_owned = key.to_string(); + let progress = Arc::new(std::sync::Mutex::new((0u64, progress_fn))); + + let mut handles = Vec::with_capacity(parts_plan.len()); + for (off, size) in parts_plan { + let sem = semaphore.clone(); + let client = client.clone(); + let bucket = bucket.clone(); + let key = key_owned.clone(); + let dp = dest_path_owned.clone(); + let progress = progress.clone(); + + handles.push(tokio::spawn(async move { + let _permit = sem + .acquire() + .await + .map_err(|e| format!("semaphore error: {}", e))?; + + let end = off + size - 1; + let range = format!("bytes={}-{}", off, end); + + let get_resp = client + .get_object() + .bucket(&bucket) + .key(&key) + .range(&range) + .send() + .await + .map_err(|e| format!("failed to get object {} range {}: {}", key, range, e))?; + + let body = get_resp + .body + .collect() + .await + .map_err(|e| format!("failed to read body: {}", e))?; + let bytes = body.into_bytes(); + + // Write at the correct offset (like Go's WriteAt) + let mut file = tokio::fs::OpenOptions::new() + .write(true) + .open(&dp) + .await + .map_err(|e| format!("failed to open dest file {}: {}", dp, e))?; + file.seek(std::io::SeekFrom::Start(off)) + .await + .map_err(|e| format!("failed to seek to offset {}: {}", off, e))?; + file.write_all(&bytes) + .await + .map_err(|e| format!("failed to write to {}: {}", dp, e))?; + + // Report progress + { + let mut guard = progress.lock().unwrap(); + guard.0 += bytes.len() as u64; + let downloaded = guard.0; + let pct = if file_size > 0 { + (downloaded as f32 * 100.0) / file_size as f32 + } else { + 100.0 + }; + (guard.1)(downloaded as i64, pct); + } + + Ok::<_, String>(()) + })); + } + + // Wait for all download tasks + for handle in handles { + handle + .await + .map_err(|e| format!("download task panicked: {}", e))??; + } + + Ok(file_size) + } + + pub async fn read_range(&self, key: &str, offset: u64, size: usize) -> Result, String> { + let end = offset + (size as u64).saturating_sub(1); + let range = format!("bytes={}-{}", offset, end); + let resp = self + .client + .get_object() + .bucket(&self.bucket) + .key(key) + .range(&range) + .send() + .await + .map_err(|e| format!("failed to get object {} range {}: {}", key, range, e))?; + + let body = resp + .body + .collect() + .await + .map_err(|e| format!("failed to read object {} body: {}", key, e))?; + Ok(body.into_bytes().to_vec()) + } + + /// Delete a file from S3. + pub async fn delete_file(&self, key: &str) -> Result<(), String> { + self.client + .delete_object() + .bucket(&self.bucket) + .key(key) + .send() + .await + .map_err(|e| format!("failed to delete object {}: {}", key, e))?; + Ok(()) + } + + pub fn delete_file_blocking(&self, key: &str) -> Result<(), String> { + let client = self.client.clone(); + let bucket = self.bucket.clone(); + let key = key.to_string(); + block_on_tier_future(async move { + client + .delete_object() + .bucket(&bucket) + .key(&key) + .send() + .await + .map_err(|e| format!("failed to delete object {}: {}", key, e))?; + Ok(()) + }) + } + + pub fn read_range_blocking( + &self, + key: &str, + offset: u64, + size: usize, + ) -> Result, String> { + let client = self.client.clone(); + let bucket = self.bucket.clone(); + let key = key.to_string(); + block_on_tier_future(async move { + let end = offset + (size as u64).saturating_sub(1); + let range = format!("bytes={}-{}", offset, end); + let resp = client + .get_object() + .bucket(&bucket) + .key(&key) + .range(&range) + .send() + .await + .map_err(|e| format!("failed to get object {} range {}: {}", key, range, e))?; + + let body = resp + .body + .collect() + .await + .map_err(|e| format!("failed to read object {} body: {}", key, e))?; + Ok(body.into_bytes().to_vec()) + }) + } +} + +/// Parse a backend name like "s3" or "s3.default" into (backend_type, backend_id). +/// Matches Go's `BackendNameToTypeId`. +pub fn backend_name_to_type_id(backend_name: &str) -> (String, String) { + let parts: Vec<&str> = backend_name.split('.').collect(); + match parts.len() { + 1 => (backend_name.to_string(), "default".to_string()), + 2 => (parts[0].to_string(), parts[1].to_string()), + _ => (String::new(), String::new()), + } +} + +/// A registry of configured S3 tier backends, keyed by backend name (e.g., "s3.default"). +#[derive(Default)] +pub struct S3TierRegistry { + backends: HashMap>, +} + +impl S3TierRegistry { + pub fn new() -> Self { + Self { + backends: HashMap::new(), + } + } + + /// Register a backend with the given name. + pub fn register(&mut self, name: String, backend: S3TierBackend) { + self.backends.insert(name, Arc::new(backend)); + } + + /// Look up a backend by name. + pub fn get(&self, name: &str) -> Option> { + self.backends.get(name).cloned() + } + + /// List all registered backend names. + pub fn names(&self) -> Vec { + self.backends.keys().cloned().collect() + } + + pub fn clear(&mut self) { + self.backends.clear(); + } +} + +static GLOBAL_S3_TIER_REGISTRY: OnceLock> = OnceLock::new(); + +pub fn global_s3_tier_registry() -> &'static RwLock { + GLOBAL_S3_TIER_REGISTRY.get_or_init(|| RwLock::new(S3TierRegistry::new())) +} + +fn block_on_tier_future(future: F) -> Result +where + F: Future> + Send + 'static, + T: Send + 'static, +{ + std::thread::spawn(move || { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .map_err(|e| format!("failed to build tokio runtime: {}", e))?; + runtime.block_on(future) + }) + .join() + .map_err(|_| "tier runtime thread panicked".to_string())? +} diff --git a/seaweed-volume/src/security.rs b/seaweed-volume/src/security.rs new file mode 100644 index 000000000..e33350926 --- /dev/null +++ b/seaweed-volume/src/security.rs @@ -0,0 +1,481 @@ +//! Security: JWT validation and IP whitelist checking. +//! +//! Matches Go's security/guard.go and security/jwt.go. +//! - Guard: combines whitelist IP checking with JWT token validation +//! - JWT: HS256 HMAC signing with file-id claims + +pub mod tls; + +use std::collections::HashSet; +use std::net::IpAddr; +use std::time::{SystemTime, UNIX_EPOCH}; + +use jsonwebtoken::{decode, encode, Algorithm, DecodingKey, EncodingKey, Header, Validation}; +use serde::{Deserialize, Serialize}; + +// ============================================================================ +// JWT Claims +// ============================================================================ + +/// Claims for volume server file access tokens. +/// Matches Go's `SeaweedFileIdClaims`. +#[derive(Debug, Serialize, Deserialize)] +pub struct FileIdClaims { + /// File ID this token grants access to (e.g., "3,01637037d6"). + #[serde(skip_serializing_if = "Option::is_none")] + pub fid: Option, + + /// Expiration time (Unix timestamp). + #[serde(skip_serializing_if = "Option::is_none")] + pub exp: Option, + + /// Not before (Unix timestamp). + #[serde(skip_serializing_if = "Option::is_none")] + pub nbf: Option, +} + +/// Signing key wrapper (empty = security disabled). +#[derive(Clone)] +pub struct SigningKey(pub Vec); + +impl SigningKey { + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn from_string(s: &str) -> Self { + SigningKey(s.as_bytes().to_vec()) + } +} + +/// Generate a JWT token for file access. +pub fn gen_jwt( + signing_key: &SigningKey, + expires_after_sec: i64, + file_id: &str, +) -> Result { + if signing_key.is_empty() { + return Err(JwtError::NoSigningKey); + } + + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + let claims = FileIdClaims { + fid: Some(file_id.to_string()), + exp: if expires_after_sec > 0 { + Some(now + expires_after_sec as u64) + } else { + None + }, + nbf: None, + }; + + let token = encode( + &Header::new(Algorithm::HS256), + &claims, + &EncodingKey::from_secret(&signing_key.0), + )?; + + Ok(token) +} + +/// Decode and validate a JWT token. +pub fn decode_jwt(signing_key: &SigningKey, token: &str) -> Result { + if signing_key.is_empty() { + return Err(JwtError::NoSigningKey); + } + + let mut validation = Validation::new(Algorithm::HS256); + // Match Go behavior: tokens without exp are accepted (Go's jwt-go does not require exp) + // But if exp IS present, it must be valid (not expired). + validation.required_spec_claims.clear(); + validation.validate_exp = true; + // Go's jwt-go/v5 validates nbf when present + validation.validate_nbf = true; + validation.leeway = 0; + + let data = decode::( + token, + &DecodingKey::from_secret(&signing_key.0), + &validation, + )?; + + Ok(data.claims) +} + +// ============================================================================ +// Guard +// ============================================================================ + +/// Security guard: IP whitelist + JWT token validation. +pub struct Guard { + whitelist_ips: HashSet, + whitelist_cidrs: Vec<(IpAddr, u8)>, // (network, prefix_len) + pub signing_key: SigningKey, + pub expires_after_sec: i64, + pub read_signing_key: SigningKey, + pub read_expires_after_sec: i64, + /// Combined flag: true when whitelist is non-empty OR signing key is present. + /// Matches Go's `isWriteActive = !isEmptyWhiteList || len(SigningKey) != 0`. + is_write_active: bool, +} + +impl Guard { + pub fn new( + whitelist: &[String], + signing_key: SigningKey, + expires_after_sec: i64, + read_signing_key: SigningKey, + read_expires_after_sec: i64, + ) -> Self { + let mut guard = Guard { + whitelist_ips: HashSet::new(), + whitelist_cidrs: Vec::new(), + signing_key, + expires_after_sec, + read_signing_key, + read_expires_after_sec, + is_write_active: false, + }; + guard.update_whitelist(whitelist); + guard + } + + /// Update the IP whitelist. + pub fn update_whitelist(&mut self, entries: &[String]) { + self.whitelist_ips.clear(); + self.whitelist_cidrs.clear(); + + for entry in entries { + let entry = entry.trim(); + if entry.is_empty() { + continue; + } + if entry.contains('/') { + // CIDR range + if let Some((ip, prefix)) = parse_cidr(entry) { + self.whitelist_cidrs.push((ip, prefix)); + } else { + tracing::error!("Parse CIDR {} in whitelist failed", entry); + } + } else { + // Exact IP/hostname + self.whitelist_ips.insert(entry.to_string()); + } + } + + // Match Go: isWriteActive = !isEmptyWhiteList || len(SigningKey) != 0 + let is_empty_whitelist = self.whitelist_ips.is_empty() && self.whitelist_cidrs.is_empty(); + self.is_write_active = !is_empty_whitelist || !self.signing_key.is_empty(); + } + + /// Check if a remote IP is in the whitelist. + /// Returns true if write security is inactive (no whitelist and no signing key), + /// if the whitelist is empty, or if the IP matches. + pub fn check_whitelist(&self, remote_addr: &str) -> bool { + if !self.is_write_active { + return true; + } + if self.whitelist_ips.is_empty() && self.whitelist_cidrs.is_empty() { + return true; + } + + let host = extract_host(remote_addr); + + // Check exact match + if self.whitelist_ips.contains(&host) { + return true; + } + + // Check CIDR ranges + if let Ok(ip) = host.parse::() { + for &(ref network, prefix_len) in &self.whitelist_cidrs { + if ip_in_cidr(&ip, network, prefix_len) { + return true; + } + } + } + + false + } + + /// Check if a read signing key is configured. + pub fn has_read_signing_key(&self) -> bool { + !self.read_signing_key.is_empty() + } + + /// Validate a request's JWT token. + /// `is_write` determines which signing key to use. + /// Returns Ok(()) if valid, or if security is disabled. + pub fn check_jwt(&self, token: Option<&str>, is_write: bool) -> Result<(), JwtError> { + let key = if is_write { + &self.signing_key + } else { + &self.read_signing_key + }; + + if key.is_empty() { + return Ok(()); // Security disabled for this operation type + } + + let token = token.ok_or(JwtError::MissingToken)?; + decode_jwt(key, token)?; + Ok(()) + } + + /// Check JWT and validate the file ID claim matches. + pub fn check_jwt_for_file( + &self, + token: Option<&str>, + expected_fid: &str, + is_write: bool, + ) -> Result<(), JwtError> { + let key = if is_write { + &self.signing_key + } else { + &self.read_signing_key + }; + + if key.is_empty() { + return Ok(()); + } + + let token = token.ok_or(JwtError::MissingToken)?; + let claims = decode_jwt(key, token)?; + + match claims.fid { + None => { + return Err(JwtError::MissingFileIdClaim); + } + Some(ref fid) if fid != expected_fid => { + return Err(JwtError::FileIdMismatch { + expected: expected_fid.to_string(), + got: fid.to_string(), + }); + } + _ => {} + } + + Ok(()) + } +} + +// ============================================================================ +// Helpers +// ============================================================================ + +/// Extract host from "host:port" or "[::1]:port" format. +fn extract_host(addr: &str) -> String { + // Handle IPv6 with brackets + if addr.starts_with('[') { + if let Some(end) = addr.find(']') { + return addr[1..end].to_string(); + } + } + // Handle host:port + if let Some(pos) = addr.rfind(':') { + return addr[..pos].to_string(); + } + addr.to_string() +} + +/// Parse CIDR notation "192.168.1.0/24" into (IpAddr, prefix_len). +fn parse_cidr(cidr: &str) -> Option<(IpAddr, u8)> { + let parts: Vec<&str> = cidr.split('/').collect(); + if parts.len() != 2 { + return None; + } + let ip: IpAddr = parts[0].parse().ok()?; + let prefix: u8 = parts[1].parse().ok()?; + Some((ip, prefix)) +} + +/// Check if an IP is within a CIDR range. +fn ip_in_cidr(ip: &IpAddr, network: &IpAddr, prefix_len: u8) -> bool { + match (ip, network) { + (IpAddr::V4(ip), IpAddr::V4(net)) => { + let ip_bits = u32::from(*ip); + let net_bits = u32::from(*net); + let mask = if prefix_len == 0 { + 0 + } else if prefix_len >= 32 { + u32::MAX + } else { + u32::MAX << (32 - prefix_len) + }; + (ip_bits & mask) == (net_bits & mask) + } + (IpAddr::V6(ip), IpAddr::V6(net)) => { + let ip_bits = u128::from(*ip); + let net_bits = u128::from(*net); + let mask = if prefix_len == 0 { + 0 + } else if prefix_len >= 128 { + u128::MAX + } else { + u128::MAX << (128 - prefix_len) + }; + (ip_bits & mask) == (net_bits & mask) + } + _ => false, // V4/V6 mismatch + } +} + +// ============================================================================ +// Errors +// ============================================================================ + +#[derive(Debug, thiserror::Error)] +pub enum JwtError { + #[error("no signing key configured")] + NoSigningKey, + + #[error("missing JWT token")] + MissingToken, + + #[error("JWT error: {0}")] + Jwt(#[from] jsonwebtoken::errors::Error), + + #[error("JWT token missing required fid claim")] + MissingFileIdClaim, + + #[error("file ID mismatch: expected {expected}, got {got}")] + FileIdMismatch { expected: String, got: String }, +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_jwt_round_trip() { + let key = SigningKey::from_string("test-secret-key"); + let token = gen_jwt(&key, 3600, "3,01637037d6").unwrap(); + let claims = decode_jwt(&key, &token).unwrap(); + assert_eq!(claims.fid, Some("3,01637037d6".to_string())); + } + + #[test] + fn test_jwt_no_signing_key() { + let key = SigningKey(vec![]); + assert!(gen_jwt(&key, 3600, "1,abc").is_err()); + } + + #[test] + fn test_jwt_invalid_token() { + let key = SigningKey::from_string("secret"); + let result = decode_jwt(&key, "invalid.token.here"); + assert!(result.is_err()); + } + + #[test] + fn test_jwt_wrong_key() { + let key1 = SigningKey::from_string("secret1"); + let key2 = SigningKey::from_string("secret2"); + let token = gen_jwt(&key1, 3600, "1,abc").unwrap(); + assert!(decode_jwt(&key2, &token).is_err()); + } + + #[test] + fn test_guard_empty_whitelist() { + let guard = Guard::new(&[], SigningKey(vec![]), 0, SigningKey(vec![]), 0); + assert!(guard.check_whitelist("192.168.1.1:8080")); + } + + #[test] + fn test_guard_whitelist_exact() { + let guard = Guard::new( + &["192.168.1.1".to_string(), "10.0.0.1".to_string()], + SigningKey(vec![]), + 0, + SigningKey(vec![]), + 0, + ); + assert!(guard.check_whitelist("192.168.1.1:8080")); + assert!(guard.check_whitelist("10.0.0.1:1234")); + assert!(!guard.check_whitelist("172.16.0.1:8080")); + } + + #[test] + fn test_guard_whitelist_cidr() { + let guard = Guard::new( + &["10.0.0.0/8".to_string()], + SigningKey(vec![]), + 0, + SigningKey(vec![]), + 0, + ); + assert!(guard.check_whitelist("10.1.2.3:8080")); + assert!(guard.check_whitelist("10.255.255.255:80")); + assert!(!guard.check_whitelist("11.0.0.1:80")); + } + + #[test] + fn test_guard_check_jwt_disabled() { + let guard = Guard::new(&[], SigningKey(vec![]), 0, SigningKey(vec![]), 0); + // No signing key = security disabled + assert!(guard.check_jwt(None, true).is_ok()); + assert!(guard.check_jwt(None, false).is_ok()); + } + + #[test] + fn test_guard_check_jwt_enabled() { + let key = SigningKey::from_string("write-secret"); + let read_key = SigningKey::from_string("read-secret"); + let guard = Guard::new(&[], key.clone(), 3600, read_key.clone(), 3600); + + // Missing token + assert!(guard.check_jwt(None, true).is_err()); + + // Valid write token + let token = gen_jwt(&key, 3600, "1,abc").unwrap(); + assert!(guard.check_jwt(Some(&token), true).is_ok()); + + // Write token for read should fail (different key) + assert!(guard.check_jwt(Some(&token), false).is_err()); + + // Valid read token + let read_token = gen_jwt(&read_key, 3600, "1,abc").unwrap(); + assert!(guard.check_jwt(Some(&read_token), false).is_ok()); + } + + #[test] + fn test_guard_check_jwt_file_id() { + let key = SigningKey::from_string("secret"); + let guard = Guard::new(&[], key.clone(), 3600, SigningKey(vec![]), 0); + + let token = gen_jwt(&key, 3600, "3,01637037d6").unwrap(); + + // Correct file ID + assert!(guard + .check_jwt_for_file(Some(&token), "3,01637037d6", true) + .is_ok()); + + // Wrong file ID + let err = guard.check_jwt_for_file(Some(&token), "4,deadbeef", true); + assert!(matches!(err, Err(JwtError::FileIdMismatch { .. }))); + } + + #[test] + fn test_extract_host() { + assert_eq!(extract_host("192.168.1.1:8080"), "192.168.1.1"); + assert_eq!(extract_host("[::1]:8080"), "::1"); + assert_eq!(extract_host("localhost"), "localhost"); + } + + #[test] + fn test_ip_in_cidr() { + let net: IpAddr = "10.0.0.0".parse().unwrap(); + let ip1: IpAddr = "10.1.2.3".parse().unwrap(); + let ip2: IpAddr = "11.0.0.1".parse().unwrap(); + assert!(ip_in_cidr(&ip1, &net, 8)); + assert!(!ip_in_cidr(&ip2, &net, 8)); + } +} diff --git a/seaweed-volume/src/security/tls.rs b/seaweed-volume/src/security/tls.rs new file mode 100644 index 000000000..8f8cb2403 --- /dev/null +++ b/seaweed-volume/src/security/tls.rs @@ -0,0 +1,437 @@ +use std::collections::HashSet; +use std::fmt; +use std::sync::Arc; + +use rustls::client::danger::HandshakeSignatureValid; +use rustls::crypto::aws_lc_rs; +use rustls::crypto::CryptoProvider; +use rustls::pki_types::UnixTime; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use rustls::server::danger::{ClientCertVerified, ClientCertVerifier}; +use rustls::server::WebPkiClientVerifier; +use rustls::{ + CipherSuite, DigitallySignedStruct, DistinguishedName, RootCertStore, ServerConfig, + SignatureScheme, SupportedCipherSuite, SupportedProtocolVersion, +}; +use x509_parser::prelude::{FromDer, X509Certificate}; + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct TlsPolicy { + pub min_version: String, + pub max_version: String, + pub cipher_suites: String, +} + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct GrpcClientAuthPolicy { + pub allowed_common_names: Vec, + pub allowed_wildcard_domain: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TlsPolicyError(String); + +impl fmt::Display for TlsPolicyError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +impl std::error::Error for TlsPolicyError {} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +enum GoTlsVersion { + Ssl3, + Tls10, + Tls11, + Tls12, + Tls13, +} + +#[derive(Debug)] +struct CommonNameVerifier { + inner: Arc, + allowed_common_names: HashSet, + allowed_wildcard_domain: String, +} + +impl ClientCertVerifier for CommonNameVerifier { + fn offer_client_auth(&self) -> bool { + self.inner.offer_client_auth() + } + + fn client_auth_mandatory(&self) -> bool { + self.inner.client_auth_mandatory() + } + + fn root_hint_subjects(&self) -> &[DistinguishedName] { + self.inner.root_hint_subjects() + } + + fn verify_client_cert( + &self, + end_entity: &CertificateDer<'_>, + intermediates: &[CertificateDer<'_>], + now: UnixTime, + ) -> Result { + self.inner + .verify_client_cert(end_entity, intermediates, now)?; + let common_name = parse_common_name(end_entity).map_err(|e| { + rustls::Error::General(format!( + "parse client certificate common name failed: {}", + e + )) + })?; + if common_name_is_allowed( + &common_name, + &self.allowed_common_names, + &self.allowed_wildcard_domain, + ) { + return Ok(ClientCertVerified::assertion()); + } + Err(rustls::Error::General(format!( + "Authenticate: invalid subject client common name: {}", + common_name + ))) + } + + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &CertificateDer<'_>, + dss: &DigitallySignedStruct, + ) -> Result { + self.inner.verify_tls12_signature(message, cert, dss) + } + + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &CertificateDer<'_>, + dss: &DigitallySignedStruct, + ) -> Result { + self.inner.verify_tls13_signature(message, cert, dss) + } + + fn supported_verify_schemes(&self) -> Vec { + self.inner.supported_verify_schemes() + } +} + +pub fn build_rustls_server_config( + cert_path: &str, + key_path: &str, + ca_path: &str, + policy: &TlsPolicy, +) -> Result { + build_rustls_server_config_with_client_auth(cert_path, key_path, ca_path, policy, None) +} + +pub fn build_rustls_server_config_with_grpc_client_auth( + cert_path: &str, + key_path: &str, + ca_path: &str, + policy: &TlsPolicy, + client_auth_policy: &GrpcClientAuthPolicy, +) -> Result { + build_rustls_server_config_with_client_auth( + cert_path, + key_path, + ca_path, + policy, + Some(client_auth_policy), + ) +} + +fn build_rustls_server_config_with_client_auth( + cert_path: &str, + key_path: &str, + ca_path: &str, + policy: &TlsPolicy, + client_auth_policy: Option<&GrpcClientAuthPolicy>, +) -> Result { + let cert_chain = read_cert_chain(cert_path)?; + let private_key = read_private_key(key_path)?; + let provider = build_crypto_provider(policy)?; + let versions = build_supported_versions(policy)?; + + let builder = ServerConfig::builder_with_provider(provider.clone()) + .with_protocol_versions(&versions) + .map_err(|e| TlsPolicyError(format!("invalid TLS version policy: {}", e)))?; + + let builder = if ca_path.is_empty() { + builder.with_no_client_auth() + } else { + let roots = read_root_store(ca_path)?; + let verifier = + WebPkiClientVerifier::builder_with_provider(Arc::new(roots), provider.clone()) + .build() + .map_err(|e| TlsPolicyError(format!("build client verifier failed: {}", e)))?; + let verifier: Arc = if let Some(client_auth_policy) = + client_auth_policy.filter(|policy| { + !policy.allowed_common_names.is_empty() + || !policy.allowed_wildcard_domain.is_empty() + }) { + Arc::new(CommonNameVerifier { + inner: verifier, + allowed_common_names: client_auth_policy + .allowed_common_names + .iter() + .cloned() + .collect(), + allowed_wildcard_domain: client_auth_policy.allowed_wildcard_domain.clone(), + }) + } else { + verifier + }; + builder.with_client_cert_verifier(verifier) + }; + + builder + .with_single_cert(cert_chain, private_key) + .map_err(|e| TlsPolicyError(format!("build rustls server config failed: {}", e))) +} + +fn read_cert_chain(cert_path: &str) -> Result>, TlsPolicyError> { + let cert_pem = std::fs::read(cert_path).map_err(|e| { + TlsPolicyError(format!( + "Failed to read TLS cert file '{}': {}", + cert_path, e + )) + })?; + rustls_pemfile::certs(&mut &cert_pem[..]) + .collect::, _>>() + .map_err(|e| { + TlsPolicyError(format!( + "Failed to parse TLS cert PEM '{}': {}", + cert_path, e + )) + }) +} + +fn read_private_key(key_path: &str) -> Result, TlsPolicyError> { + let key_pem = std::fs::read(key_path).map_err(|e| { + TlsPolicyError(format!("Failed to read TLS key file '{}': {}", key_path, e)) + })?; + rustls_pemfile::private_key(&mut &key_pem[..]) + .map_err(|e| TlsPolicyError(format!("Failed to parse TLS key PEM '{}': {}", key_path, e)))? + .ok_or_else(|| TlsPolicyError(format!("No private key found in '{}'", key_path))) +} + +fn read_root_store(ca_path: &str) -> Result { + let ca_pem = std::fs::read(ca_path) + .map_err(|e| TlsPolicyError(format!("Failed to read TLS CA file '{}': {}", ca_path, e)))?; + let ca_certs = rustls_pemfile::certs(&mut &ca_pem[..]) + .collect::, _>>() + .map_err(|e| TlsPolicyError(format!("Failed to parse TLS CA PEM '{}': {}", ca_path, e)))?; + let mut roots = RootCertStore::empty(); + for cert in ca_certs { + roots + .add(cert) + .map_err(|e| TlsPolicyError(format!("Failed to add CA cert '{}': {}", ca_path, e)))?; + } + Ok(roots) +} + +fn build_crypto_provider(policy: &TlsPolicy) -> Result, TlsPolicyError> { + let mut provider = aws_lc_rs::default_provider(); + let cipher_suites = parse_cipher_suites(&provider.cipher_suites, &policy.cipher_suites)?; + if !cipher_suites.is_empty() { + provider.cipher_suites = cipher_suites; + } + Ok(Arc::new(provider)) +} + +pub fn build_supported_versions( + policy: &TlsPolicy, +) -> Result, TlsPolicyError> { + let min_version = parse_go_tls_version(&policy.min_version)?; + let max_version = parse_go_tls_version(&policy.max_version)?; + let versions = [&rustls::version::TLS13, &rustls::version::TLS12] + .into_iter() + .filter(|version| { + let current = go_tls_version_for_supported(version); + min_version.map(|min| current >= min).unwrap_or(true) + && max_version.map(|max| current <= max).unwrap_or(true) + }) + .collect::>(); + + if versions.is_empty() { + return Err(TlsPolicyError(format!( + "TLS version range min='{}' max='{}' is unsupported by rustls", + policy.min_version, policy.max_version + ))); + } + + Ok(versions) +} + +fn parse_go_tls_version(value: &str) -> Result, TlsPolicyError> { + match value.trim() { + "" => Ok(None), + "SSLv3" => Ok(Some(GoTlsVersion::Ssl3)), + "TLS 1.0" => Ok(Some(GoTlsVersion::Tls10)), + "TLS 1.1" => Ok(Some(GoTlsVersion::Tls11)), + "TLS 1.2" => Ok(Some(GoTlsVersion::Tls12)), + "TLS 1.3" => Ok(Some(GoTlsVersion::Tls13)), + other => Err(TlsPolicyError(format!("invalid TLS version {}", other))), + } +} + +fn parse_cipher_suites( + available: &[SupportedCipherSuite], + value: &str, +) -> Result, TlsPolicyError> { + let trimmed = value.trim(); + if trimmed.is_empty() { + return Ok(Vec::new()); + } + + trimmed + .split(',') + .map(|name| { + let suite = parse_cipher_suite_name(name.trim())?; + available + .iter() + .copied() + .find(|candidate| candidate.suite() == suite) + .ok_or_else(|| { + TlsPolicyError(format!( + "TLS cipher suite '{}' is unsupported by the Rust implementation", + name.trim() + )) + }) + }) + .collect() +} + +fn parse_cipher_suite_name(value: &str) -> Result { + match value { + "TLS_AES_128_GCM_SHA256" => Ok(CipherSuite::TLS13_AES_128_GCM_SHA256), + "TLS_AES_256_GCM_SHA384" => Ok(CipherSuite::TLS13_AES_256_GCM_SHA384), + "TLS_CHACHA20_POLY1305_SHA256" => Ok(CipherSuite::TLS13_CHACHA20_POLY1305_SHA256), + "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256" => { + Ok(CipherSuite::TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256) + } + "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384" => { + Ok(CipherSuite::TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384) + } + "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256" => { + Ok(CipherSuite::TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256) + } + "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" => { + Ok(CipherSuite::TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) + } + "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384" => { + Ok(CipherSuite::TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) + } + "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256" => { + Ok(CipherSuite::TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256) + } + other => Err(TlsPolicyError(format!( + "TLS cipher suite '{}' is unsupported by the Rust implementation", + other + ))), + } +} + +fn parse_common_name(cert: &CertificateDer<'_>) -> Result { + let (_, certificate) = X509Certificate::from_der(cert.as_ref()) + .map_err(|e| TlsPolicyError(format!("parse X.509 certificate failed: {}", e)))?; + let common_name = certificate + .subject() + .iter_common_name() + .next() + .and_then(|common_name| common_name.as_str().ok()) + .map(str::to_string); + match common_name { + Some(common_name) => Ok(common_name), + None => Ok(String::new()), + } +} + +fn common_name_is_allowed( + common_name: &str, + allowed_common_names: &HashSet, + allowed_wildcard_domain: &str, +) -> bool { + (!allowed_wildcard_domain.is_empty() && common_name.ends_with(allowed_wildcard_domain)) + || allowed_common_names.contains(common_name) +} + +fn go_tls_version_for_supported(version: &SupportedProtocolVersion) -> GoTlsVersion { + match version.version { + rustls::ProtocolVersion::TLSv1_2 => GoTlsVersion::Tls12, + rustls::ProtocolVersion::TLSv1_3 => GoTlsVersion::Tls13, + _ => unreachable!("rustls only exposes TLS 1.2 and 1.3"), + } +} + +#[cfg(test)] +mod tests { + use super::{build_supported_versions, common_name_is_allowed, parse_cipher_suites, TlsPolicy}; + use rustls::crypto::aws_lc_rs; + use std::collections::HashSet; + + #[test] + fn test_build_supported_versions_defaults_to_tls12_and_tls13() { + let versions = build_supported_versions(&TlsPolicy::default()).unwrap(); + assert_eq!( + versions, + vec![&rustls::version::TLS13, &rustls::version::TLS12] + ); + } + + #[test] + fn test_build_supported_versions_filters_to_tls13() { + let versions = build_supported_versions(&TlsPolicy { + min_version: "TLS 1.3".to_string(), + max_version: "TLS 1.3".to_string(), + cipher_suites: String::new(), + }) + .unwrap(); + assert_eq!(versions, vec![&rustls::version::TLS13]); + } + + #[test] + fn test_build_supported_versions_rejects_unsupported_legacy_range() { + let err = build_supported_versions(&TlsPolicy { + min_version: "TLS 1.0".to_string(), + max_version: "TLS 1.1".to_string(), + cipher_suites: String::new(), + }) + .unwrap_err(); + assert!(err.to_string().contains("unsupported by rustls")); + } + + #[test] + fn test_parse_cipher_suites_accepts_go_names() { + let cipher_suites = parse_cipher_suites( + &aws_lc_rs::default_provider().cipher_suites, + "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_AES_128_GCM_SHA256", + ) + .unwrap(); + assert_eq!(cipher_suites.len(), 2); + } + + #[test] + fn test_common_name_is_allowed_matches_exact_and_wildcard() { + let allowed_common_names = + HashSet::from([String::from("volume-a.internal"), String::from("worker-7")]); + assert!(common_name_is_allowed( + "volume-a.internal", + &allowed_common_names, + "", + )); + assert!(common_name_is_allowed( + "node.prod.example.com", + &allowed_common_names, + ".example.com", + )); + assert!(!common_name_is_allowed( + "node.prod.other.net", + &allowed_common_names, + ".example.com", + )); + } +} diff --git a/seaweed-volume/src/server/debug.rs b/seaweed-volume/src/server/debug.rs new file mode 100644 index 000000000..dd1b69cf1 --- /dev/null +++ b/seaweed-volume/src/server/debug.rs @@ -0,0 +1,159 @@ +use axum::body::Body; +use axum::extract::Query; +use axum::http::{header, StatusCode}; +use axum::response::{IntoResponse, Response}; +use axum::routing::{any, get}; +use axum::Router; +use pprof::protos::Message; +use serde::Deserialize; + +#[derive(Deserialize, Default)] +struct ProfileQuery { + seconds: Option, +} + +pub fn build_debug_router() -> Router { + Router::new() + .route("/debug/pprof/", get(pprof_index_handler)) + .route("/debug/pprof/cmdline", get(pprof_cmdline_handler)) + .route("/debug/pprof/profile", get(pprof_profile_handler)) + .route("/debug/pprof/symbol", any(pprof_symbol_handler)) + .route("/debug/pprof/trace", get(pprof_trace_handler)) +} + +async fn pprof_index_handler() -> Response { + let body = concat!( + "/debug/pprof/", + "cmdline
", + "profile
", + "symbol
", + "trace
", + "", + ); + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "text/html; charset=utf-8")], + body, + ) + .into_response() +} + +async fn pprof_cmdline_handler() -> Response { + let body = std::env::args().collect::>().join("\0"); + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "text/plain; charset=utf-8")], + body, + ) + .into_response() +} + +async fn pprof_profile_handler(Query(query): Query) -> Response { + let seconds = query.seconds.unwrap_or(30).clamp(1, 300); + let guard = match pprof::ProfilerGuard::new(100) { + Ok(guard) => guard, + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to start profiler: {}", e), + ) + .into_response(); + } + }; + + tokio::time::sleep(std::time::Duration::from_secs(seconds)).await; + + let report = match guard.report().build() { + Ok(report) => report, + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to build profile report: {}", e), + ) + .into_response(); + } + }; + + let profile = match report.pprof() { + Ok(profile) => profile, + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to encode profile: {}", e), + ) + .into_response(); + } + }; + + let mut bytes = Vec::new(); + if let Err(e) = profile.encode(&mut bytes) { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to serialize profile: {}", e), + ) + .into_response(); + } + + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/octet-stream")], + bytes, + ) + .into_response() +} + +async fn pprof_symbol_handler() -> Response { + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "text/plain; charset=utf-8")], + "num_symbols: 0\n", + ) + .into_response() +} + +async fn pprof_trace_handler(Query(query): Query) -> Response { + let seconds = query.seconds.unwrap_or(1).clamp(1, 30); + tokio::time::sleep(std::time::Duration::from_secs(seconds)).await; + Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(Body::from(Vec::::new())) + .unwrap() +} + +#[cfg(test)] +mod tests { + use super::*; + use axum::http::Request; + use tower::ServiceExt; + + #[tokio::test] + async fn test_debug_index_route() { + let app = build_debug_router(); + let response = app + .oneshot( + Request::builder() + .uri("/debug/pprof/") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + } + + #[tokio::test] + async fn test_debug_cmdline_route() { + let app = build_debug_router(); + let response = app + .oneshot( + Request::builder() + .uri("/debug/pprof/cmdline") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + } +} diff --git a/seaweed-volume/src/server/favicon.ico b/seaweed-volume/src/server/favicon.ico new file mode 100644 index 000000000..05ddc02d5 Binary files /dev/null and b/seaweed-volume/src/server/favicon.ico differ diff --git a/seaweed-volume/src/server/grpc_client.rs b/seaweed-volume/src/server/grpc_client.rs new file mode 100644 index 000000000..2eee9d5dd --- /dev/null +++ b/seaweed-volume/src/server/grpc_client.rs @@ -0,0 +1,206 @@ +use std::error::Error; +use std::fmt; +use std::time::Duration; + +use hyper::http::Uri; +use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity}; + +use crate::config::VolumeServerConfig; + +pub const GRPC_MAX_MESSAGE_SIZE: usize = 1 << 30; +const GRPC_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(60); +const GRPC_KEEPALIVE_TIMEOUT: Duration = Duration::from_secs(20); +const GRPC_INITIAL_WINDOW_SIZE: u32 = 16 * 1024 * 1024; + +#[derive(Clone, Debug)] +pub struct OutgoingGrpcTlsConfig { + cert_pem: String, + key_pem: String, + ca_pem: String, +} + +#[derive(Debug)] +pub struct GrpcClientError(String); + +impl fmt::Display for GrpcClientError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +impl Error for GrpcClientError {} + +pub fn load_outgoing_grpc_tls( + config: &VolumeServerConfig, +) -> Result, GrpcClientError> { + if config.grpc_cert_file.is_empty() + || config.grpc_key_file.is_empty() + || config.grpc_ca_file.is_empty() + { + return Ok(None); + } + + let cert_pem = std::fs::read_to_string(&config.grpc_cert_file).map_err(|e| { + GrpcClientError(format!( + "Failed to read outgoing gRPC cert '{}': {}", + config.grpc_cert_file, e + )) + })?; + let key_pem = std::fs::read_to_string(&config.grpc_key_file).map_err(|e| { + GrpcClientError(format!( + "Failed to read outgoing gRPC key '{}': {}", + config.grpc_key_file, e + )) + })?; + let ca_pem = std::fs::read_to_string(&config.grpc_ca_file).map_err(|e| { + GrpcClientError(format!( + "Failed to read outgoing gRPC CA '{}': {}", + config.grpc_ca_file, e + )) + })?; + + Ok(Some(OutgoingGrpcTlsConfig { + cert_pem, + key_pem, + ca_pem, + })) +} + +pub fn grpc_endpoint_uri(grpc_host_port: &str, tls: Option<&OutgoingGrpcTlsConfig>) -> String { + let scheme = if tls.is_some() { "https" } else { "http" }; + format!("{}://{}", scheme, grpc_host_port) +} + +pub fn build_grpc_endpoint( + grpc_host_port: &str, + tls: Option<&OutgoingGrpcTlsConfig>, +) -> Result { + let uri = grpc_endpoint_uri(grpc_host_port, tls); + let mut endpoint = Channel::from_shared(uri.clone()) + .map_err(|e| GrpcClientError(format!("invalid gRPC endpoint {}: {}", uri, e)))? + .http2_keep_alive_interval(GRPC_KEEPALIVE_INTERVAL) + .keep_alive_timeout(GRPC_KEEPALIVE_TIMEOUT) + .keep_alive_while_idle(false) + .initial_stream_window_size(Some(GRPC_INITIAL_WINDOW_SIZE)) + .initial_connection_window_size(Some(GRPC_INITIAL_WINDOW_SIZE)) + .http2_adaptive_window(false); + + if let Some(tls) = tls { + let parsed = uri + .parse::() + .map_err(|e| GrpcClientError(format!("invalid gRPC endpoint {}: {}", uri, e)))?; + let host = parsed + .host() + .ok_or_else(|| GrpcClientError(format!("missing host in gRPC endpoint {}", uri)))?; + let tls_config = ClientTlsConfig::new() + .identity(Identity::from_pem( + tls.cert_pem.clone(), + tls.key_pem.clone(), + )) + .ca_certificate(Certificate::from_pem(tls.ca_pem.clone())) + .domain_name(host.to_string()); + endpoint = endpoint.tls_config(tls_config).map_err(|e| { + GrpcClientError(format!("configure gRPC TLS for {} failed: {}", uri, e)) + })?; + } + + Ok(endpoint) +} + +#[cfg(test)] +mod tests { + use super::{build_grpc_endpoint, grpc_endpoint_uri, load_outgoing_grpc_tls}; + use crate::config::{NeedleMapKind, ReadMode, VolumeServerConfig}; + use crate::security::tls::TlsPolicy; + + fn sample_config() -> VolumeServerConfig { + VolumeServerConfig { + port: 8080, + grpc_port: 18080, + public_port: 8080, + ip: "127.0.0.1".to_string(), + bind_ip: String::new(), + public_url: "127.0.0.1:8080".to_string(), + id: String::new(), + masters: vec![], + pre_stop_seconds: 0, + idle_timeout: 0, + data_center: String::new(), + rack: String::new(), + index_type: NeedleMapKind::InMemory, + disk_type: String::new(), + folders: vec![], + folder_max_limits: vec![], + folder_tags: vec![], + min_free_spaces: vec![], + disk_types: vec![], + idx_folder: String::new(), + white_list: vec![], + fix_jpg_orientation: false, + read_mode: ReadMode::Local, + cpu_profile: String::new(), + mem_profile: String::new(), + compaction_byte_per_second: 0, + maintenance_byte_per_second: 0, + file_size_limit_bytes: 0, + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(0), + inflight_download_data_timeout: std::time::Duration::from_secs(0), + has_slow_read: false, + read_buffer_size_mb: 0, + ldb_timeout: 0, + pprof: false, + metrics_port: 0, + metrics_ip: String::new(), + debug: false, + debug_port: 0, + ui_enabled: false, + jwt_signing_key: vec![], + jwt_signing_expires_seconds: 0, + jwt_read_signing_key: vec![], + jwt_read_signing_expires_seconds: 0, + https_cert_file: String::new(), + https_key_file: String::new(), + https_ca_file: String::new(), + https_client_enabled: false, + https_client_cert_file: String::new(), + https_client_key_file: String::new(), + https_client_ca_file: String::new(), + grpc_cert_file: String::new(), + grpc_key_file: String::new(), + grpc_ca_file: String::new(), + grpc_allowed_wildcard_domain: String::new(), + grpc_volume_allowed_common_names: vec![], + tls_policy: TlsPolicy::default(), + enable_write_queue: false, + security_file: String::new(), + } + } + + #[test] + fn test_grpc_endpoint_uri_uses_https_when_tls_enabled() { + let tls = super::OutgoingGrpcTlsConfig { + cert_pem: "cert".to_string(), + key_pem: "key".to_string(), + ca_pem: "ca".to_string(), + }; + assert_eq!( + grpc_endpoint_uri("master.example.com:19333", Some(&tls)), + "https://master.example.com:19333" + ); + } + + #[test] + fn test_load_outgoing_grpc_tls_requires_cert_key_and_ca() { + let mut config = sample_config(); + config.grpc_cert_file = "/tmp/client.pem".to_string(); + assert!(load_outgoing_grpc_tls(&config).unwrap().is_none()); + } + + #[test] + fn test_build_grpc_endpoint_without_tls_uses_http_scheme() { + let endpoint = build_grpc_endpoint("127.0.0.1:19333", None).unwrap(); + assert_eq!(endpoint.uri().scheme_str(), Some("http")); + } +} diff --git a/seaweed-volume/src/server/grpc_server.rs b/seaweed-volume/src/server/grpc_server.rs new file mode 100644 index 000000000..295583b0a --- /dev/null +++ b/seaweed-volume/src/server/grpc_server.rs @@ -0,0 +1,4536 @@ +//! gRPC service implementation for the volume server. +//! +//! Implements the VolumeServer trait generated from volume_server.proto. +//! 48 RPCs: core volume operations are fully implemented, streaming and +//! EC operations are stubbed with appropriate error messages. + +use std::pin::Pin; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +use tokio_stream::Stream; +use tonic::{Request, Response, Status, Streaming}; + +use crate::pb::filer_pb; +use crate::pb::master_pb; +use crate::pb::master_pb::seaweed_client::SeaweedClient; +use crate::pb::volume_server_pb; +use crate::pb::volume_server_pb::volume_server_server::VolumeServer; +use crate::storage::needle::needle::{self, Needle}; +use crate::storage::types::*; + +use super::grpc_client::{build_grpc_endpoint, GRPC_MAX_MESSAGE_SIZE}; +use super::volume_server::VolumeServerState; + +type BoxStream = Pin> + Send + 'static>>; + +fn volume_is_remote_only(dat_path: &str, has_remote_file: bool) -> bool { + has_remote_file && !std::path::Path::new(dat_path).exists() +} + +/// Persist VolumeServerState to a state.pb file (matches Go's State.save). +fn save_state_file( + path: &str, + state: &volume_server_pb::VolumeServerState, +) -> Result<(), std::io::Error> { + if path.is_empty() { + return Ok(()); + } + use prost::Message; + let buf = state.encode_to_vec(); + std::fs::write(path, buf) +} + +/// Load VolumeServerState from a state.pb file (matches Go's State.Load). +pub fn load_state_file( + path: &str, +) -> Option { + if path.is_empty() || !std::path::Path::new(path).exists() { + return None; + } + let data = std::fs::read(path).ok()?; + use prost::Message; + volume_server_pb::VolumeServerState::decode(data.as_slice()).ok() +} + +struct WriteThrottler { + bytes_per_second: i64, + last_size_counter: i64, + last_size_check_time: std::time::Instant, +} + +impl WriteThrottler { + fn new(bytes_per_second: i64) -> Self { + Self { + bytes_per_second, + last_size_counter: 0, + last_size_check_time: std::time::Instant::now(), + } + } + + async fn maybe_slowdown(&mut self, delta: i64) { + if self.bytes_per_second <= 0 { + return; + } + + self.last_size_counter += delta; + let elapsed = self.last_size_check_time.elapsed(); + if elapsed <= std::time::Duration::from_millis(100) { + return; + } + + let over_limit_bytes = self.last_size_counter - self.bytes_per_second / 10; + if over_limit_bytes > 0 { + let over_ratio = over_limit_bytes as f64 / self.bytes_per_second as f64; + let sleep_time = std::time::Duration::from_millis((over_ratio * 1000.0) as u64); + if !sleep_time.is_zero() { + tokio::time::sleep(sleep_time).await; + } + } + + self.last_size_counter = 0; + self.last_size_check_time = std::time::Instant::now(); + } +} + +struct MasterVolumeInfo { + volume_id: VolumeId, + collection: String, + replica_placement: u8, + ttl: u32, + disk_type: String, + ip: String, + port: u16, +} + +pub struct VolumeGrpcService { + pub state: Arc, +} + +impl VolumeGrpcService { + async fn notify_master_volume_readonly( + &self, + info: &MasterVolumeInfo, + is_readonly: bool, + ) -> Result<(), Status> { + let master_url = self.state.master_url.clone(); + if master_url.is_empty() { + return Ok(()); + } + let grpc_addr = parse_grpc_address(&master_url).map_err(|e| { + Status::internal(format!("invalid master address {}: {}", master_url, e)) + })?; + let endpoint = build_grpc_endpoint(&grpc_addr, self.state.outgoing_grpc_tls.as_ref()) + .map_err(|e| Status::internal(format!("master address {}: {}", master_url, e)))? + .connect_timeout(std::time::Duration::from_secs(5)) + .timeout(std::time::Duration::from_secs(30)); + let channel = endpoint + .connect() + .await + .map_err(|e| Status::internal(format!("connect to master {}: {}", master_url, e)))?; + let mut client = SeaweedClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + client + .volume_mark_readonly(master_pb::VolumeMarkReadonlyRequest { + ip: info.ip.clone(), + port: info.port as u32, + volume_id: info.volume_id.0, + collection: info.collection.clone(), + replica_placement: info.replica_placement as u32, + ttl: info.ttl, + disk_type: info.disk_type.clone(), + is_readonly, + ..Default::default() + }) + .await + .map_err(|e| { + Status::internal(format!( + "set volume {} readonly={} on master {}: {}", + info.volume_id, is_readonly, master_url, e + )) + })?; + Ok(()) + } + + /// Shared helper matching Go's `makeVolumeReadonly(ctx, v, persist)`. + /// 1. Check maintenance mode + /// 2. Notify master (readonly=true) + /// 3. Mark local volume readonly + /// 4. Notify master again (cover heartbeat race) + async fn make_volume_readonly(&self, vid: VolumeId, persist: bool) -> Result<(), Status> { + self.state.check_maintenance()?; + + let info = { + let store = self.state.store.read().unwrap(); + let (loc_idx, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", vid)))?; + MasterVolumeInfo { + volume_id: vid, + collection: vol.collection.clone(), + replica_placement: vol.super_block.replica_placement.to_byte(), + ttl: vol.super_block.ttl.to_u32(), + disk_type: store.locations[loc_idx].disk_type.to_string(), + ip: store.ip.clone(), + port: store.port, + } + }; + + // Step 1: stop master from redirecting traffic here + self.notify_master_volume_readonly(&info, true).await?; + + // Step 2: mark local volume readonly + { + let mut store = self.state.store.write().unwrap(); + if let Some((_, vol)) = store.find_volume_mut(vid) { + vol.set_read_only_persist(persist) + .map_err(|e| Status::internal(e.to_string()))?; + } + self.state.volume_state_notify.notify_one(); + } + + // Step 3: notify master again to cover heartbeat race + self.notify_master_volume_readonly(&info, true).await?; + Ok(()) + } +} + +#[tonic::async_trait] +impl VolumeServer for VolumeGrpcService { + // ---- Core volume operations ---- + + async fn batch_delete( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let mut results = Vec::new(); + + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + for fid_str in &req.file_ids { + let file_id = match needle::FileId::parse(fid_str) { + Ok(fid) => fid, + Err(e) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 400, // Bad Request + error: e, + size: 0, + version: 0, + }); + continue; + } + }; + + let mut n = Needle { + id: file_id.key, + cookie: file_id.cookie, + ..Needle::default() + }; + + // Check if this is an EC volume + let is_ec_volume = { + let store = self.state.store.read().unwrap(); + store.has_ec_volume(file_id.volume_id) + }; + + // Cookie validation (unless skip_cookie_check) + if !req.skip_cookie_check { + let original_cookie = n.cookie; + if !is_ec_volume { + let store = self.state.store.read().unwrap(); + match store.read_volume_needle(file_id.volume_id, &mut n) { + Ok(_) => {} + Err(e) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 404, + error: e.to_string(), + size: 0, + version: 0, + }); + continue; + } + } + } else { + // For EC volumes, verify needle exists in ecx index + let store = self.state.store.read().unwrap(); + if let Some(ec_vol) = store.find_ec_volume(file_id.volume_id) { + match ec_vol.find_needle_from_ecx(n.id) { + Ok(Some((_, size))) if !size.is_deleted() => { + // Needle exists and is not deleted — cookie check not possible + // for EC volumes without distributed read, so we accept it + n.data_size = size.0 as u32; + } + Ok(_) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 404, + error: format!("ec needle {} not found", fid_str), + size: 0, + version: 0, + }); + continue; + } + Err(e) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 404, + error: e.to_string(), + size: 0, + version: 0, + }); + continue; + } + } + } else { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 404, + error: format!("ec volume {} not found", file_id.volume_id), + size: 0, + version: 0, + }); + continue; + } + } + if n.cookie != original_cookie { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 400, + error: "File Random Cookie does not match.".to_string(), + size: 0, + version: 0, + }); + break; + } + } + + // Reject chunk manifest needles + if n.is_chunk_manifest() { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 406, + error: "ChunkManifest: not allowed in batch delete mode.".to_string(), + size: 0, + version: 0, + }); + continue; + } + + n.last_modified = now; + + if !is_ec_volume { + let mut store = self.state.store.write().unwrap(); + match store.delete_volume_needle(file_id.volume_id, &mut n) { + Ok(size) => { + if size.0 == 0 { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 304, + error: String::new(), + size: 0, + version: 0, + }); + } else { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 202, + error: String::new(), + size: size.0 as u32, + version: 0, + }); + } + } + Err(e) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 500, + error: e.to_string(), + size: 0, + version: 0, + }); + } + } + } else { + // EC volume deletion: journal the delete locally (with cookie validation, matching Go) + let mut store = self.state.store.write().unwrap(); + if let Some(ec_vol) = store.find_ec_volume_mut(file_id.volume_id) { + match ec_vol.journal_delete_with_cookie(n.id, n.cookie) { + Ok(()) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 202, + error: String::new(), + size: n.data_size, + version: 0, + }); + } + Err(e) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 500, + error: e.to_string(), + size: 0, + version: 0, + }); + } + } + } else { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 404, + error: format!("ec volume {} not found", file_id.volume_id), + size: 0, + version: 0, + }); + } + } + } + + Ok(Response::new(volume_server_pb::BatchDeleteResponse { + results, + })) + } + + async fn vacuum_volume_check( + &self, + request: Request, + ) -> Result, Status> { + let vid = VolumeId(request.into_inner().volume_id); + let store = self.state.store.read().unwrap(); + let garbage_ratio = match store.find_volume(vid) { + Some((_, vol)) => vol.garbage_level(), + None => return Err(Status::not_found(format!("not found volume id {}", vid))), + }; + Ok(Response::new(volume_server_pb::VacuumVolumeCheckResponse { + garbage_ratio, + })) + } + + type VacuumVolumeCompactStream = BoxStream; + async fn vacuum_volume_compact( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let preallocate = req.preallocate as u64; + let state = self.state.clone(); + + let (tx, rx) = tokio::sync::mpsc::channel(16); + + tokio::task::spawn_blocking(move || { + let compact_start = std::time::Instant::now(); + let report_interval: i64 = 128 * 1024 * 1024; + let next_report = std::sync::atomic::AtomicI64::new(report_interval); + + let tx_clone = tx.clone(); + let result = { + let mut store = state.store.write().unwrap(); + store.compact_volume(vid, preallocate, 0, |processed| { + let target = next_report.load(std::sync::atomic::Ordering::Relaxed); + if processed > target { + let resp = volume_server_pb::VacuumVolumeCompactResponse { + processed_bytes: processed, + load_avg_1m: 0.0, + }; + // If send fails (client disconnected), stop compaction + if tx_clone.blocking_send(Ok(resp)).is_err() { + return false; + } + next_report.store( + processed + report_interval, + std::sync::atomic::Ordering::Relaxed, + ); + } + true + }) + }; + + let success = result.is_ok(); + crate::metrics::VACUUMING_HISTOGRAM + .with_label_values(&["compact"]) + .observe(compact_start.elapsed().as_secs_f64()); + crate::metrics::VACUUMING_COMPACT_COUNTER + .with_label_values(&[if success { "true" } else { "false" }]) + .inc(); + + if let Err(e) = result { + let _ = tx.blocking_send(Err(Status::internal(e))); + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new( + Box::pin(stream) as Self::VacuumVolumeCompactStream + )) + } + + async fn vacuum_volume_commit( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let vid = VolumeId(request.into_inner().volume_id); + + // Match Go's store_vacuum.go CommitCompactVolume: skip commit if stopping + if *self.state.is_stopping.read().unwrap() { + return Err(Status::internal(format!( + "volume id {} skips compact commit because volume server is stopping", + vid.0 + ))); + } + + let commit_start = std::time::Instant::now(); + let mut store = self.state.store.write().unwrap(); + let result = store.commit_compact_volume(vid); + crate::metrics::VACUUMING_HISTOGRAM + .with_label_values(&["commit"]) + .observe(commit_start.elapsed().as_secs_f64()); + crate::metrics::VACUUMING_COMMIT_COUNTER + .with_label_values(&[if result.is_ok() { "true" } else { "false" }]) + .inc(); + match result { + Ok((is_read_only, volume_size)) => Ok(Response::new( + volume_server_pb::VacuumVolumeCommitResponse { + is_read_only, + volume_size, + }, + )), + Err(e) => Err(Status::internal(e)), + } + } + + async fn vacuum_volume_cleanup( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let vid = VolumeId(request.into_inner().volume_id); + let mut store = self.state.store.write().unwrap(); + match store.cleanup_compact_volume(vid) { + Ok(()) => Ok(Response::new( + volume_server_pb::VacuumVolumeCleanupResponse {}, + )), + Err(e) => Err(Status::internal(e)), + } + } + + async fn delete_collection( + &self, + request: Request, + ) -> Result, Status> { + let collection = &request.into_inner().collection; + let mut store = self.state.store.write().unwrap(); + store + .delete_collection(collection) + .map_err(|e| Status::internal(e))?; + Ok(Response::new(volume_server_pb::DeleteCollectionResponse {})) + } + + async fn allocate_volume( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let rp = crate::storage::super_block::ReplicaPlacement::from_string(&req.replication) + .map_err(|e| Status::invalid_argument(e.to_string()))?; + let ttl = if req.ttl.is_empty() { + None + } else { + Some( + crate::storage::needle::ttl::TTL::read(&req.ttl) + .map_err(|e| Status::invalid_argument(e))?, + ) + }; + let disk_type = DiskType::from_string(&req.disk_type); + + let version = if req.version > 0 { + crate::storage::types::Version(req.version as u8) + } else { + crate::storage::types::Version::current() + }; + + let mut store = self.state.store.write().unwrap(); + store + .add_volume( + vid, + &req.collection, + Some(rp), + ttl, + req.preallocate as u64, + disk_type, + version, + ) + .map_err(|e| Status::internal(e.to_string()))?; + self.state.volume_state_notify.notify_one(); + + Ok(Response::new(volume_server_pb::AllocateVolumeResponse {})) + } + + async fn volume_sync_status( + &self, + request: Request, + ) -> Result, Status> { + let vid = VolumeId(request.into_inner().volume_id); + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + Ok(Response::new(volume_server_pb::VolumeSyncStatusResponse { + volume_id: vid.0, + collection: vol.collection.clone(), + replication: vol.super_block.replica_placement.to_string(), + ttl: vol.super_block.ttl.to_string(), + tail_offset: vol.dat_file_size().unwrap_or(0), + compact_revision: vol.super_block.compaction_revision as u32, + idx_file_size: vol.idx_file_size(), + version: vol.version().0 as u32, + })) + } + + type VolumeIncrementalCopyStream = BoxStream; + async fn volume_incremental_copy( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Sync to disk first + { + let mut store = self.state.store.write().unwrap(); + if let Some((_, v)) = store.find_volume_mut(vid) { + let _ = v.sync_to_disk(); + } + } + + let store = self.state.store.read().unwrap(); + let (_, v) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + let dat_size = v.dat_file_size().unwrap_or(0); + let super_block_size = v.super_block.block_size() as u64; + + // If since_ns is very large (after all data), return empty + if req.since_ns == u64::MAX || dat_size <= super_block_size { + drop(store); + let stream = tokio_stream::iter(Vec::new()); + return Ok(Response::new(Box::pin(stream))); + } + + // Use binary search to find the starting offset + let start_offset = if req.since_ns == 0 { + super_block_size + } else { + match v.binary_search_by_append_at_ns(req.since_ns) { + Ok((_offset, true)) => { + // All entries are before since_ns — nothing to send + drop(store); + let stream = tokio_stream::iter(Vec::new()); + return Ok(Response::new(Box::pin(stream))); + } + Ok((offset, false)) => { + let actual = offset.to_actual_offset(); + if actual <= 0 { + super_block_size + } else { + actual as u64 + } + } + Err(e) => { + return Err(Status::internal(format!( + "fail to locate by appendAtNs {}: {}", + req.since_ns, e + ))); + } + } + }; + let mut results = Vec::new(); + let mut bytes_to_read = (dat_size - start_offset) as i64; + let buffer_size = 2 * 1024 * 1024; + let mut offset = start_offset; + + while bytes_to_read > 0 { + let chunk = std::cmp::min(bytes_to_read as usize, buffer_size); + match v.read_dat_slice(offset, chunk) { + Ok(buf) if buf.is_empty() => break, + Ok(buf) => { + let read_len = buf.len() as i64; + results.push(Ok(volume_server_pb::VolumeIncrementalCopyResponse { + file_content: buf, + })); + bytes_to_read -= read_len; + offset += read_len as u64; + } + Err(e) => return Err(Status::internal(e.to_string())), + } + } + + drop(store); + let stream = tokio_stream::iter(results); + Ok(Response::new(Box::pin(stream))) + } + + async fn volume_mount( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let mut store = self.state.store.write().unwrap(); + store + .mount_volume_by_id(vid) + .map_err(|e| Status::internal(e.to_string()))?; + self.state.volume_state_notify.notify_one(); + + Ok(Response::new(volume_server_pb::VolumeMountResponse {})) + } + + async fn volume_unmount( + &self, + request: Request, + ) -> Result, Status> { + let vid = VolumeId(request.into_inner().volume_id); + let mut store = self.state.store.write().unwrap(); + // Go returns nil when volume is not found (idempotent unmount) + if store.unmount_volume(vid) { + self.state.volume_state_notify.notify_one(); + } + Ok(Response::new(volume_server_pb::VolumeUnmountResponse {})) + } + + async fn volume_delete( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let mut store = self.state.store.write().unwrap(); + if req.only_empty { + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + if vol.file_count() > 0 { + return Err(Status::failed_precondition("volume not empty")); + } + } + store + .delete_volume(vid, req.only_empty) + .map_err(|e| Status::internal(e.to_string()))?; + self.state.volume_state_notify.notify_one(); + Ok(Response::new(volume_server_pb::VolumeDeleteResponse {})) + } + + async fn volume_mark_readonly( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + // Go: volume lookup (L239-241) happens before maintenance check (L166 in makeVolumeReadonly) + { + let store = self.state.store.read().unwrap(); + store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", vid)))?; + } + self.make_volume_readonly(vid, req.persist).await?; + Ok(Response::new( + volume_server_pb::VolumeMarkReadonlyResponse {}, + )) + } + + async fn volume_mark_writable( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let info = { + let store = self.state.store.read().unwrap(); + let (loc_idx, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", vid)))?; + MasterVolumeInfo { + volume_id: vid, + collection: vol.collection.clone(), + replica_placement: vol.super_block.replica_placement.to_byte(), + ttl: vol.super_block.ttl.to_u32(), + disk_type: store.locations[loc_idx].disk_type.to_string(), + ip: store.ip.clone(), + port: store.port, + } + }; + // Go: maintenance check (L194 in makeVolumeWritable) happens after volume lookup (L253-255) + self.state.check_maintenance()?; + + // Step 1: mark local volume as writable (save result; Go continues on error) + let mark_result = { + let mut store = self.state.store.write().unwrap(); + let res = store + .find_volume_mut(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", vid))) + .and_then(|(_, vol)| { + vol.set_writable() + .map_err(|e| Status::internal(e.to_string())) + }); + if res.is_ok() { + self.state.volume_state_notify.notify_one(); + } + res + }; + + // Step 2: Go returns early if marking failed (L198-200), before notifying master. + mark_result?; + // Step 3: enable master to redirect traffic here + self.notify_master_volume_readonly(&info, false).await?; + Ok(Response::new( + volume_server_pb::VolumeMarkWritableResponse {}, + )) + } + + async fn volume_configure( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Validate replication string — return response error, not gRPC error + let rp = match crate::storage::super_block::ReplicaPlacement::from_string(&req.replication) + { + Ok(rp) => rp, + Err(e) => { + return Ok(Response::new(volume_server_pb::VolumeConfigureResponse { + error: format!("volume configure replication {}: {}", req.replication, e), + })); + } + }; + + let mut store = self.state.store.write().unwrap(); + + // Unmount the volume (Go propagates unmount errors via resp.Error; + // Rust unmount_volume returns bool, so not-found falls through to configure_volume) + store.unmount_volume(vid); + + // Modify the super block on disk (replica_placement byte) + if let Err(e) = store.configure_volume(vid, rp) { + let mut error = format!("volume configure {}: {}", vid, e); + // Error recovery: try to re-mount anyway + if let Err(mount_err) = store.mount_volume_by_id(vid) { + error += &format!(". Also failed to restore mount: {}", mount_err); + } + return Ok(Response::new(volume_server_pb::VolumeConfigureResponse { + error, + })); + } + + // Re-mount the volume + if let Err(e) = store.mount_volume_by_id(vid) { + return Ok(Response::new(volume_server_pb::VolumeConfigureResponse { + error: format!("volume configure mount {}: {}", vid, e), + })); + } + self.state.volume_state_notify.notify_one(); + + Ok(Response::new(volume_server_pb::VolumeConfigureResponse { + error: String::new(), + })) + } + + async fn volume_status( + &self, + request: Request, + ) -> Result, Status> { + let vid = VolumeId(request.into_inner().volume_id); + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + // Go checks v.DataBackend != nil before building the response. + if !vol.has_data_backend() { + return Err(Status::internal(format!( + "volume {} data backend not found", + vid + ))); + } + + // Go uses v.DataBackend.GetStat() which returns the actual .dat file size + let volume_size = vol.dat_file_size().unwrap_or(0); + + Ok(Response::new(volume_server_pb::VolumeStatusResponse { + is_read_only: vol.is_read_only(), + volume_size, + file_count: vol.file_count() as u64, + file_deleted_count: vol.deleted_count() as u64, + })) + } + + async fn get_state( + &self, + _request: Request, + ) -> Result, Status> { + Ok(Response::new(volume_server_pb::GetStateResponse { + state: Some(volume_server_pb::VolumeServerState { + maintenance: self.state.maintenance.load(Ordering::Relaxed), + version: self.state.state_version.load(Ordering::Relaxed), + }), + })) + } + + async fn set_state( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + if let Some(new_state) = &req.state { + // Go's State.Update checks version: if incoming version != stored version → error. + let current_version = self.state.state_version.load(Ordering::Relaxed); + if new_state.version != current_version { + return Err(Status::failed_precondition(format!( + "version mismatch for VolumeServerState (got {}, want {})", + new_state.version, current_version + ))); + } + + // Save previous state for rollback on persistence failure (matches Go) + let prev_maintenance = self.state.maintenance.load(Ordering::Relaxed); + let prev_version = current_version; + + self.state + .maintenance + .store(new_state.maintenance, Ordering::Relaxed); + let new_version = self.state.state_version.fetch_add(1, Ordering::Relaxed) + 1; + + // Persist to disk (matches Go's State.save) + let pb = volume_server_pb::VolumeServerState { + maintenance: new_state.maintenance, + version: new_version, + }; + if let Err(e) = save_state_file(&self.state.state_file_path, &pb) { + // Rollback in-memory state on save failure (matches Go) + self.state.maintenance.store(prev_maintenance, Ordering::Relaxed); + self.state.state_version.store(prev_version, Ordering::Relaxed); + return Err(Status::internal(format!("failed to save state: {}", e))); + } + + Ok(Response::new(volume_server_pb::SetStateResponse { + state: Some(pb), + })) + } else { + // nil state = no-op, return current state + Ok(Response::new(volume_server_pb::SetStateResponse { + state: Some(volume_server_pb::VolumeServerState { + maintenance: self.state.maintenance.load(Ordering::Relaxed), + version: self.state.state_version.load(Ordering::Relaxed), + }), + })) + } + } + + type VolumeCopyStream = BoxStream; + async fn volume_copy( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // If volume already exists locally, delete it first + { + let store = self.state.store.read().unwrap(); + if store.find_volume(vid).is_some() { + drop(store); + let mut store = self.state.store.write().unwrap(); + store.delete_volume(vid, false).map_err(|e| { + Status::internal(format!("failed to delete existing volume {}: {}", vid, e)) + })?; + self.state.volume_state_notify.notify_one(); + } + } + + // Parse source_data_node address: "ip:port.grpcPort" or "ip:port" (grpc = port + 10000) + let source = &req.source_data_node; + let grpc_addr = parse_grpc_address(source).map_err(|e| { + Status::internal(format!( + "VolumeCopy volume {} invalid source_data_node {}: {}", + vid, source, e + )) + })?; + + let channel = build_grpc_endpoint(&grpc_addr, self.state.outgoing_grpc_tls.as_ref()) + .map_err(|e| { + Status::internal(format!("VolumeCopy volume {} parse source: {}", vid, e)) + })? + .connect() + .await + .map_err(|e| { + Status::internal(format!( + "VolumeCopy volume {} connect to {}: {}", + vid, grpc_addr, e + )) + })?; + + let mut client = + volume_server_pb::volume_server_client::VolumeServerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + + // Get file status from source + let vol_info = client + .read_volume_file_status(volume_server_pb::ReadVolumeFileStatusRequest { + volume_id: req.volume_id, + }) + .await + .map_err(|e| Status::internal(format!("read volume file status failed, {}", e)))? + .into_inner(); + + let requested_disk_type = if !req.disk_type.is_empty() { + DiskType::from_string(&req.disk_type) + } else { + DiskType::from_string(&vol_info.disk_type) + }; + + // Find a free disk location using Go's Store.FindFreeLocation semantics. + let (data_base, idx_base, selected_disk_type) = { + let store = self.state.store.read().unwrap(); + let Some(loc_idx) = store.find_free_location_predicate(|loc| { + loc.disk_type == requested_disk_type + && loc.available_space.load(Ordering::Relaxed) > vol_info.dat_file_size + }) else { + return Err(Status::internal(format!( + "no space left {}", + requested_disk_type.readable_string() + ))); + }; + let loc = &store.locations[loc_idx]; + ( + loc.directory.clone(), + loc.idx_directory.clone(), + loc.disk_type.clone(), + ) + }; + + let data_base_name = + crate::storage::volume::volume_file_name(&data_base, &vol_info.collection, vid); + let idx_base_name = + crate::storage::volume::volume_file_name(&idx_base, &vol_info.collection, vid); + + // Write a .note file to indicate copy in progress + let note_path = format!("{}.note", data_base_name); + let _ = std::fs::write(¬e_path, format!("copying from {}", source)); + + let has_remote_dat = vol_info + .volume_info + .as_ref() + .map(|vi| !vi.files.is_empty()) + .unwrap_or(false); + + let (tx, rx) = + tokio::sync::mpsc::channel::>(16); + let state = self.state.clone(); + + tokio::spawn(async move { + let result = async { + let report_interval: i64 = 128 * 1024 * 1024; + let mut next_report_target: i64 = report_interval; + let io_byte_per_second = if req.io_byte_per_second > 0 { + req.io_byte_per_second + } else { + state.maintenance_byte_per_second + }; + let mut throttler = WriteThrottler::new(io_byte_per_second); + + // Query master for preallocation settings (matching Go VolumeCopy behavior). + let mut preallocate_size: i64 = 0; + if !has_remote_dat { + let grpc_addr = super::heartbeat::to_grpc_address(&state.master_url); + match super::heartbeat::try_get_master_configuration( + &grpc_addr, + state.outgoing_grpc_tls.as_ref(), + ) + .await + { + Ok(resp) => { + if resp.volume_preallocate { + preallocate_size = resp.volume_size_limit_m_b as i64 * 1024 * 1024; + } + } + Err(e) => { + tracing::warn!("get master {} configuration: {}", state.master_url, e); + } + } + + if preallocate_size > 0 { + let dat_path = format!("{}.dat", data_base_name); + let file = std::fs::File::create(&dat_path).map_err(|e| { + Status::internal(format!( + "create preallocated volume file {}: {}", + dat_path, e + )) + })?; + file.set_len(preallocate_size as u64).map_err(|e| { + Status::internal(format!("preallocate volume file {}: {}", dat_path, e)) + })?; + } + } + + // Copy .dat file + if !has_remote_dat { + let dat_path = format!("{}.dat", data_base_name); + let dat_modified_ts_ns = copy_file_from_source( + &mut client, + false, + &req.collection, + req.volume_id, + vol_info.compaction_revision, + vol_info.dat_file_size, + &dat_path, + ".dat", + false, + true, + Some(&tx), + &mut next_report_target, + report_interval, + &mut throttler, + ) + .await + .map_err(|e| Status::internal(e))?; + if dat_modified_ts_ns > 0 { + set_file_mtime(&dat_path, dat_modified_ts_ns); + } + } + + // Copy .idx file + let idx_path = format!("{}.idx", idx_base_name); + let idx_modified_ts_ns = copy_file_from_source( + &mut client, + false, + &req.collection, + req.volume_id, + vol_info.compaction_revision, + vol_info.idx_file_size, + &idx_path, + ".idx", + false, + false, + None, + &mut next_report_target, + report_interval, + &mut throttler, + ) + .await + .map_err(|e| Status::internal(e))?; + if idx_modified_ts_ns > 0 { + set_file_mtime(&idx_path, idx_modified_ts_ns); + } + + // Copy .vif file (ignore if not found on source) + let vif_path = format!("{}.vif", data_base_name); + let vif_modified_ts_ns = copy_file_from_source( + &mut client, + false, + &req.collection, + req.volume_id, + vol_info.compaction_revision, + 1024 * 1024, + &vif_path, + ".vif", + false, + true, + None, + &mut next_report_target, + report_interval, + &mut throttler, + ) + .await + .map_err(|e| Status::internal(e))?; + if vif_modified_ts_ns > 0 { + set_file_mtime(&vif_path, vif_modified_ts_ns); + } + + // Remove the .note file + let _ = std::fs::remove_file(¬e_path); + + // Verify file sizes + if !has_remote_dat { + let dat_path = format!("{}.dat", data_base_name); + check_copy_file_size(&dat_path, vol_info.dat_file_size)?; + } + if vol_info.idx_file_size > 0 { + check_copy_file_size(&idx_path, vol_info.idx_file_size)?; + } + + // Find last_append_at_ns from copied files + let last_append_at_ns = if !has_remote_dat { + find_last_append_at_ns( + &idx_path, + &format!("{}.dat", data_base_name), + vol_info.version, + ) + .unwrap_or(vol_info.dat_file_timestamp_seconds * 1_000_000_000) + } else { + vol_info.dat_file_timestamp_seconds * 1_000_000_000 + }; + + // Mount the volume + { + let mut store = state.store.write().unwrap(); + store + .mount_volume(vid, &vol_info.collection, selected_disk_type) + .map_err(|e| { + Status::internal(format!("failed to mount volume {}: {}", vid, e)) + })?; + } + state.volume_state_notify.notify_one(); + + // Send final response with last_append_at_ns + let _ = tx + .send(Ok(volume_server_pb::VolumeCopyResponse { + last_append_at_ns: last_append_at_ns, + processed_bytes: 0, + })) + .await; + + Ok::<(), Status>(()) + } + .await; + + if let Err(e) = result { + // Clean up on error + let _ = std::fs::remove_file(format!("{}.dat", data_base_name)); + let _ = std::fs::remove_file(format!("{}.idx", idx_base_name)); + let _ = std::fs::remove_file(format!("{}.vif", data_base_name)); + let _ = std::fs::remove_file(¬e_path); + let _ = tx.send(Err(e)).await; + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new(Box::pin(stream))) + } + + async fn read_volume_file_status( + &self, + request: Request, + ) -> Result, Status> { + let vid = VolumeId(request.into_inner().volume_id); + let store = self.state.store.read().unwrap(); + let (loc_idx, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + let mod_time = vol.dat_file_mod_time(); + Ok(Response::new( + volume_server_pb::ReadVolumeFileStatusResponse { + volume_id: vid.0, + idx_file_timestamp_seconds: mod_time, + idx_file_size: vol.idx_file_size(), + dat_file_timestamp_seconds: mod_time, + dat_file_size: vol.dat_file_size().unwrap_or(0), + file_count: vol.file_count() as u64, + compaction_revision: vol.super_block.compaction_revision as u32, + collection: vol.collection.clone(), + disk_type: store.locations[loc_idx].disk_type.to_string(), + volume_info: Some(vol.volume_info.clone()), + version: vol.version().0 as u32, + }, + )) + } + + type CopyFileStream = BoxStream; + async fn copy_file( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let file_name: String; + + if !req.is_ec_volume { + // Sync volume to disk before copying (matching Go's v.SyncToDisk()) + { + let mut store = self.state.store.write().unwrap(); + if let Some((_, v)) = store.find_volume_mut(vid) { + let _ = v.sync_to_disk(); + } + } + + let store = self.state.store.read().unwrap(); + let (_, v) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + // Check compaction revision + if req.compaction_revision != u32::MAX + && v.last_compact_revision() != req.compaction_revision as u16 + { + return Err(Status::failed_precondition(format!( + "volume {} is compacted", + vid.0 + ))); + } + + file_name = v.file_name(&req.ext); + drop(store); + } else { + // Sync EC volume journal to disk before copying (matching Go's ecv.SyncToDisk()) + { + let store = self.state.store.read().unwrap(); + if let Some(ecv) = store.find_ec_volume(vid) { + let _ = ecv.sync_to_disk(); + } + } + + // EC volume: search disk locations for the file + let store = self.state.store.read().unwrap(); + let mut found_path = None; + let ec_base = if req.collection.is_empty() { + format!("{}{}", vid.0, req.ext) + } else { + format!("{}_{}{}", req.collection, vid.0, req.ext) + }; + for loc in &store.locations { + let path = format!("{}/{}", loc.directory, ec_base); + if std::path::Path::new(&path).exists() { + found_path = Some(path); + } + let idx_path = format!("{}/{}", loc.idx_directory, ec_base); + if std::path::Path::new(&idx_path).exists() { + found_path = Some(idx_path); + } + } + drop(store); + + match found_path { + Some(p) => file_name = p, + None => { + if req.ignore_source_file_not_found { + let stream = tokio_stream::iter(Vec::new()); + return Ok(Response::new(Box::pin(stream))); + } + return Err(Status::not_found(format!( + "CopyFile not found ec volume id {}", + vid.0 + ))); + } + } + } + + // Open file and read content + let file = match std::fs::File::open(&file_name) { + Ok(f) => f, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + if req.ignore_source_file_not_found || req.stop_offset == 0 { + let stream = tokio_stream::iter(Vec::new()); + return Ok(Response::new(Box::pin(stream))); + } + return Err(Status::not_found(format!("{}", e))); + } + Err(e) => return Err(Status::internal(e.to_string())), + }; + + let metadata = file + .metadata() + .map_err(|e| Status::internal(e.to_string()))?; + let mod_ts_ns = metadata + .modified() + .ok() + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|d| d.as_nanos() as i64) + .unwrap_or(0); + + let mut results: Vec> = Vec::new(); + let mut bytes_to_read = req.stop_offset as i64; + let mut reader = std::io::BufReader::new(file); + let buffer_size = 2 * 1024 * 1024; // 2MB chunks + let mut first = true; + + use std::io::Read; + while bytes_to_read > 0 { + let chunk_size = std::cmp::min(bytes_to_read as usize, buffer_size); + let mut buf = vec![0u8; chunk_size]; + match reader.read(&mut buf) { + Ok(0) => break, // EOF + Ok(n) => { + buf.truncate(n); + if n as i64 > bytes_to_read { + buf.truncate(bytes_to_read as usize); + } + results.push(Ok(volume_server_pb::CopyFileResponse { + file_content: buf, + modified_ts_ns: if first { mod_ts_ns } else { 0 }, + })); + first = false; + bytes_to_read -= n as i64; + } + Err(e) => return Err(Status::internal(e.to_string())), + } + } + + // If no data was sent, still send ModifiedTsNs + if first && mod_ts_ns != 0 { + results.push(Ok(volume_server_pb::CopyFileResponse { + file_content: vec![], + modified_ts_ns: mod_ts_ns, + })); + } + + let stream = tokio_stream::iter(results); + Ok(Response::new(Box::pin(stream))) + } + + async fn receive_file( + &self, + request: Request>, + ) -> Result, Status> { + self.state.check_maintenance()?; + + let mut stream = request.into_inner(); + let mut target_file: Option = None; + let mut file_path: Option = None; + let mut bytes_written: u64 = 0; + let mut resp_error: Option = None; + + let result: Result<(), Status> = async { + while let Some(req) = stream.message().await? { + match req.data { + Some(volume_server_pb::receive_file_request::Data::Info(info)) => { + // Determine file path + let path = if info.is_ec_volume { + let store = self.state.store.read().unwrap(); + // Go prefers a HardDriveType location, then falls back to first + let dir = store + .locations + .iter() + .find(|loc| loc.disk_type == DiskType::HardDrive) + .or_else(|| store.locations.first()) + .map(|loc| loc.directory.clone()); + drop(store); + let dir = match dir { + Some(d) => d, + None => { + resp_error = Some("no storage location available".to_string()); + break; + } + }; + let ec_base = if info.collection.is_empty() { + format!("{}", info.volume_id) + } else { + format!("{}_{}", info.collection, info.volume_id) + }; + format!("{}/{}{}", dir, ec_base, info.ext) + } else { + let store = self.state.store.read().unwrap(); + let (_, v) = + store.find_volume(VolumeId(info.volume_id)).ok_or_else(|| { + Status::not_found(format!( + "volume {} not found", + info.volume_id + )) + })?; + let p = v.file_name(&info.ext); + drop(store); + p + }; + + target_file = Some(std::fs::File::create(&path).map_err(|e| { + Status::internal(format!("failed to create file: {}", e)) + })?); + file_path = Some(path); + } + Some(volume_server_pb::receive_file_request::Data::FileContent(content)) => { + if let Some(ref mut f) = target_file { + use std::io::Write; + match f.write(&content) { + Ok(n) => bytes_written += n as u64, + Err(e) => { + // Match Go: write failures are response-level errors, not gRPC errors + resp_error = Some(format!("failed to write file: {}", e)); + break; + } + } + } else { + // Go returns protocol violations as response-level errors + resp_error = Some("file info must be sent first".to_string()); + break; + } + } + None => { + resp_error = Some("unknown message type".to_string()); + break; + } + } + } + Ok(()) + } + .await; + + match result { + Ok(()) => { + // Check for protocol-level errors (returned in response body, not gRPC status) + if let Some(err_msg) = resp_error { + return Ok(Response::new(volume_server_pb::ReceiveFileResponse { + error: err_msg, + bytes_written: 0, + })); + } + if let Some(ref f) = target_file { + let _ = f.sync_all(); + } + Ok(Response::new(volume_server_pb::ReceiveFileResponse { + error: String::new(), + bytes_written, + })) + } + Err(e) => { + // Clean up partial file on stream error (Go parity: closes file, removes it) + if let Some(f) = target_file.take() { + drop(f); + } + if let Some(ref p) = file_path { + let _ = std::fs::remove_file(p); + } + Err(e) + } + } + } + + async fn read_needle_blob( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let offset = req.offset; + let size = Size(req.size); + + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + let blob = vol.read_needle_blob(offset, size).map_err(|e| { + Status::internal(format!( + "read needle blob offset {} size {}: {}", + offset, size.0, e + )) + })?; + + Ok(Response::new(volume_server_pb::ReadNeedleBlobResponse { + needle_blob: blob, + })) + } + + async fn read_needle_meta( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let needle_id = NeedleId(req.needle_id); + + let store = self.state.store.read().unwrap(); + let (_, vol) = store.find_volume(vid).ok_or_else(|| { + Status::not_found(format!( + "not found volume id {} and read needle metadata at ec shards is not supported", + vid + )) + })?; + + let offset = req.offset; + let size = crate::storage::types::Size(req.size); + + let mut n = Needle { + id: needle_id, + flags: 0x08, + ..Needle::default() + }; + vol.read_needle_meta_at(&mut n, offset, size) + .map_err(|e| Status::internal(format!("read needle meta: {}", e)))?; + + let ttl_str = n.ttl.as_ref().map_or(String::new(), |t| t.to_string()); + Ok(Response::new(volume_server_pb::ReadNeedleMetaResponse { + cookie: n.cookie.0, + last_modified: n.last_modified, + crc: n.checksum.0, + ttl: ttl_str, + append_at_ns: n.append_at_ns, + })) + } + + async fn write_needle_blob( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let needle_id = NeedleId(req.needle_id); + let size = Size(req.size); + + let mut store = self.state.store.write().unwrap(); + let (_, vol) = store + .find_volume_mut(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + vol.write_needle_blob_and_index(needle_id, &req.needle_blob, size) + .map_err(|e| { + Status::internal(format!( + "write blob needle {} size {}: {}", + needle_id.0, size.0, e + )) + })?; + + Ok(Response::new(volume_server_pb::WriteNeedleBlobResponse {})) + } + + type ReadAllNeedlesStream = BoxStream; + async fn read_all_needles( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let state = self.state.clone(); + + let (tx, rx) = tokio::sync::mpsc::channel(32); + + // Stream needles lazily via a blocking task (matches Go's scanner pattern) + tokio::task::spawn_blocking(move || { + let store = state.store.read().unwrap(); + for &raw_vid in &req.volume_ids { + let vid = VolumeId(raw_vid); + let v = match store.find_volume(vid) { + Some((_, v)) => v, + None => { + let _ = tx.blocking_send(Err(Status::not_found(format!( + "not found volume id {}", + vid + )))); + return; + } + }; + + let needles = match v.read_all_needles() { + Ok(n) => n, + Err(e) => { + let _ = tx.blocking_send(Err(Status::internal(e.to_string()))); + return; + } + }; + + for n in needles { + let compressed = n.is_compressed(); + if tx + .blocking_send(Ok(volume_server_pb::ReadAllNeedlesResponse { + volume_id: raw_vid, + needle_id: n.id.into(), + cookie: n.cookie.0, + needle_blob: n.data, + needle_blob_compressed: compressed, + last_modified: n.last_modified, + crc: n.checksum.0, + name: n.name, + mime: n.mime, + })) + .is_err() + { + return; // receiver dropped + } + } + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new(Box::pin(stream))) + } + + type VolumeTailSenderStream = BoxStream; + async fn volume_tail_sender( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let (version, sb_size) = { + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + (vol.version().0 as u32, vol.super_block.block_size() as u64) + }; + + let state = self.state.clone(); + let (tx, rx) = tokio::sync::mpsc::channel(32); + const BUFFER_SIZE_LIMIT: usize = 2 * 1024 * 1024; + + tokio::spawn(async move { + let since_ns = req.since_ns; + let idle_timeout = req.idle_timeout_seconds; + let mut last_timestamp_ns = since_ns; + let mut draining_seconds = idle_timeout as i64; + + loop { + // Use binary search to find starting offset, then scan from there + let scan_result = { + let store = state.store.read().unwrap(); + if let Some((_, vol)) = store.find_volume(vid) { + let start_offset = if last_timestamp_ns > 0 { + match vol.binary_search_by_append_at_ns(last_timestamp_ns) { + Ok((offset, _is_last)) => { + if offset.is_zero() { + Ok(sb_size) + } else { + Ok(offset.to_actual_offset() as u64) + } + } + Err(e) => { + tracing::warn!( + "fail to locate by appendAtNs {}: {}", + last_timestamp_ns, + e + ); + Err(format!( + "fail to locate by appendAtNs {}: {}", + last_timestamp_ns, e + )) + } + } + } else { + Ok(sb_size) + }; + match start_offset { + Ok(off) => Ok(vol.scan_raw_needles_from(off)), + Err(msg) => Err(msg), + } + } else { + break; + } + }; + + let scan_inner = match scan_result { + Ok(r) => r, + Err(msg) => { + let _ = tx.send(Err(Status::internal(msg))).await; + return; + } + }; + + let entries = match scan_inner { + Ok(e) => e, + Err(_) => break, + }; + + // Filter entries since last_timestamp_ns + let mut last_processed_ns = last_timestamp_ns; + let mut sent_any = false; + for (header, body, append_at_ns) in &entries { + if *append_at_ns <= last_timestamp_ns && last_timestamp_ns > 0 { + continue; + } + sent_any = true; + // Send body in chunks of BUFFER_SIZE_LIMIT + // Go sends needle_header on every chunk + let mut i = 0; + while i < body.len() { + let end = std::cmp::min(i + BUFFER_SIZE_LIMIT, body.len()); + let is_last_chunk = end >= body.len(); + let msg = volume_server_pb::VolumeTailSenderResponse { + needle_header: header.clone(), + needle_body: body[i..end].to_vec(), + is_last_chunk, + version, + }; + if tx.send(Ok(msg)).await.is_err() { + return; + } + i = end; + } + if *append_at_ns > last_processed_ns { + last_processed_ns = *append_at_ns; + } + } + + if !sent_any { + // Send heartbeat + let msg = volume_server_pb::VolumeTailSenderResponse { + is_last_chunk: true, + version, + ..Default::default() + }; + if tx.send(Ok(msg)).await.is_err() { + return; + } + } + + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + + if idle_timeout == 0 { + last_timestamp_ns = last_processed_ns; + continue; + } + if last_processed_ns == last_timestamp_ns { + draining_seconds -= 1; + if draining_seconds <= 0 { + return; // EOF + } + } else { + last_timestamp_ns = last_processed_ns; + draining_seconds = idle_timeout as i64; + } + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new(Box::pin(stream))) + } + + async fn volume_tail_receiver( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Check volume exists + { + let store = self.state.store.read().unwrap(); + store.find_volume(vid).ok_or_else(|| { + Status::not_found(format!("receiver not found volume id {}", vid)) + })?; + } + + // Parse source address and connect + let source = &req.source_volume_server; + let grpc_addr = parse_grpc_address(source) + .map_err(|e| Status::internal(format!("invalid source address {}: {}", source, e)))?; + + let channel = build_grpc_endpoint(&grpc_addr, self.state.outgoing_grpc_tls.as_ref()) + .map_err(|e| Status::internal(format!("parse source: {}", e)))? + .connect() + .await + .map_err(|e| Status::internal(format!("connect to {}: {}", grpc_addr, e)))?; + + let mut client = + volume_server_pb::volume_server_client::VolumeServerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + + // Call VolumeTailSender on source + let mut stream = client + .volume_tail_sender(volume_server_pb::VolumeTailSenderRequest { + volume_id: req.volume_id, + since_ns: req.since_ns, + idle_timeout_seconds: req.idle_timeout_seconds, + }) + .await + .map_err(|e| Status::internal(format!("volume_tail_sender: {}", e)))? + .into_inner(); + + let state = self.state.clone(); + + // Receive needles from source and write locally + while let Some(resp) = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv from tail sender: {}", e)))? + { + let needle_header = resp.needle_header; + let mut needle_body = resp.needle_body; + + if needle_header.is_empty() { + continue; + } + + // Collect all chunks if not last + if !resp.is_last_chunk { + // Need to receive remaining chunks + loop { + let chunk = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv chunk: {}", e)))? + .ok_or_else(|| Status::internal("unexpected end of tail stream"))?; + needle_body.extend_from_slice(&chunk.needle_body); + if chunk.is_last_chunk { + break; + } + } + } + + // Parse needle from header + body + let mut n = Needle::default(); + n.read_header(&needle_header); + n.read_body_v2(&needle_body) + .map_err(|e| Status::internal(format!("parse needle body: {}", e)))?; + + // Write needle to local volume + let mut store = state.store.write().unwrap(); + store + .write_volume_needle(vid, &mut n) + .map_err(|e| Status::internal(format!("write needle: {}", e)))?; + } + + Ok(Response::new( + volume_server_pb::VolumeTailReceiverResponse {}, + )) + } + + // ---- EC operations ---- + + async fn volume_ec_shards_generate( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let collection = &req.collection; + + // Find the volume's directory and validate collection + let (dir, idx_dir, vol_version, dat_file_size, expire_at_sec) = { + let store = self.state.store.read().unwrap(); + let (loc_idx, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", vid)))?; + if vol.collection != req.collection { + return Err(Status::internal(format!( + "existing collection:{} unexpected input: {}", + vol.collection, req.collection + ))); + } + let version = vol.version().0 as u32; + let dat_size = vol.dat_file_size().unwrap_or(0) as i64; + let expire_at_sec = { + let ttl_seconds = vol.super_block.ttl.to_seconds(); + if ttl_seconds > 0 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + + ttl_seconds + } else { + 0 + } + }; + ( + store.locations[loc_idx].directory.clone(), + store.locations[loc_idx].idx_directory.clone(), + version, + dat_size, + expire_at_sec, + ) + }; + + // Check existing .vif for EC shard config (matching Go's MaybeLoadVolumeInfo) + let (data_shards, parity_shards) = + crate::storage::erasure_coding::ec_volume::read_ec_shard_config(&dir, collection, vid); + + if let Err(e) = crate::storage::erasure_coding::ec_encoder::write_ec_files( + &dir, + &idx_dir, + collection, + vid, + data_shards as usize, + parity_shards as usize, + ) { + // Cleanup partially-created .ecNN and .ecx files on failure (matching Go defer) + let base = crate::storage::volume::volume_file_name(&dir, collection, vid); + let total_shards = data_shards + parity_shards; + for i in 0..total_shards { + let shard_path = format!("{}.ec{:02}", base, i); + let _ = std::fs::remove_file(&shard_path); + } + let _ = std::fs::remove_file(format!("{}.ecx", base)); + return Err(Status::internal(e.to_string())); + } + + // Write .vif file with EC shard metadata + { + let base = crate::storage::volume::volume_file_name(&dir, collection, vid); + let vif_path = format!("{}.vif", base); + let vif = crate::storage::volume::VifVolumeInfo { + version: vol_version, + dat_file_size, + expire_at_sec, + ec_shard_config: Some(crate::storage::volume::VifEcShardConfig { + data_shards: data_shards, + parity_shards: parity_shards, + }), + ..Default::default() + }; + let content = serde_json::to_string_pretty(&vif) + .map_err(|e| Status::internal(format!("serialize vif: {}", e)))?; + std::fs::write(&vif_path, content) + .map_err(|e| Status::internal(format!("write vif: {}", e)))?; + } + + Ok(Response::new( + volume_server_pb::VolumeEcShardsGenerateResponse {}, + )) + } + + async fn volume_ec_shards_rebuild( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let collection = &req.collection; + + // Search ALL locations for shards, pick the best rebuild location + // (most shards + has .ecx), collect additional dirs. + // Matches Go's multi-location search in VolumeEcShardsRebuild. + let base_name = if collection.is_empty() { + format!("{}", vid.0) + } else { + format!("{}_{}", collection, vid.0) + }; + + struct LocInfo { + dir: String, + idx_dir: String, + shard_count: usize, + has_ecx: bool, + } + + let store = self.state.store.read().unwrap(); + let mut loc_infos: Vec = Vec::new(); + + for loc in &store.locations { + // Count shards in this location's directory + let mut shard_count = 0usize; + if let Ok(entries) = std::fs::read_dir(&loc.directory) { + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if name.starts_with(&format!("{}.ec", base_name)) { + let suffix = &name[base_name.len() + 3..]; + if suffix.len() == 2 && suffix.chars().all(|c| c.is_ascii_digit()) { + shard_count += 1; + } + } + } + } + + // Check for .ecx in idx_directory first, then data directory + let idx_base = format!("{}/{}", loc.idx_directory, base_name); + let data_base = format!("{}/{}", loc.directory, base_name); + let has_ecx = std::path::Path::new(&format!("{}.ecx", idx_base)).exists() + || (loc.idx_directory != loc.directory + && std::path::Path::new(&format!("{}.ecx", data_base)).exists()); + + if shard_count == 0 && !has_ecx { + continue; + } + + loc_infos.push(LocInfo { + dir: loc.directory.clone(), + idx_dir: loc.idx_directory.clone(), + shard_count, + has_ecx, + }); + } + drop(store); + + if loc_infos.is_empty() { + return Ok(Response::new( + volume_server_pb::VolumeEcShardsRebuildResponse { + rebuilt_shard_ids: vec![], + }, + )); + } + + // Pick rebuild location: has .ecx and most shards + let mut rebuild_loc_idx: Option = None; + let mut other_dirs: Vec = Vec::new(); + + for (i, info) in loc_infos.iter().enumerate() { + if info.has_ecx + && (rebuild_loc_idx.is_none() + || info.shard_count > loc_infos[rebuild_loc_idx.unwrap()].shard_count) + { + if let Some(prev) = rebuild_loc_idx { + other_dirs.push(loc_infos[prev].dir.clone()); + } + rebuild_loc_idx = Some(i); + } else { + other_dirs.push(info.dir.clone()); + } + } + + let rebuild_loc_idx = match rebuild_loc_idx { + Some(i) => i, + None => { + return Ok(Response::new( + volume_server_pb::VolumeEcShardsRebuildResponse { + rebuilt_shard_ids: vec![], + }, + )); + } + }; + + let rebuild_dir = loc_infos[rebuild_loc_idx].dir.clone(); + let rebuild_idx_dir = loc_infos[rebuild_loc_idx].idx_dir.clone(); + + // Determine data/parity shard config from rebuild dir + let (data_shards, parity_shards) = + crate::storage::erasure_coding::ec_volume::read_ec_shard_config( + &rebuild_dir, + collection, + vid, + ); + let total_shards = data_shards + parity_shards; + + // Check which shards are missing (check rebuild dir and all other dirs) + let mut missing: Vec = Vec::new(); + for shard_id in 0..total_shards as u8 { + let shard = crate::storage::erasure_coding::ec_shard::EcVolumeShard::new( + &rebuild_dir, + collection, + vid, + shard_id, + ); + let mut found = std::path::Path::new(&shard.file_name()).exists(); + if !found { + for other_dir in &other_dirs { + let other_shard = crate::storage::erasure_coding::ec_shard::EcVolumeShard::new( + other_dir, collection, vid, shard_id, + ); + if std::path::Path::new(&other_shard.file_name()).exists() { + found = true; + break; + } + } + } + if !found { + missing.push(shard_id as u32); + } + } + + if missing.is_empty() { + return Ok(Response::new( + volume_server_pb::VolumeEcShardsRebuildResponse { + rebuilt_shard_ids: vec![], + }, + )); + } + + // Rebuild missing shards, searching all locations for input shards + crate::storage::erasure_coding::ec_encoder::rebuild_ec_files( + &rebuild_dir, + collection, + vid, + &missing, + data_shards as usize, + parity_shards as usize, + ) + .map_err(|e| Status::internal(format!("RebuildEcFiles: {}", e)))?; + + // Rebuild .ecx; use idx_directory with fallback to data directory + let ecx_base = format!("{}/{}", rebuild_idx_dir, base_name); + let ecx_rebuild_dir = if std::path::Path::new(&format!("{}.ecx", ecx_base)).exists() { + rebuild_idx_dir + } else if rebuild_idx_dir != rebuild_dir { + rebuild_dir.clone() + } else { + rebuild_idx_dir + }; + + crate::storage::erasure_coding::ec_encoder::rebuild_ecx_file( + &ecx_rebuild_dir, + collection, + vid, + data_shards as usize, + ) + .map_err(|e| Status::internal(format!("RebuildEcxFile: {}", e)))?; + + Ok(Response::new( + volume_server_pb::VolumeEcShardsRebuildResponse { + rebuilt_shard_ids: missing, + }, + )) + } + + async fn volume_ec_shards_copy( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Select target location matching Go's 3-tier fallback: + // When disk_id > 0: use that specific location + // When disk_id == 0 (unset): (1) location with existing EC shards, (2) any HDD, (3) any + let (dest_dir, dest_idx_dir) = { + let store = self.state.store.read().unwrap(); + let count = store.locations.len(); + + if req.disk_id > 0 { + // Explicit disk selection + if (req.disk_id as usize) >= count { + return Err(Status::invalid_argument(format!( + "invalid disk_id {}: only have {} disks", + req.disk_id, count + ))); + } + let loc = &store.locations[req.disk_id as usize]; + (loc.directory.clone(), loc.idx_directory.clone()) + } else { + // Auto-select: prefer location with existing EC shards for this volume + let loc_idx = store + .find_free_location_predicate(|loc| loc.has_ec_volume(vid)) + .or_else(|| { + // Fall back to any HDD location + store.find_free_location_predicate(|loc| { + loc.disk_type == DiskType::HardDrive + }) + }) + .or_else(|| { + // Fall back to any location + store.find_free_location_predicate(|_| true) + }); + match loc_idx { + Some(i) => { + let loc = &store.locations[i]; + (loc.directory.clone(), loc.idx_directory.clone()) + } + None => { + return Err(Status::internal("no space left".to_string())); + } + } + } + }; + + // Connect to source and copy shard files via CopyFile + let source = &req.source_data_node; + let grpc_addr = parse_grpc_address(source).map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} invalid source_data_node {}: {}", + vid, source, e + )) + })?; + + let channel = build_grpc_endpoint(&grpc_addr, self.state.outgoing_grpc_tls.as_ref()) + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} parse source: {}", + vid, e + )) + })? + .connect() + .await + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} connect to {}: {}", + vid, grpc_addr, e + )) + })?; + + let mut client = + volume_server_pb::volume_server_client::VolumeServerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + + // Copy each shard + for &shard_id in &req.shard_ids { + let ext = format!(".ec{:02}", shard_id); + let copy_req = volume_server_pb::CopyFileRequest { + volume_id: req.volume_id, + collection: req.collection.clone(), + is_ec_volume: true, + ext: ext.clone(), + compaction_revision: u32::MAX, + stop_offset: i64::MAX as u64, + ..Default::default() + }; + let mut stream = client + .copy_file(copy_req) + .await + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} copy {}: {}", + vid, ext, e + )) + })? + .into_inner(); + + let file_path = { + let base = + crate::storage::volume::volume_file_name(&dest_dir, &req.collection, vid); + format!("{}{}", base, ext) + }; + let mut file = std::fs::File::create(&file_path) + .map_err(|e| Status::internal(format!("create {}: {}", file_path, e)))?; + while let Some(chunk) = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv {}: {}", ext, e)))? + { + use std::io::Write; + file.write_all(&chunk.file_content) + .map_err(|e| Status::internal(format!("write {}: {}", file_path, e)))?; + } + } + + // Copy .ecx file if requested + if req.copy_ecx_file { + let copy_req = volume_server_pb::CopyFileRequest { + volume_id: req.volume_id, + collection: req.collection.clone(), + is_ec_volume: true, + ext: ".ecx".to_string(), + compaction_revision: u32::MAX, + stop_offset: i64::MAX as u64, + ..Default::default() + }; + let mut stream = client + .copy_file(copy_req) + .await + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} copy .ecx: {}", + vid, e + )) + })? + .into_inner(); + + let file_path = { + let base = + crate::storage::volume::volume_file_name(&dest_idx_dir, &req.collection, vid); + format!("{}.ecx", base) + }; + let mut file = std::fs::File::create(&file_path) + .map_err(|e| Status::internal(format!("create {}: {}", file_path, e)))?; + while let Some(chunk) = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv .ecx: {}", e)))? + { + use std::io::Write; + file.write_all(&chunk.file_content) + .map_err(|e| Status::internal(format!("write {}: {}", file_path, e)))?; + } + } + + // Copy .ecj file if requested + if req.copy_ecj_file { + let copy_req = volume_server_pb::CopyFileRequest { + volume_id: req.volume_id, + collection: req.collection.clone(), + is_ec_volume: true, + ext: ".ecj".to_string(), + compaction_revision: u32::MAX, + stop_offset: i64::MAX as u64, + ignore_source_file_not_found: true, + ..Default::default() + }; + let mut stream = client + .copy_file(copy_req) + .await + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} copy .ecj: {}", + vid, e + )) + })? + .into_inner(); + + let file_path = { + let base = + crate::storage::volume::volume_file_name(&dest_idx_dir, &req.collection, vid); + format!("{}.ecj", base) + }; + let mut file = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(&file_path) + .map_err(|e| Status::internal(format!("create {}: {}", file_path, e)))?; + while let Some(chunk) = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv .ecj: {}", e)))? + { + use std::io::Write; + file.write_all(&chunk.file_content) + .map_err(|e| Status::internal(format!("write {}: {}", file_path, e)))?; + } + } + + // Copy .vif file if requested + if req.copy_vif_file { + let copy_req = volume_server_pb::CopyFileRequest { + volume_id: req.volume_id, + collection: req.collection.clone(), + is_ec_volume: true, + ext: ".vif".to_string(), + compaction_revision: u32::MAX, + stop_offset: i64::MAX as u64, + ignore_source_file_not_found: true, + ..Default::default() + }; + let mut stream = client + .copy_file(copy_req) + .await + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} copy .vif: {}", + vid, e + )) + })? + .into_inner(); + + let file_path = { + let base = + crate::storage::volume::volume_file_name(&dest_dir, &req.collection, vid); + format!("{}.vif", base) + }; + let mut file = std::fs::File::create(&file_path) + .map_err(|e| Status::internal(format!("create {}: {}", file_path, e)))?; + while let Some(chunk) = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv .vif: {}", e)))? + { + use std::io::Write; + file.write_all(&chunk.file_content) + .map_err(|e| Status::internal(format!("write {}: {}", file_path, e)))?; + } + } + + Ok(Response::new( + volume_server_pb::VolumeEcShardsCopyResponse {}, + )) + } + + async fn volume_ec_shards_delete( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let mut store = self.state.store.write().unwrap(); + store.delete_ec_shards(vid, &req.collection, &req.shard_ids); + drop(store); + self.state.volume_state_notify.notify_one(); + Ok(Response::new( + volume_server_pb::VolumeEcShardsDeleteResponse {}, + )) + } + + async fn volume_ec_shards_mount( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Mount one shard at a time, returning error on first failure. + // Matches Go: for _, shardId := range req.ShardIds { err = vs.store.MountEcShards(...) } + let mut store = self.state.store.write().unwrap(); + for &shard_id in &req.shard_ids { + store + .mount_ec_shard(vid, &req.collection, shard_id) + .map_err(|e| { + Status::internal(format!("mount {}.{}: {}", req.volume_id, shard_id, e)) + })?; + } + drop(store); + self.state.volume_state_notify.notify_one(); + + Ok(Response::new( + volume_server_pb::VolumeEcShardsMountResponse {}, + )) + } + + async fn volume_ec_shards_unmount( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Unmount one shard at a time, returning error on first failure. + // Matches Go: for _, shardId := range req.ShardIds { err = vs.store.UnmountEcShards(...) } + let mut store = self.state.store.write().unwrap(); + for &shard_id in &req.shard_ids { + store.unmount_ec_shard(vid, shard_id).map_err(|e| { + Status::internal(format!("unmount {}.{}: {}", req.volume_id, shard_id, e)) + })?; + } + drop(store); + self.state.volume_state_notify.notify_one(); + Ok(Response::new( + volume_server_pb::VolumeEcShardsUnmountResponse {}, + )) + } + + type VolumeEcShardReadStream = BoxStream; + async fn volume_ec_shard_read( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let store = self.state.store.read().unwrap(); + let ec_vol = store.find_ec_volume(vid).ok_or_else(|| { + Status::not_found(format!( + "ec volume {} shard {} not found", + req.volume_id, req.shard_id + )) + })?; + + // Check if the requested needle is deleted (via .ecx index, matching Go) + if req.file_key > 0 { + let needle_id = NeedleId(req.file_key); + if let Some((_offset, size)) = ec_vol + .find_needle_from_ecx(needle_id) + .map_err(|e| Status::internal(e.to_string()))? + { + if size.is_deleted() { + let results = vec![Ok(volume_server_pb::VolumeEcShardReadResponse { + is_deleted: true, + ..Default::default() + })]; + return Ok(Response::new(Box::pin(tokio_stream::iter(results)))); + } + } + } + + // Read from the shard + let shard = ec_vol + .shards + .get(req.shard_id as usize) + .and_then(|s| s.as_ref()) + .ok_or_else(|| { + Status::not_found(format!( + "ec volume {} shard {} not mounted", + req.volume_id, req.shard_id + )) + })?; + + let total_size = if req.size > 0 { + req.size as usize + } else { + 1024 * 1024 + }; + + // Stream in 2MB chunks (matching Go's BufferSizeLimit) + const BUFFER_SIZE_LIMIT: usize = 2 * 1024 * 1024; + let mut results: Vec> = + Vec::new(); + let mut bytes_read: usize = 0; + let mut current_offset = req.offset as u64; + + while bytes_read < total_size { + let chunk_size = std::cmp::min(BUFFER_SIZE_LIMIT, total_size - bytes_read); + let mut buf = vec![0u8; chunk_size]; + let n = shard + .read_at(&mut buf, current_offset) + .map_err(|e| Status::internal(e.to_string()))?; + if n == 0 { + break; + } + buf.truncate(n); + bytes_read += n; + current_offset += n as u64; + results.push(Ok(volume_server_pb::VolumeEcShardReadResponse { + data: buf, + is_deleted: false, + })); + if n < chunk_size { + break; // short read means EOF + } + } + + Ok(Response::new(Box::pin(tokio_stream::iter(results)))) + } + + async fn volume_ec_blob_delete( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let needle_id = NeedleId(req.file_key); + + // Go checks if needle is already deleted (via ecx) before journaling. + // Search all locations for the EC volume. + let mut store = self.state.store.write().unwrap(); + if let Some(ec_vol) = store.find_ec_volume_mut(vid) { + // Check if already deleted via ecx index + if let Ok(Some((_offset, size))) = ec_vol.find_needle_from_ecx(needle_id) { + if size.is_deleted() { + // Already deleted, no-op + return Ok(Response::new( + volume_server_pb::VolumeEcBlobDeleteResponse {}, + )); + } + } + ec_vol + .journal_delete(needle_id) + .map_err(|e| Status::internal(e.to_string()))?; + } + // If EC volume not mounted, it's a no-op (matching Go behavior) + Ok(Response::new( + volume_server_pb::VolumeEcBlobDeleteResponse {}, + )) + } + + async fn volume_ec_shards_to_volume( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let store = self.state.store.read().unwrap(); + let ec_vol = store + .find_ec_volume(vid) + .ok_or_else(|| Status::not_found(format!("ec volume {} not found", req.volume_id)))?; + + if ec_vol.collection != req.collection { + return Err(Status::internal(format!( + "existing collection:{} unexpected input: {}", + ec_vol.collection, req.collection + ))); + } + + // Use EC context data shard count from the volume + let data_shards = ec_vol.data_shards as usize; + + // Validate data shard count range (matches Go's VolumeEcShardsToVolume) + let max_shard_count = crate::storage::erasure_coding::ec_shard::MAX_SHARD_COUNT; + if data_shards == 0 || data_shards > max_shard_count { + return Err(Status::invalid_argument(format!( + "invalid data shard count {} for volume {} (must be 1..{})", + data_shards, req.volume_id, max_shard_count + ))); + } + + // Check that all data shards are present + for shard_id in 0..data_shards { + if ec_vol + .shards + .get(shard_id) + .map(|s| s.is_none()) + .unwrap_or(true) + { + return Err(Status::internal(format!( + "ec volume {} missing shard {}", + req.volume_id, shard_id + ))); + } + } + + // Read the .ecx index to check for live entries + let ecx_path = ec_vol.ecx_file_name(); + let ecx_data = + std::fs::read(&ecx_path).map_err(|e| Status::internal(format!("read ecx: {}", e)))?; + let entry_count = ecx_data.len() / NEEDLE_MAP_ENTRY_SIZE; + + let mut has_live = false; + for i in 0..entry_count { + let start = i * NEEDLE_MAP_ENTRY_SIZE; + let (_, _, size) = + idx_entry_from_bytes(&ecx_data[start..start + NEEDLE_MAP_ENTRY_SIZE]); + if !size.is_deleted() { + has_live = true; + break; + } + } + + if !has_live { + return Err(Status::failed_precondition(format!( + "ec volume {} has no live entries", + req.volume_id + ))); + } + + // Reconstruct the volume from EC shards + let dir = ec_vol.dir.clone(); + let collection = ec_vol.collection.clone(); + drop(store); + + // Calculate .dat file size from .ecx entries + let dat_file_size = + crate::storage::erasure_coding::ec_decoder::find_dat_file_size(&dir, &collection, vid) + .map_err(|e| Status::internal(format!("FindDatFileSize: {}", e)))?; + + // Write .dat file using block-interleaved reading from shards + crate::storage::erasure_coding::ec_decoder::write_dat_file_from_shards( + &dir, + &collection, + vid, + dat_file_size, + data_shards, + ) + .map_err(|e| Status::internal(format!("WriteDatFile: {}", e)))?; + + // Write .idx file from .ecx and .ecj files + crate::storage::erasure_coding::ec_decoder::write_idx_file_from_ec_index( + &dir, + &collection, + vid, + ) + .map_err(|e| Status::internal(format!("WriteIdxFileFromEcIndex: {}", e)))?; + + // Go does NOT unmount EC shards or mount the volume here. + // The caller (ec.balance / ec.decode) handles mount/unmount separately. + + Ok(Response::new( + volume_server_pb::VolumeEcShardsToVolumeResponse {}, + )) + } + + async fn volume_ec_shards_info( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let store = self.state.store.read().unwrap(); + let ec_vol = store + .find_ec_volume(vid) + .ok_or_else(|| Status::not_found(format!("ec volume {} not found", req.volume_id)))?; + + let mut shard_infos = Vec::new(); + for (i, shard) in ec_vol.shards.iter().enumerate() { + match shard { + Some(s) => { + shard_infos.push(volume_server_pb::EcShardInfo { + shard_id: i as u32, + size: s.file_size(), + collection: ec_vol.collection.clone(), + volume_id: req.volume_id, + }); + } + None => { + shard_infos.push(volume_server_pb::EcShardInfo { + shard_id: i as u32, + collection: ec_vol.collection.clone(), + volume_id: req.volume_id, + ..Default::default() + }); + } + } + } + + // Walk .ecx index to compute file counts and total size (matching Go's WalkIndex) + let (file_count, file_deleted_count, volume_size) = ec_vol + .walk_ecx_stats() + .map_err(|e| Status::internal(e.to_string()))?; + + Ok(Response::new( + volume_server_pb::VolumeEcShardsInfoResponse { + ec_shard_infos: shard_infos, + volume_size, + file_count, + file_deleted_count, + }, + )) + } + + // ---- Tiered storage ---- + + type VolumeTierMoveDatToRemoteStream = + BoxStream; + async fn volume_tier_move_dat_to_remote( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Validate volume exists and collection matches + let dat_path = { + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", req.volume_id)))?; + + if vol.collection != req.collection { + return Err(Status::invalid_argument(format!( + "existing collection:{} unexpected input: {}", + vol.collection, req.collection + ))); + } + + let dat_path = vol.dat_path(); + + // Match Go's DiskFile check: if the .dat file is still local, we can + // keep tiering it even when remote file entries already exist. + if volume_is_remote_only(&dat_path, vol.has_remote_file) { + // Already on remote -- return empty stream (matches Go: returns nil) + let stream = tokio_stream::empty(); + return Ok(Response::new( + Box::pin(stream) as Self::VolumeTierMoveDatToRemoteStream + )); + } + + // Check if the destination backend already exists in volume info + let (backend_type, backend_id) = + crate::remote_storage::s3_tier::backend_name_to_type_id( + &req.destination_backend_name, + ); + for rf in &vol.volume_info.files { + if rf.backend_type == backend_type && rf.backend_id == backend_id { + return Err(Status::already_exists(format!( + "destination {} already exists", + req.destination_backend_name + ))); + } + } + + dat_path + }; + + // Look up the S3 tier backend + let backend = { + let registry = self.state.s3_tier_registry.read().unwrap(); + registry.get(&req.destination_backend_name).ok_or_else(|| { + let keys = registry.names(); + Status::not_found(format!( + "destination {} not found, supported: {:?}", + req.destination_backend_name, keys + )) + })? + }; + + let (backend_type, backend_id) = + crate::remote_storage::s3_tier::backend_name_to_type_id(&req.destination_backend_name); + + let (tx, rx) = tokio::sync::mpsc::channel::< + Result, + >(16); + let state = self.state.clone(); + let keep_local = req.keep_local_dat_file; + let dest_backend_name = req.destination_backend_name.clone(); + + tokio::spawn(async move { + let result: Result<(), Status> = async { + // Upload the .dat file to S3 with progress + let tx_progress = tx.clone(); + let mut last_report = std::time::Instant::now(); + let (key, size) = backend + .upload_file(&dat_path, move |processed, percentage| { + let now = std::time::Instant::now(); + if now.duration_since(last_report) >= std::time::Duration::from_secs(1) { + last_report = now; + let _ = tx_progress.try_send(Ok( + volume_server_pb::VolumeTierMoveDatToRemoteResponse { + processed, + processed_percentage: percentage, + }, + )); + } + }) + .await + .map_err(|e| { + Status::internal(format!( + "backend {} copy file {}: {}", + dest_backend_name, dat_path, e + )) + })?; + + // Update volume info with remote file reference + { + let mut store = state.store.write().unwrap(); + if let Some((_, vol)) = store.find_volume_mut(vid) { + let now_unix = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + vol.volume_info.files.push(volume_server_pb::RemoteFile { + backend_type: backend_type.clone(), + backend_id: backend_id.clone(), + key, + offset: 0, + file_size: size, + modified_time: now_unix, + extension: ".dat".to_string(), + }); + vol.refresh_remote_write_mode(); + + if let Err(e) = vol.save_volume_info() { + return Err(Status::internal(format!( + "volume {} failed to save remote file info: {}", + vid, e + ))); + } + + // Close local dat file handle (matches Go's v.LoadRemoteFile + // which closes DataBackend before switching to remote) + vol.close_local_dat_backend(); + + // Optionally remove local .dat file from disk + if !keep_local { + let dat = vol.dat_path(); + let _ = std::fs::remove_file(&dat); + } + } + } + + // Go does NOT send a final 100% progress message after upload completion + Ok(()) + } + .await; + + if let Err(e) = result { + let _ = tx.send(Err(e)).await; + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new( + Box::pin(stream) as Self::VolumeTierMoveDatToRemoteStream + )) + } + + type VolumeTierMoveDatFromRemoteStream = + BoxStream; + async fn volume_tier_move_dat_from_remote( + &self, + request: Request, + ) -> Result, Status> { + // Note: Go does NOT check maintenance mode for TierMoveDatFromRemote + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Validate volume and get remote storage info + let (dat_path, storage_name, storage_key) = { + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", req.volume_id)))?; + + if vol.collection != req.collection { + return Err(Status::invalid_argument(format!( + "existing collection:{} unexpected input: {}", + vol.collection, req.collection + ))); + } + + let (storage_name, storage_key) = vol.remote_storage_name_key(); + if storage_name.is_empty() || storage_key.is_empty() { + return Err(Status::failed_precondition(format!( + "volume {} is already on local disk", + vid + ))); + } + + // Check if the dat file already exists locally (matches Go's DataBackend DiskFile check) + let dat_path = vol.dat_path(); + if std::path::Path::new(&dat_path).exists() { + return Err(Status::failed_precondition(format!( + "volume {} is already on local disk", + vid + ))); + } + + (dat_path, storage_name, storage_key) + }; + + // Look up the S3 tier backend + let backend = { + let registry = self.state.s3_tier_registry.read().unwrap(); + registry.get(&storage_name).ok_or_else(|| { + let keys = registry.names(); + Status::not_found(format!( + "remote storage {} not found from supported: {:?}", + storage_name, keys + )) + })? + }; + + let (tx, rx) = tokio::sync::mpsc::channel::< + Result, + >(16); + let state = self.state.clone(); + let keep_remote = req.keep_remote_dat_file; + + tokio::spawn(async move { + let result: Result<(), Status> = async { + // Download the .dat file from S3 with progress + let tx_progress = tx.clone(); + let mut last_report = std::time::Instant::now(); + let storage_name_clone = storage_name.clone(); + let _size = backend + .download_file(&dat_path, &storage_key, move |processed, percentage| { + let now = std::time::Instant::now(); + if now.duration_since(last_report) >= std::time::Duration::from_secs(1) { + last_report = now; + let _ = tx_progress.try_send(Ok( + volume_server_pb::VolumeTierMoveDatFromRemoteResponse { + processed, + processed_percentage: percentage, + }, + )); + } + }) + .await + .map_err(|e| { + Status::internal(format!( + "backend {} copy file {}: {}", + storage_name_clone, dat_path, e + )) + })?; + + if !keep_remote { + // Delete remote file + backend.delete_file(&storage_key).await.map_err(|e| { + Status::internal(format!( + "volume {} failed to delete remote file {}: {}", + vid, storage_key, e + )) + })?; + + // Update volume info: remove remote file reference + { + let mut store = state.store.write().unwrap(); + if let Some((_, vol)) = store.find_volume_mut(vid) { + if !vol.volume_info.files.is_empty() { + vol.volume_info.files.remove(0); + } + vol.refresh_remote_write_mode(); + + if let Err(e) = vol.save_volume_info() { + return Err(Status::internal(format!( + "volume {} failed to save remote file info: {}", + vid, e + ))); + } + + // Close old remote backend (matches Go: v.DataBackend.Close(); v.DataBackend = nil) + // This forces the next read to discover and open the local .dat file. + vol.close_remote_dat_backend(); + } + } + } + + // Go does NOT send a final 100% progress message after download completion + Ok(()) + } + .await; + + if let Err(e) = result { + let _ = tx.send(Err(e)).await; + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new( + Box::pin(stream) as Self::VolumeTierMoveDatFromRemoteStream + )) + } + + // ---- Server management ---- + + async fn volume_server_status( + &self, + _request: Request, + ) -> Result, Status> { + let store = self.state.store.read().unwrap(); + + let mut disk_statuses = Vec::new(); + for loc in &store.locations { + let (all, free) = get_disk_usage(&loc.directory); + let used = all.saturating_sub(free); + let percent_free = if all > 0 { + ((free as f64 / all as f64) * 100.0) as f32 + } else { + 0.0 + }; + let percent_used = if all > 0 { + ((used as f64 / all as f64) * 100.0) as f32 + } else { + 0.0 + }; + disk_statuses.push(volume_server_pb::DiskStatus { + dir: loc.directory.clone(), + all, + used, + free, + percent_free, + percent_used, + disk_type: loc.disk_type.to_string(), + }); + } + + Ok(Response::new( + volume_server_pb::VolumeServerStatusResponse { + disk_statuses, + memory_status: Some(super::memory_status::collect_mem_status()), + version: crate::version::full_version().to_string(), + data_center: self.state.data_center.clone(), + rack: self.state.rack.clone(), + state: Some(volume_server_pb::VolumeServerState { + maintenance: self.state.maintenance.load(Ordering::Relaxed), + version: self.state.state_version.load(Ordering::Relaxed), + }), + }, + )) + } + + async fn volume_server_leave( + &self, + _request: Request, + ) -> Result, Status> { + *self.state.is_stopping.write().unwrap() = true; + self.state.is_heartbeating.store(false, Ordering::Relaxed); + // Wake heartbeat loop to send deregistration. + self.state.volume_state_notify.notify_one(); + Ok(Response::new( + volume_server_pb::VolumeServerLeaveResponse {}, + )) + } + + async fn fetch_and_write_needle( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Check volume exists + { + let store = self.state.store.read().unwrap(); + store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + } + + // Get remote storage configuration + let remote_conf = req + .remote_conf + .as_ref() + .ok_or_else(|| Status::invalid_argument("remote storage configuration is required"))?; + + // Create remote storage client + let client = + crate::remote_storage::make_remote_storage_client(remote_conf).map_err(|e| { + Status::internal(format!( + "get remote client: make remote storage client {}: {}", + remote_conf.name, e, + )) + })?; + + let remote_location = req + .remote_location + .as_ref() + .ok_or_else(|| Status::invalid_argument("remote storage location is required"))?; + + // Read data from remote storage + let data = client + .read_file(remote_location, req.offset, req.size) + .await + .map_err(|e| { + Status::internal(format!("read from remote {:?}: {}", remote_location, e)) + })?; + + // Build needle and write locally + let mut n = Needle { + id: NeedleId(req.needle_id), + cookie: Cookie(req.cookie), + data_size: data.len() as u32, + data: data.clone(), + ..Needle::default() + }; + n.checksum = crate::storage::needle::crc::CRC::new(&n.data); + n.size = crate::storage::types::Size(4 + n.data_size as i32 + 1); + n.last_modified = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + n.set_has_last_modified_date(); + + // Run local write and replica writes concurrently (matches Go's WaitGroup) + let mut handles: Vec>> = Vec::new(); + + // Spawn local write as a concurrent task + let state_clone = self.state.clone(); + let mut n_clone = n.clone(); + let needle_id = req.needle_id; + let size = req.size; + let local_handle = tokio::task::spawn_blocking(move || { + let mut store = state_clone.store.write().unwrap(); + store + .write_volume_needle(vid, &mut n_clone) + .map(|_| ()) + .map_err(|e| format!("local write needle {} size {}: {}", needle_id, size, e)) + }); + + // Spawn replica writes concurrently + if !req.replicas.is_empty() { + let file_id = format!("{},{:x}{:08x}", vid, req.needle_id, req.cookie); + let http_client = self.state.http_client.clone(); + let scheme = self.state.outgoing_http_scheme.clone(); + for replica in &req.replicas { + let raw_target = format!("{}/{}?type=replicate", replica.url, file_id); + let url = + crate::server::volume_server::normalize_outgoing_http_url(&scheme, &raw_target) + .map_err(Status::internal)?; + let data_clone = data.clone(); + let client_clone = http_client.clone(); + let needle_id = req.needle_id; + let size = req.size; + handles.push(tokio::spawn(async move { + let form = reqwest::multipart::Form::new() + .part("file", reqwest::multipart::Part::bytes(data_clone)); + client_clone + .post(&url) + .multipart(form) + .send() + .await + .map(|_| ()) + .map_err(|e| { + format!("remote write needle {} size {}: {}", needle_id, size, e) + }) + })); + } + } + + // Await ALL writes before checking errors (matches Go's wg.Wait()) + let local_result = local_handle.await; + let mut replica_results = Vec::new(); + for handle in handles { + replica_results.push(handle.await); + } + + // Check local write result + match local_result { + Ok(Ok(())) => {} + Ok(Err(e)) => return Err(Status::internal(e)), + Err(e) => return Err(Status::internal(format!("local write task failed: {}", e))), + } + + let e_tag = n.etag(); + + // Check replica write results + for result in replica_results { + match result { + Ok(Ok(())) => {} + Ok(Err(e)) => return Err(Status::internal(e)), + Err(e) => return Err(Status::internal(format!("replication task failed: {}", e))), + } + } + + Ok(Response::new( + volume_server_pb::FetchAndWriteNeedleResponse { e_tag }, + )) + } + + async fn scrub_volume( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + // Validate mode + let mode = req.mode; + match mode { + 1 | 2 | 3 => {} // INDEX=1, FULL=2, LOCAL=3 + _ => { + return Err(Status::invalid_argument(format!( + "unsupported volume scrub mode {}", + mode + ))) + } + } + + let mut total_volumes: u64 = 0; + let mut total_files: u64 = 0; + let mut broken_volume_ids: Vec = Vec::new(); + let mut details: Vec = Vec::new(); + let mut broken_vids: Vec = Vec::new(); + + // Scrub phase: hold store read lock, then drop before async readonly calls. + { + let store = self.state.store.read().unwrap(); + let vids: Vec = if req.volume_ids.is_empty() { + store.all_volume_ids() + } else { + req.volume_ids.iter().map(|&id| VolumeId(id)).collect() + }; + + for vid in &vids { + let (_, v) = store + .find_volume(*vid) + .ok_or_else(|| Status::not_found(format!("volume id {} not found", vid.0)))?; + total_volumes += 1; + + // INDEX mode (1) calls scrub_index; LOCAL (2) and FULL (3) call scrub + let scrub_result = if mode == 1 { + v.scrub_index() + } else { + v.scrub() + }; + match scrub_result { + Ok((files, broken)) => { + total_files += files; + if !broken.is_empty() { + broken_vids.push(*vid); + broken_volume_ids.push(vid.0); + for msg in broken { + details.push(format!("vol {}: {}", vid.0, msg)); + } + } + } + Err(e) => { + total_files += v.file_count().max(0) as u64; + broken_vids.push(*vid); + broken_volume_ids.push(vid.0); + details.push(format!("vol {}: scrub error: {}", vid.0, e)); + } + } + } + } // store lock dropped here + + // Match Go: if mark_broken_volumes_readonly, call makeVolumeReadonly on each broken volume. + // Collect errors via errors.Join semantics (return joined error if any fail). + if req.mark_broken_volumes_readonly { + let mut errs: Vec = Vec::new(); + for vid in &broken_vids { + match self.make_volume_readonly(*vid, true).await { + Ok(()) => { + details.push(format!("volume {} is now read-only", vid.0)); + } + Err(e) => { + errs.push(e.message().to_string()); + details.push(e.message().to_string()); + } + } + } + if !errs.is_empty() { + return Err(Status::internal(errs.join("\n"))); + } + } + + Ok(Response::new(volume_server_pb::ScrubVolumeResponse { + total_volumes, + total_files, + broken_volume_ids, + details, + })) + } + + async fn scrub_ec_volume( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + // Validate mode + let mode = req.mode; + match mode { + 1 | 2 | 3 => {} // INDEX=1, FULL=2, LOCAL=3 + _ => { + return Err(Status::invalid_argument(format!( + "unsupported EC volume scrub mode {}", + mode + ))) + } + } + + let store = self.state.store.read().unwrap(); + let vids: Vec = if req.volume_ids.is_empty() { + store + .locations + .iter() + .flat_map(|loc| loc.ec_volumes().map(|(vid, _)| *vid)) + .collect() + } else { + req.volume_ids.iter().map(|&id| VolumeId(id)).collect() + }; + + let mut total_volumes: u64 = 0; + let mut total_files: u64 = 0; + let mut broken_volume_ids: Vec = Vec::new(); + let mut broken_shard_infos: Vec = Vec::new(); + let mut details: Vec = Vec::new(); + + for vid in &vids { + let ecv = store + .find_ec_volume(*vid) + .ok_or_else(|| Status::not_found(format!("EC volume id {} not found", vid.0)))?; + let collection = ecv.collection.clone(); + + match mode { + 1 => { + // INDEX mode: check ecx index integrity only, no shard verification + // Matches Go's v.ScrubIndex() → idx.CheckIndexFile() + let (count, errs) = ecv.scrub_index(); + total_volumes += 1; + total_files += count; + if !errs.is_empty() { + broken_volume_ids.push(vid.0); + for msg in errs { + details.push(format!("ecvol {}: {}", vid.0, msg)); + } + } + } + 2 | 3 => { + // LOCAL (2) / FULL (3): verify EC shard data + let files = ecv.walk_ecx_stats().map(|(f, _, _)| f).unwrap_or(0); + + let dir = store + .find_ec_dir(*vid, &collection) + .unwrap_or_else(|| String::from("")); + if dir.is_empty() { + continue; + } + + total_volumes += 1; + total_files += files; + let (data_shards, parity_shards) = + crate::storage::erasure_coding::ec_volume::read_ec_shard_config( + &dir, + &collection, + *vid, + ); + + match crate::storage::erasure_coding::ec_encoder::verify_ec_shards( + &dir, + &collection, + *vid, + data_shards as usize, + parity_shards as usize, + ) { + Ok((broken, msgs)) => { + if !broken.is_empty() { + broken_volume_ids.push(vid.0); + for b in broken { + broken_shard_infos.push(volume_server_pb::EcShardInfo { + volume_id: vid.0, + collection: collection.clone(), + shard_id: b, + ..Default::default() + }); + } + } + for msg in msgs { + details.push(format!("ecvol {}: {}", vid.0, msg)); + } + } + Err(e) => { + broken_volume_ids.push(vid.0); + details.push(format!("ecvol {}: scrub error: {}", vid.0, e)); + } + } + } + _ => unreachable!(), // validated above + } + } + + Ok(Response::new(volume_server_pb::ScrubEcVolumeResponse { + total_volumes, + total_files, + broken_volume_ids, + broken_shard_infos, + details, + })) + } + + type QueryStream = BoxStream; + async fn query( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let mut stripes: Vec> = Vec::new(); + + for fid_str in &req.from_file_ids { + let file_id = needle::FileId::parse(fid_str).map_err(|e| Status::internal(e))?; + + let mut n = Needle { + id: file_id.key, + cookie: file_id.cookie, + ..Needle::default() + }; + let original_cookie = n.cookie; + + let store = self.state.store.read().unwrap(); + store + .read_volume_needle(file_id.volume_id, &mut n) + .map_err(|e| Status::internal(e.to_string()))?; + drop(store); + + // Cookie mismatch: log and return empty stream (matching Go behavior where err is nil) + if n.cookie != original_cookie { + tracing::info!( + "volume query failed to read fid cookie {}: cookie mismatch", + fid_str + ); + let stream = tokio_stream::iter(stripes); + return Ok(Response::new(Box::pin(stream))); + } + + let input = req.input_serialization.as_ref(); + + // CSV input: no output (Go does nothing for CSV) + if input.map_or(false, |i| i.csv_input.is_some()) { + // No stripes emitted for CSV + continue; + } + + // JSON input: process lines + if input.map_or(false, |i| i.json_input.is_some()) { + let filter = req.filter.as_ref(); + let data_str = String::from_utf8_lossy(&n.data); + let mut records: Vec = Vec::new(); + + for line in data_str.lines() { + if line.trim().is_empty() { + continue; + } + let parsed: serde_json::Value = match serde_json::from_str(line) { + Ok(v) => v, + Err(_) => continue, + }; + + // Apply filter + if let Some(f) = filter { + if !f.field.is_empty() && !f.operand.is_empty() { + let field_val = &parsed[&f.field]; + let pass = match f.operand.as_str() { + ">" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv > tv + } else { + false + } + } + ">=" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv >= tv + } else { + false + } + } + "<" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv < tv + } else { + false + } + } + "<=" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv <= tv + } else { + false + } + } + "=" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv == tv + } else { + field_val.as_str().map_or(false, |s| s == f.value) + } + } + "!=" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv != tv + } else { + field_val.as_str().map_or(true, |s| s != f.value) + } + } + _ => true, + }; + if !pass { + continue; + } + } + } + + // Build output record: {selection:value,...} (Go's ToJson format — unquoted keys) + records.push(b'{'); + for (i, sel) in req.selections.iter().enumerate() { + if i > 0 { + records.push(b','); + } + records.extend_from_slice(sel.as_bytes()); + records.push(b':'); + let val = &parsed[sel]; + let raw = if val.is_null() { + "null".to_string() + } else { + // Use the raw JSON representation + val.to_string() + }; + records.extend_from_slice(raw.as_bytes()); + } + records.push(b'}'); + } + + stripes.push(Ok(volume_server_pb::QueriedStripe { records })); + } + } + + let stream = tokio_stream::iter(stripes); + Ok(Response::new(Box::pin(stream))) + } + + async fn volume_needle_status( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let needle_id = NeedleId(req.needle_id); + + let store = self.state.store.read().unwrap(); + + // Try normal volume first + if let Some(_) = store.find_volume(vid) { + let mut n = Needle { + id: needle_id, + ..Needle::default() + }; + match store.read_volume_needle(vid, &mut n) { + Ok(_) => { + let ttl_str = n.ttl.as_ref().map_or(String::new(), |t| t.to_string()); + return Ok(Response::new( + volume_server_pb::VolumeNeedleStatusResponse { + needle_id: n.id.0, + cookie: n.cookie.0, + size: n.size.0 as u32, + last_modified: n.last_modified, + crc: n.checksum.0, + ttl: ttl_str, + }, + )); + } + Err(_) => return Err(Status::not_found(format!("needle not found {}", needle_id))), + } + } + + // Fall back to EC shards — read full needle from local shards + if let Some(ec_vol) = store.find_ec_volume(vid) { + match ec_vol.read_ec_shard_needle(needle_id) { + Ok(Some(n)) => { + let ttl_str = match &n.ttl { + Some(t) if n.has_ttl() => t.to_string(), + _ => String::new(), + }; + return Ok(Response::new( + volume_server_pb::VolumeNeedleStatusResponse { + needle_id: n.id.0, + cookie: n.cookie.0, + size: n.size.0 as u32, + last_modified: n.last_modified, + crc: n.checksum.0, + ttl: ttl_str, + }, + )); + } + Ok(None) => { + return Err(Status::not_found(format!("needle not found {}", needle_id))); + } + Err(e) => { + return Err(Status::internal(format!( + "read ec shard needle {} from volume {}: {}", + needle_id, vid, e + ))); + } + } + } + + Err(Status::not_found(format!("volume not found {}", vid))) + } + + async fn ping( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let now_ns = || { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as i64 + }; + + let start = now_ns(); + + // Route ping based on target type (matches Go's volume_grpc_admin.go Ping) + let remote_time_ns = if req.target_type == "volumeServer" { + match ping_volume_server_target(&req.target, self.state.outgoing_grpc_tls.as_ref()) + .await + { + Ok(t) => t, + Err(e) => { + return Err(Status::internal(format!( + "ping {} {}: {}", + req.target_type, req.target, e + ))) + } + } + } else if req.target_type == "master" { + // Connect to target master and call its Ping RPC + match ping_master_target(&req.target, self.state.outgoing_grpc_tls.as_ref()).await { + Ok(t) => t, + Err(e) => { + return Err(Status::internal(format!( + "ping {} {}: {}", + req.target_type, req.target, e + ))) + } + } + } else if req.target_type == "filer" { + match ping_filer_target(&req.target, self.state.outgoing_grpc_tls.as_ref()).await { + Ok(t) => t, + Err(e) => { + return Err(Status::internal(format!( + "ping {} {}: {}", + req.target_type, req.target, e + ))) + } + } + } else { + // Unknown target type → return 0 + 0 + }; + + let stop = now_ns(); + Ok(Response::new(volume_server_pb::PingResponse { + start_time_ns: start, + remote_time_ns, + stop_time_ns: stop, + })) + } +} + +/// Build a gRPC endpoint from a SeaweedFS server address. +fn to_grpc_endpoint( + target: &str, + tls: Option<&super::grpc_client::OutgoingGrpcTlsConfig>, +) -> Result { + let grpc_host_port = parse_grpc_address(target)?; + build_grpc_endpoint(&grpc_host_port, tls).map_err(|e| e.to_string()) +} + +/// Ping a remote volume server target by actually calling its Ping RPC (matches Go behavior). +async fn ping_volume_server_target( + target: &str, + tls: Option<&super::grpc_client::OutgoingGrpcTlsConfig>, +) -> Result { + let endpoint = to_grpc_endpoint(target, tls)?; + let channel = tokio::time::timeout(std::time::Duration::from_secs(5), endpoint.connect()) + .await + .map_err(|_| "connection timeout".to_string())? + .map_err(|e| e.to_string())?; + + let mut client = volume_server_pb::volume_server_client::VolumeServerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + let resp = client + .ping(volume_server_pb::PingRequest { + target: String::new(), + target_type: String::new(), + }) + .await + .map_err(|e| e.to_string())?; + Ok(resp.into_inner().start_time_ns) +} + +/// Ping a remote master target by actually calling its Ping RPC (matches Go behavior). +async fn ping_master_target( + target: &str, + tls: Option<&super::grpc_client::OutgoingGrpcTlsConfig>, +) -> Result { + let endpoint = to_grpc_endpoint(target, tls)?; + let channel = tokio::time::timeout(std::time::Duration::from_secs(5), endpoint.connect()) + .await + .map_err(|_| "connection timeout".to_string())? + .map_err(|e| e.to_string())?; + + let mut client = master_pb::seaweed_client::SeaweedClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + let resp = client + .ping(master_pb::PingRequest { + target: String::new(), + target_type: String::new(), + }) + .await + .map_err(|e| e.to_string())?; + Ok(resp.into_inner().start_time_ns) +} + +/// Ping a remote filer target by calling its Ping RPC (matches Go behavior). +async fn ping_filer_target( + target: &str, + tls: Option<&super::grpc_client::OutgoingGrpcTlsConfig>, +) -> Result { + let endpoint = to_grpc_endpoint(target, tls)?; + let channel = tokio::time::timeout(std::time::Duration::from_secs(5), endpoint.connect()) + .await + .map_err(|_| "connection timeout".to_string())? + .map_err(|e| e.to_string())?; + + let mut client = filer_pb::seaweed_filer_client::SeaweedFilerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + let resp = client + .ping(filer_pb::PingRequest::default()) + .await + .map_err(|e| e.to_string())?; + Ok(resp.into_inner().start_time_ns) +} + +/// Parse a SeaweedFS server address ("ip:port.grpcPort" or "ip:port") into a gRPC address. +fn parse_grpc_address(source: &str) -> Result { + if let Some(colon_idx) = source.rfind(':') { + let port_part = &source[colon_idx + 1..]; + if let Some(dot_idx) = port_part.rfind('.') { + // Format: "ip:port.grpcPort" + let host = &source[..colon_idx]; + let grpc_port = &port_part[dot_idx + 1..]; + grpc_port + .parse::() + .map_err(|e| format!("invalid grpc port: {}", e))?; + return Ok(format!("{}:{}", host, grpc_port)); + } + // Format: "ip:port" → grpc = port + 10000 + let port: u16 = port_part + .parse() + .map_err(|e| format!("invalid port: {}", e))?; + let grpc_port = port as u32 + 10000; + let host = &source[..colon_idx]; + return Ok(format!("{}:{}", host, grpc_port)); + } + Err(format!("cannot parse address: {}", source)) +} + +/// Set the modification time of a file from nanoseconds since Unix epoch. +fn set_file_mtime(path: &str, modified_ts_ns: i64) { + use std::time::{Duration, SystemTime}; + let ts = if modified_ts_ns >= 0 { + SystemTime::UNIX_EPOCH + Duration::from_nanos(modified_ts_ns as u64) + } else { + SystemTime::UNIX_EPOCH + }; + if let Ok(file) = std::fs::File::open(path) { + let ft = std::fs::FileTimes::new().set_accessed(ts).set_modified(ts); + let _ = file.set_times(ft); + } +} + +/// Copy a file from a remote volume server via CopyFile streaming RPC. +/// Returns the modified_ts_ns received from the source. +async fn copy_file_from_source( + client: &mut volume_server_pb::volume_server_client::VolumeServerClient, + is_ec_volume: bool, + collection: &str, + volume_id: u32, + compaction_revision: u32, + stop_offset: u64, + dest_path: &str, + ext: &str, + is_append: bool, + ignore_source_not_found: bool, + progress_tx: Option< + &tokio::sync::mpsc::Sender>, + >, + next_report_target: &mut i64, + report_interval: i64, + throttler: &mut WriteThrottler, +) -> Result +where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: http_body::Body + Send + 'static, + ::Error: Into + Send, +{ + let copy_req = volume_server_pb::CopyFileRequest { + volume_id, + ext: ext.to_string(), + compaction_revision, + stop_offset, + collection: collection.to_string(), + is_ec_volume, + ignore_source_file_not_found: ignore_source_not_found, + }; + + let mut stream = client + .copy_file(copy_req) + .await + .map_err(|e| { + format!( + "failed to start copying volume {} {} file: {}", + volume_id, ext, e + ) + })? + .into_inner(); + + let mut file = if is_append { + std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(dest_path) + .map_err(|e| format!("open file {}: {}", dest_path, e))? + } else { + std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(dest_path) + .map_err(|e| format!("open file {}: {}", dest_path, e))? + }; + + let mut progressed_bytes: i64 = 0; + let mut modified_ts_ns: i64 = 0; + + while let Some(resp) = stream + .message() + .await + .map_err(|e| format!("receiving {}: {}", dest_path, e))? + { + if resp.modified_ts_ns != 0 { + modified_ts_ns = resp.modified_ts_ns; + } + if !resp.file_content.is_empty() { + use std::io::Write; + file.write_all(&resp.file_content) + .map_err(|e| format!("write file {}: {}", dest_path, e))?; + progressed_bytes += resp.file_content.len() as i64; + throttler + .maybe_slowdown(resp.file_content.len() as i64) + .await; + + if let Some(tx) = progress_tx { + if progressed_bytes > *next_report_target { + let _ = tx + .send(Ok(volume_server_pb::VolumeCopyResponse { + last_append_at_ns: 0, + processed_bytes: progressed_bytes, + })) + .await; + *next_report_target = progressed_bytes + report_interval; + } + } + } + } + + // If source file didn't exist (no modifiedTsNs received), remove empty file + // Go only removes when !isAppend + if modified_ts_ns == 0 && !is_append { + let _ = std::fs::remove_file(dest_path); + } + + Ok(modified_ts_ns) +} + +/// Verify that a copied file has the expected size. +fn check_copy_file_size(path: &str, expected: u64) -> Result<(), Status> { + match std::fs::metadata(path) { + Ok(meta) => { + if meta.len() != expected { + Err(Status::internal(format!( + "file {} size [{}] is not same as origin file size [{}]", + path, + meta.len(), + expected + ))) + } else { + Ok(()) + } + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound && expected == 0 => Ok(()), + Err(e) => Err(Status::internal(format!( + "stat file {} failed: {}", + path, e + ))), + } +} + +/// Find the last append timestamp from copied .idx and .dat files. +/// Go returns (0, nil) for versions < Version3 since timestamps only exist in V3. +fn find_last_append_at_ns(idx_path: &str, dat_path: &str, version: u32) -> Option { + // Only Version3 has the append timestamp in the needle tail + if version < VERSION_3.0 as u32 { + return None; + } + use std::io::{Read, Seek, SeekFrom}; + + let mut idx_file = std::fs::File::open(idx_path).ok()?; + let idx_size = idx_file.metadata().ok()?.len(); + if idx_size == 0 || idx_size % (NEEDLE_MAP_ENTRY_SIZE as u64) != 0 { + return None; + } + + // Read the last index entry + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_file + .seek(SeekFrom::End(-(NEEDLE_MAP_ENTRY_SIZE as i64))) + .ok()?; + idx_file.read_exact(&mut buf).ok()?; + + let (_key, offset, _size) = idx_entry_from_bytes(&buf); + if offset.is_zero() { + return None; + } + + // Read needle header from .dat to get the append timestamp + let mut dat_file = std::fs::File::open(dat_path).ok()?; + let actual_offset = offset.to_actual_offset(); + + // Skip to the needle at the given offset, read header to get size + dat_file.seek(SeekFrom::Start(actual_offset as u64)).ok()?; + + // Read cookie (4) + id (8) + size (4) = 16 bytes header + let mut header = [0u8; 16]; + dat_file.read_exact(&mut header).ok()?; + let needle_size = i32::from_be_bytes([header[12], header[13], header[14], header[15]]); + if needle_size <= 0 { + return None; + } + + // Seek to tail: offset + 16 (header) + size -> checksum (4) + timestamp (8) + let tail_offset = actual_offset as u64 + 16 + needle_size as u64; + dat_file.seek(SeekFrom::Start(tail_offset)).ok()?; + + let mut tail = [0u8; 12]; // 4 bytes checksum + 8 bytes timestamp + dat_file.read_exact(&mut tail).ok()?; + + // Timestamp is the last 8 bytes, big-endian + let ts = u64::from_be_bytes([ + tail[4], tail[5], tail[6], tail[7], tail[8], tail[9], tail[10], tail[11], + ]); + if ts > 0 { + Some(ts) + } else { + None + } +} + +/// Get disk usage (total, free) in bytes for the given path. +fn get_disk_usage(path: &str) -> (u64, u64) { + use sysinfo::Disks; + let disks = Disks::new_with_refreshed_list(); + let path = std::path::Path::new(path); + // Find the disk that contains this path (longest mount point prefix match) + let mut best: Option<&sysinfo::Disk> = None; + let mut best_len = 0; + for disk in disks.list() { + let mount = disk.mount_point(); + if path.starts_with(mount) && mount.as_os_str().len() > best_len { + best_len = mount.as_os_str().len(); + best = Some(disk); + } + } + match best { + Some(disk) => (disk.total_space(), disk.available_space()), + None => (0, 0), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::MinFreeSpace; + use crate::remote_storage::s3_tier::{global_s3_tier_registry, S3TierBackend, S3TierConfig}; + use crate::security::{Guard, SigningKey}; + use crate::storage::needle_map::NeedleMapKind; + use crate::storage::store::Store; + use std::sync::RwLock; + use tempfile::TempDir; + use tokio_stream::StreamExt; + + #[test] + fn test_parse_grpc_address_with_explicit_grpc_port() { + // Format: "ip:port.grpcPort" — used by SeaweedFS for source_data_node + let result = parse_grpc_address("192.168.1.66:8080.18080").unwrap(); + assert_eq!(result, "192.168.1.66:18080"); + } + + #[test] + fn test_parse_grpc_address_with_implicit_grpc_port() { + // Format: "ip:port" — grpc port = port + 10000 + let result = parse_grpc_address("192.168.1.66:8080").unwrap(); + assert_eq!(result, "192.168.1.66:18080"); + } + + #[test] + fn test_parse_grpc_address_localhost() { + let result = parse_grpc_address("localhost:9333").unwrap(); + assert_eq!(result, "localhost:19333"); + } + + #[test] + fn test_parse_grpc_address_with_ipv4_dots() { + // Regression: naive split on '.' breaks on IP addresses + let result = parse_grpc_address("10.0.0.1:8080.18080").unwrap(); + assert_eq!(result, "10.0.0.1:18080"); + + let result = parse_grpc_address("10.0.0.1:8080").unwrap(); + assert_eq!(result, "10.0.0.1:18080"); + } + + #[test] + fn test_parse_grpc_address_invalid() { + assert!(parse_grpc_address("no-colon").is_err()); + } + + #[test] + fn test_volume_is_remote_only_requires_missing_local_dat_file() { + let temp_dir = tempfile::tempdir().unwrap(); + let dat_path = temp_dir.path().join("1.dat"); + std::fs::write(&dat_path, b"dat").unwrap(); + + assert!(!volume_is_remote_only(dat_path.to_str().unwrap(), true)); + assert!(!volume_is_remote_only(dat_path.to_str().unwrap(), false)); + + std::fs::remove_file(&dat_path).unwrap(); + + assert!(volume_is_remote_only(dat_path.to_str().unwrap(), true)); + assert!(!volume_is_remote_only(dat_path.to_str().unwrap(), false)); + } + + fn spawn_fake_s3_server(body: Vec) -> (String, tokio::sync::oneshot::Sender<()>) { + use axum::http::{header, HeaderMap, HeaderValue, StatusCode}; + use axum::routing::any; + use axum::Router; + + let body = Arc::new(body); + let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + listener.set_nonblocking(true).unwrap(); + let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>(); + + std::thread::spawn(move || { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + runtime.block_on(async move { + let app = Router::new().fallback(any(move |headers: HeaderMap| { + let body = body.clone(); + async move { + let bytes = body.as_ref(); + if let Some(range) = headers + .get(header::RANGE) + .and_then(|value| value.to_str().ok()) + { + if let Some(range_value) = range.strip_prefix("bytes=") { + let mut parts = range_value.splitn(2, '-'); + let start = parts + .next() + .and_then(|value| value.parse::().ok()) + .unwrap_or(0); + let end = parts + .next() + .and_then(|value| value.parse::().ok()) + .unwrap_or_else(|| bytes.len().saturating_sub(1)); + let start = start.min(bytes.len()); + let end = end.min(bytes.len().saturating_sub(1)); + let payload = if start > end || start >= bytes.len() { + Vec::new() + } else { + bytes[start..=end].to_vec() + }; + let mut response_headers = HeaderMap::new(); + response_headers.insert( + header::CONTENT_RANGE, + HeaderValue::from_str(&format!( + "bytes {}-{}/{}", + start, + end, + bytes.len() + )) + .unwrap(), + ); + response_headers.insert( + header::CONTENT_LENGTH, + HeaderValue::from_str(&payload.len().to_string()).unwrap(), + ); + return (StatusCode::PARTIAL_CONTENT, response_headers, payload); + } + } + + let mut response_headers = HeaderMap::new(); + response_headers.insert( + header::CONTENT_LENGTH, + HeaderValue::from_str(&bytes.len().to_string()).unwrap(), + ); + (StatusCode::OK, response_headers, bytes.to_vec()) + } + })); + + let listener = tokio::net::TcpListener::from_std(listener).unwrap(); + axum::serve(listener, app) + .with_graceful_shutdown(async move { + let _ = shutdown_rx.await; + }) + .await + .unwrap(); + }); + }); + + (format!("http://{}", addr), shutdown_tx) + } + + fn make_remote_only_service() -> ( + VolumeGrpcService, + TempDir, + tokio::sync::oneshot::Sender<()>, + Vec, + u64, + ) { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let (dat_bytes, super_block_size) = { + let mut volume = crate::storage::volume::Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + let mut needle = Needle { + id: NeedleId(7), + cookie: Cookie(0x7788), + data: b"remote-incremental-copy".to_vec(), + data_size: "remote-incremental-copy".len() as u32, + ..Needle::default() + }; + volume.write_needle(&mut needle, true).unwrap(); + volume.sync_to_disk().unwrap(); + ( + std::fs::read(volume.file_name(".dat")).unwrap(), + volume.super_block.block_size() as u64, + ) + }; + + let dat_path = format!("{}/1.dat", dir); + std::fs::remove_file(&dat_path).unwrap(); + + let (endpoint, shutdown_tx) = spawn_fake_s3_server(dat_bytes.clone()); + global_s3_tier_registry().write().unwrap().clear(); + let tier_config = S3TierConfig { + access_key: "access".to_string(), + secret_key: "secret".to_string(), + region: "us-east-1".to_string(), + bucket: "bucket-a".to_string(), + endpoint, + storage_class: "STANDARD".to_string(), + force_path_style: true, + }; + { + let mut registry = global_s3_tier_registry().write().unwrap(); + registry.register("s3.default".to_string(), S3TierBackend::new(&tier_config)); + registry.register("s3".to_string(), S3TierBackend::new(&tier_config)); + } + + let vif = crate::storage::volume::VifVolumeInfo { + files: vec![crate::storage::volume::VifRemoteFile { + backend_type: "s3".to_string(), + backend_id: "default".to_string(), + key: "remote-key".to_string(), + offset: 0, + file_size: dat_bytes.len() as u64, + modified_time: 123, + extension: ".dat".to_string(), + }], + version: Version::current().0 as u32, + bytes_offset: crate::storage::types::OFFSET_SIZE as u32, + dat_file_size: dat_bytes.len() as i64, + ..Default::default() + }; + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let state = Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(Guard::new( + &[], + SigningKey(vec![]), + 0, + SigningKey(vec![]), + 0, + )), + is_stopping: RwLock::new(false), + maintenance: std::sync::atomic::AtomicBool::new(false), + state_version: std::sync::atomic::AtomicU32::new(0), + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(60), + inflight_download_data_timeout: std::time::Duration::from_secs(60), + inflight_upload_bytes: std::sync::atomic::AtomicI64::new(0), + inflight_download_bytes: std::sync::atomic::AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: String::new(), + rack: String::new(), + file_size_limit_bytes: 0, + maintenance_byte_per_second: 0, + is_heartbeating: std::sync::atomic::AtomicBool::new(true), + has_master: false, + pre_stop_seconds: 0, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new( + crate::remote_storage::s3_tier::S3TierRegistry::new(), + ), + read_mode: crate::config::ReadMode::Local, + master_url: String::new(), + master_urls: Vec::new(), + self_url: String::new(), + http_client: reqwest::Client::new(), + outgoing_http_scheme: "http".to_string(), + outgoing_grpc_tls: None, + metrics_runtime: std::sync::RwLock::new( + crate::server::volume_server::RuntimeMetricsConfig::default(), + ), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: false, + has_slow_read: false, + read_buffer_size_bytes: 1024 * 1024, + security_file: String::new(), + cli_white_list: vec![], + state_file_path: String::new(), + }); + + ( + VolumeGrpcService { state }, + tmp, + shutdown_tx, + dat_bytes, + super_block_size, + ) + } + + fn make_local_service_with_volume( + collection: &str, + ttl: Option, + ) -> (VolumeGrpcService, TempDir) { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store + .add_volume( + VolumeId(1), + collection, + None, + ttl, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + { + let (_, volume) = store.find_volume_mut(VolumeId(1)).unwrap(); + let mut needle = Needle { + id: NeedleId(11), + cookie: Cookie(0x3344), + data: b"ec-generate".to_vec(), + data_size: b"ec-generate".len() as u32, + ..Needle::default() + }; + volume.write_needle(&mut needle, true).unwrap(); + volume.sync_to_disk().unwrap(); + } + + let state = Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(Guard::new( + &[], + SigningKey(vec![]), + 0, + SigningKey(vec![]), + 0, + )), + is_stopping: RwLock::new(false), + maintenance: std::sync::atomic::AtomicBool::new(false), + state_version: std::sync::atomic::AtomicU32::new(0), + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(60), + inflight_download_data_timeout: std::time::Duration::from_secs(60), + inflight_upload_bytes: std::sync::atomic::AtomicI64::new(0), + inflight_download_bytes: std::sync::atomic::AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: String::new(), + rack: String::new(), + file_size_limit_bytes: 0, + maintenance_byte_per_second: 0, + is_heartbeating: std::sync::atomic::AtomicBool::new(true), + has_master: false, + pre_stop_seconds: 0, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new( + crate::remote_storage::s3_tier::S3TierRegistry::new(), + ), + read_mode: crate::config::ReadMode::Local, + master_url: String::new(), + master_urls: Vec::new(), + self_url: String::new(), + http_client: reqwest::Client::new(), + outgoing_http_scheme: "http".to_string(), + outgoing_grpc_tls: None, + metrics_runtime: std::sync::RwLock::new( + crate::server::volume_server::RuntimeMetricsConfig::default(), + ), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: false, + has_slow_read: false, + read_buffer_size_bytes: 1024 * 1024, + security_file: String::new(), + cli_white_list: vec![], + state_file_path: String::new(), + }); + + (VolumeGrpcService { state }, tmp) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_volume_incremental_copy_streams_remote_only_volume_data() { + let (service, _tmp, shutdown_tx, dat_bytes, super_block_size) = make_remote_only_service(); + + let response = service + .volume_incremental_copy(Request::new( + volume_server_pb::VolumeIncrementalCopyRequest { + volume_id: 1, + since_ns: 0, + }, + )) + .await + .unwrap(); + + let mut stream = response.into_inner(); + let mut copied = Vec::new(); + while let Some(message) = stream.next().await { + copied.extend_from_slice(&message.unwrap().file_content); + } + + assert_eq!(copied, dat_bytes[super_block_size as usize..]); + + let _ = shutdown_tx.send(()); + global_s3_tier_registry().write().unwrap().clear(); + } + + #[tokio::test] + async fn test_volume_ec_shards_generate_persists_expire_at_sec() { + let ttl = crate::storage::needle::ttl::TTL::read("3m").unwrap(); + let (service, tmp) = make_local_service_with_volume("ttl", Some(ttl)); + let before = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(); + + service + .volume_ec_shards_generate(Request::new( + volume_server_pb::VolumeEcShardsGenerateRequest { + volume_id: 1, + collection: "ttl".to_string(), + }, + )) + .await + .unwrap(); + + let vif_path = tmp.path().join("ttl_1.vif"); + let vif: crate::storage::volume::VifVolumeInfo = + serde_json::from_str(&std::fs::read_to_string(vif_path).unwrap()).unwrap(); + assert!(vif.expire_at_sec >= before + ttl.to_seconds()); + assert!(vif.expire_at_sec <= before + ttl.to_seconds() + 5); + } +} diff --git a/seaweed-volume/src/server/handlers.rs b/seaweed-volume/src/server/handlers.rs new file mode 100644 index 000000000..83e43fb67 --- /dev/null +++ b/seaweed-volume/src/server/handlers.rs @@ -0,0 +1,3913 @@ +//! HTTP handlers for volume server operations. +//! +//! Implements GET/HEAD (read), POST/PUT (write), DELETE, /status, /healthz. +//! Matches Go's volume_server_handlers_read.go, volume_server_handlers_write.go, +//! volume_server_handlers_admin.go. + +use std::collections::HashMap; +use std::future::Future; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +use axum::body::Body; +use axum::extract::{Path, Query, State}; +use axum::http::{header, HeaderMap, Method, Request, StatusCode}; +use axum::response::{IntoResponse, Response}; +use serde::{Deserialize, Serialize}; + +use super::grpc_client::{build_grpc_endpoint, GRPC_MAX_MESSAGE_SIZE}; +use super::volume_server::{normalize_outgoing_http_url, VolumeServerState}; +use crate::config::ReadMode; +use crate::metrics; +use crate::pb::volume_server_pb; +use crate::storage::needle::needle::Needle; +use crate::storage::types::*; + +// ============================================================================ +// Inflight Throttle Guard +// ============================================================================ + +/// RAII guard that subtracts bytes from an atomic counter and notifies waiters on drop. +struct InflightGuard<'a> { + counter: &'a std::sync::atomic::AtomicI64, + bytes: i64, + notify: &'a tokio::sync::Notify, + metric: &'a prometheus::IntGauge, +} + +impl<'a> Drop for InflightGuard<'a> { + fn drop(&mut self) { + let new_val = self.counter.fetch_sub(self.bytes, Ordering::Relaxed) - self.bytes; + self.metric.set(new_val); + self.notify.notify_waiters(); + } +} + +/// Body wrapper that tracks download inflight bytes and releases them when dropped. +struct TrackedBody { + data: Vec, + state: Arc, + bytes: i64, +} + +impl http_body::Body for TrackedBody { + type Data = bytes::Bytes; + type Error = std::convert::Infallible; + + fn poll_frame( + mut self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll, Self::Error>>> { + if self.data.is_empty() { + return std::task::Poll::Ready(None); + } + let data = std::mem::take(&mut self.data); + std::task::Poll::Ready(Some(Ok(http_body::Frame::data(bytes::Bytes::from(data))))) + } + + fn size_hint(&self) -> http_body::SizeHint { + http_body::SizeHint::with_exact(self.data.len() as u64) + } +} + +impl Drop for TrackedBody { + fn drop(&mut self) { + let new_val = self + .state + .inflight_download_bytes + .fetch_sub(self.bytes, Ordering::Relaxed) + - self.bytes; + metrics::INFLIGHT_DOWNLOAD_SIZE.set(new_val); + self.state.download_notify.notify_waiters(); + } +} + +fn finalize_bytes_response( + status: StatusCode, + headers: HeaderMap, + data: Vec, + state: Option>, +) -> Response { + if let Some(state) = state { + let data_len = data.len() as i64; + let new_val = state + .inflight_download_bytes + .fetch_add(data_len, Ordering::Relaxed) + + data_len; + metrics::INFLIGHT_DOWNLOAD_SIZE.set(new_val); + let tracked_body = TrackedBody { + data, + state, + bytes: data_len, + }; + let body = Body::new(tracked_body); + let mut resp = Response::new(body); + *resp.status_mut() = status; + *resp.headers_mut() = headers; + resp + } else { + (status, headers, data).into_response() + } +} + +// ============================================================================ +// Streaming Body for Large Files +// ============================================================================ + +/// Threshold in bytes above which we stream needle data instead of buffering. +const STREAMING_THRESHOLD: u32 = 1024 * 1024; // 1 MB + +/// Default chunk size for streaming reads from the dat file. +const DEFAULT_STREAMING_CHUNK_SIZE: usize = 64 * 1024; // 64 KB + +/// A body that streams needle data from the dat file in chunks using pread, +/// avoiding loading the entire payload into memory at once. +struct StreamingBody { + source: crate::storage::volume::NeedleStreamSource, + data_offset: u64, + data_size: u32, + pos: usize, + chunk_size: usize, + data_file_access_control: Arc, + hold_read_lock_for_stream: bool, + _held_read_lease: Option, + /// Pending result from spawn_blocking, polled to completion. + pending: Option>>, + /// For download throttling — released on drop. + state: Option>, + tracked_bytes: i64, + /// Server state used to re-lookup needle offset if compaction occurs during streaming. + server_state: Arc, + /// Volume ID for compaction-revision re-lookup. + volume_id: crate::storage::types::VolumeId, + /// Needle ID for compaction-revision re-lookup. + needle_id: crate::storage::types::NeedleId, + /// Compaction revision at the time of the initial read; if the volume's revision + /// changes between chunks, the needle may have moved and we must re-lookup its offset. + compaction_revision: u16, +} + +impl http_body::Body for StreamingBody { + type Data = bytes::Bytes; + type Error = std::io::Error; + + fn poll_frame( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll, Self::Error>>> { + loop { + // If we have a pending read, poll it + if let Some(ref mut handle) = self.pending { + match std::pin::Pin::new(handle).poll(cx) { + std::task::Poll::Pending => return std::task::Poll::Pending, + std::task::Poll::Ready(result) => { + self.pending = None; + match result { + Ok(Ok(chunk)) => { + let len = chunk.len(); + self.pos += len; + return std::task::Poll::Ready(Some(Ok(http_body::Frame::data( + chunk, + )))); + } + Ok(Err(e)) => return std::task::Poll::Ready(Some(Err(e))), + Err(e) => { + return std::task::Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::Other, + e, + )))) + } + } + } + } + } + + let total = self.data_size as usize; + if self.pos >= total { + return std::task::Poll::Ready(None); + } + + // Check if compaction has changed the needle's disk location (Go parity: + // readNeedleDataInto re-reads the needle offset when CompactionRevision changes). + let relookup_result = { + let store = self.server_state.store.read().unwrap(); + if let Some((_, vol)) = store.find_volume(self.volume_id) { + if vol.super_block.compaction_revision != self.compaction_revision { + // Compaction occurred — re-lookup the needle's data offset + Some(vol.re_lookup_needle_data_offset(self.needle_id)) + } else { + None + } + } else { + None + } + }; + if let Some(result) = relookup_result { + match result { + Ok((new_offset, new_rev)) => { + self.data_offset = new_offset; + self.compaction_revision = new_rev; + } + Err(_) => { + return std::task::Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::NotFound, + "needle not found after compaction", + )))); + } + } + } + + let chunk_len = std::cmp::min(self.chunk_size, total - self.pos); + let file_offset = self.data_offset + self.pos as u64; + + let source_clone = match self.source.clone_for_read() { + Ok(source) => source, + Err(e) => return std::task::Poll::Ready(Some(Err(e))), + }; + let data_file_access_control = self.data_file_access_control.clone(); + let hold_read_lock_for_stream = self.hold_read_lock_for_stream; + + let handle = tokio::task::spawn_blocking(move || { + let _lease = if hold_read_lock_for_stream { + None + } else { + Some(data_file_access_control.read_lock()) + }; + let mut buf = vec![0u8; chunk_len]; + source_clone.read_exact_at(&mut buf, file_offset)?; + Ok::(bytes::Bytes::from(buf)) + }); + + self.pending = Some(handle); + // Loop back to poll the newly created future + } + } +} + +impl Drop for StreamingBody { + fn drop(&mut self) { + if let Some(ref st) = self.state { + let new_val = st + .inflight_download_bytes + .fetch_sub(self.tracked_bytes, Ordering::Relaxed) + - self.tracked_bytes; + metrics::INFLIGHT_DOWNLOAD_SIZE.set(new_val); + st.download_notify.notify_waiters(); + } + } +} + +// ============================================================================ +// URL Parsing +// ============================================================================ + +/// Parse volume ID and file ID from URL path. +/// Supports: "vid,fid", "vid/fid", "vid,fid.ext", "vid/fid/filename.ext" +/// Extract the file_id string (e.g., "3,01637037d6") from a URL path for JWT validation. +fn extract_file_id(path: &str) -> String { + let path = path.trim_start_matches('/'); + // Strip extension and filename after second slash + if let Some(comma) = path.find(',') { + let after_comma = &path[comma + 1..]; + let fid_part = if let Some(slash) = after_comma.find('/') { + &after_comma[..slash] + } else if let Some(dot) = after_comma.rfind('.') { + &after_comma[..dot] + } else { + after_comma + }; + // Strip "_suffix" from fid (Go does this for filenames appended with underscore) + let fid_part = if let Some(underscore) = fid_part.rfind('_') { + &fid_part[..underscore] + } else { + fid_part + }; + format!("{},{}", &path[..comma], fid_part) + } else { + path.to_string() + } +} + +fn streaming_chunk_size(read_buffer_size_bytes: usize, data_size: usize) -> usize { + std::cmp::min( + read_buffer_size_bytes.max(DEFAULT_STREAMING_CHUNK_SIZE), + data_size.max(1), + ) +} + +fn parse_url_path(path: &str) -> Option<(VolumeId, NeedleId, Cookie)> { + let path = path.trim_start_matches('/'); + + // Try "vid,fid" or "vid/fid" or "vid/fid/filename" formats + let (vid_str, fid_part) = if let Some(pos) = path.find(',') { + (&path[..pos], &path[pos + 1..]) + } else if let Some(pos) = path.find('/') { + (&path[..pos], &path[pos + 1..]) + } else { + return None; + }; + + // For fid part, strip extension from the fid (not from filename) + // "vid,fid.ext" -> fid is before dot + // "vid/fid/filename.ext" -> fid is the part before the second slash + let fid_str = if let Some(slash_pos) = fid_part.find('/') { + // "fid/filename.ext" - fid is before the slash + &fid_part[..slash_pos] + } else if let Some(dot) = fid_part.rfind('.') { + // "fid.ext" - strip extension + &fid_part[..dot] + } else { + fid_part + }; + + let vid = VolumeId::parse(vid_str).ok()?; + let (needle_id, cookie) = + crate::storage::needle::needle::parse_needle_id_cookie(fid_str).ok()?; + + Some((vid, needle_id, cookie)) +} + +// ============================================================================ +// Volume Lookup + Proxy/Redirect +// ============================================================================ + +/// A volume location returned by master lookup. +#[derive(Clone, Debug, Deserialize)] +struct VolumeLocation { + url: String, + #[serde(rename = "publicUrl")] + public_url: String, + #[serde(rename = "grpcPort", default)] + grpc_port: u32, +} + +/// Master /dir/lookup response. +#[derive(Debug, Deserialize)] +struct LookupResult { + #[serde(default)] + locations: Option>, + #[serde(default)] + error: Option, +} + +/// Look up volume locations from the master via HTTP /dir/lookup. +async fn lookup_volume( + client: &reqwest::Client, + scheme: &str, + master_url: &str, + volume_id: u32, +) -> Result, String> { + let url = normalize_outgoing_http_url( + scheme, + &format!("{}/dir/lookup?volumeId={}", master_url, volume_id), + )?; + let resp = client + .get(&url) + .send() + .await + .map_err(|e| format!("lookup request failed: {}", e))?; + let result: LookupResult = resp + .json() + .await + .map_err(|e| format!("lookup parse failed: {}", e))?; + if let Some(err) = result.error { + if !err.is_empty() { + return Err(err); + } + } + Ok(result.locations.unwrap_or_default()) +} + +fn grpc_address_for_location(location: &VolumeLocation) -> Result { + let raw = location + .url + .trim_start_matches("http://") + .trim_start_matches("https://"); + + if location.grpc_port > 0 { + let (host, _) = raw + .rsplit_once(':') + .ok_or_else(|| format!("cannot parse address: {}", location.url))?; + return Ok(format!("{}:{}", host, location.grpc_port)); + } + + if let Some(colon_idx) = raw.rfind(':') { + let port_part = &raw[colon_idx + 1..]; + if let Some(dot_idx) = port_part.rfind('.') { + let host = &raw[..colon_idx]; + let grpc_port = &port_part[dot_idx + 1..]; + grpc_port + .parse::() + .map_err(|e| format!("invalid grpc port: {}", e))?; + return Ok(format!("{}:{}", host, grpc_port)); + } + + let port: u16 = port_part + .parse() + .map_err(|e| format!("invalid port: {}", e))?; + let host = &raw[..colon_idx]; + return Ok(format!("{}:{}", host, port as u32 + 10000)); + } + + Err(format!("cannot parse address: {}", location.url)) +} + +async fn batch_delete_file_ids( + state: &VolumeServerState, + file_ids: &[String], +) -> Result<(), String> { + let mut lookup_cache: HashMap> = HashMap::new(); + let mut server_to_file_ids: HashMap> = HashMap::new(); + + for file_id in file_ids { + let parsed = crate::storage::needle::needle::FileId::parse(file_id) + .map_err(|e| format!("chunk delete {}: {}", file_id, e))?; + let volume_id = parsed.volume_id.0; + + let locations = if let Some(locations) = lookup_cache.get(&volume_id) { + locations.clone() + } else { + let locations = lookup_volume( + &state.http_client, + &state.outgoing_http_scheme, + &state.master_url, + volume_id, + ) + .await + .map_err(|e| format!("chunk delete {}: {}", file_id, e))?; + if locations.is_empty() { + return Err(format!("chunk delete {}: file not found", file_id)); + } + lookup_cache.insert(volume_id, locations.clone()); + locations + }; + + for location in locations { + let grpc_addr = grpc_address_for_location(&location) + .map_err(|e| format!("chunk delete {}: {}", file_id, e))?; + server_to_file_ids + .entry(grpc_addr) + .or_default() + .push(file_id.clone()); + } + } + + for (grpc_addr, batch) in server_to_file_ids { + let endpoint = build_grpc_endpoint(&grpc_addr, state.outgoing_grpc_tls.as_ref()) + .map_err(|e| format!("batch delete {}: {}", grpc_addr, e))?; + let channel = endpoint + .connect() + .await + .map_err(|e| format!("batch delete {}: {}", grpc_addr, e))?; + let mut client = + volume_server_pb::volume_server_client::VolumeServerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + + let response = client + .batch_delete(volume_server_pb::BatchDeleteRequest { + file_ids: batch.clone(), + skip_cookie_check: true, + }) + .await + .map_err(|e| format!("batch delete {}: {}", grpc_addr, e))? + .into_inner(); + + for result in response.results { + if !result.error.is_empty() { + return Err(format!("chunk delete {}: {}", result.file_id, result.error)); + } + if result.status >= 400 { + return Err(format!( + "chunk delete {}: status {}", + result.file_id, result.status + )); + } + } + } + + Ok(()) +} + +/// Helper to synchronously replicate a request to peer volume servers. +async fn do_replicated_request( + state: &VolumeServerState, + vid: u32, + method: axum::http::Method, + path: &str, + query: &str, + headers: &axum::http::HeaderMap, + body: Option, +) -> Result<(), String> { + let locations = lookup_volume( + &state.http_client, + &state.outgoing_http_scheme, + &state.master_url, + vid, + ) + .await + .map_err(|e| format!("lookup volume failed: {}", e))?; + + let remote_locations: Vec<_> = locations + .into_iter() + .filter(|loc| loc.url != state.self_url && loc.public_url != state.self_url) + .collect(); + + if remote_locations.is_empty() { + return Ok(()); + } + + let new_query = if query.is_empty() { + String::from("type=replicate") + } else { + format!("{}&type=replicate", query) + }; + + let mut futures = Vec::new(); + for loc in remote_locations { + let url = normalize_outgoing_http_url( + &state.outgoing_http_scheme, + &format!("{}{}?{}", loc.url, path, new_query), + )?; + let client = state.http_client.clone(); + + let mut req_builder = client.request(method.clone(), &url); + + // Forward relevant headers + if let Some(ct) = headers.get(axum::http::header::CONTENT_TYPE) { + req_builder = req_builder.header(axum::http::header::CONTENT_TYPE, ct); + } + if let Some(ce) = headers.get(axum::http::header::CONTENT_ENCODING) { + req_builder = req_builder.header(axum::http::header::CONTENT_ENCODING, ce); + } + if let Some(md5) = headers.get("Content-MD5") { + req_builder = req_builder.header("Content-MD5", md5); + } + if let Some(auth) = headers.get(axum::http::header::AUTHORIZATION) { + req_builder = req_builder.header(axum::http::header::AUTHORIZATION, auth); + } + + if let Some(ref b) = body { + req_builder = req_builder.body(b.clone()); + } + + futures.push(async move { + match req_builder.send().await { + Ok(r) if r.status().is_success() => Ok(()), + Ok(r) => Err(format!("{} returned status {}", url, r.status())), + Err(e) => Err(format!("{} failed: {}", url, e)), + } + }); + } + + let results = futures::future::join_all(futures).await; + let mut errors = Vec::new(); + for res in results { + if let Err(e) = res { + errors.push(e); + } + } + + if !errors.is_empty() { + return Err(errors.join(", ")); + } + + Ok(()) +} + +/// Extracted request info needed for proxy/redirect (avoids borrowing Request across await). +struct ProxyRequestInfo { + original_headers: HeaderMap, + original_query: String, + path: String, + vid_str: String, + fid_str: String, +} + +fn build_proxy_request_info( + path: &str, + headers: &HeaderMap, + query_string: &str, +) -> Option { + let trimmed = path.trim_start_matches('/'); + let (vid_str, fid_str) = if let Some(pos) = trimmed.find(',') { + let raw_fid = &trimmed[pos + 1..]; + let fid = if let Some(slash) = raw_fid.find('/') { + &raw_fid[..slash] + } else if let Some(dot) = raw_fid.rfind('.') { + &raw_fid[..dot] + } else { + raw_fid + }; + (trimmed[..pos].to_string(), fid.to_string()) + } else if let Some(pos) = trimmed.find('/') { + let after = &trimmed[pos + 1..]; + let fid_part = if let Some(slash) = after.find('/') { + &after[..slash] + } else { + after + }; + (trimmed[..pos].to_string(), fid_part.to_string()) + } else { + return None; + }; + + Some(ProxyRequestInfo { + original_headers: headers.clone(), + original_query: query_string.to_string(), + path: path.to_string(), + vid_str, + fid_str, + }) +} + +/// Handle proxy or redirect for a non-local volume read. +async fn proxy_or_redirect_to_target( + state: &VolumeServerState, + info: ProxyRequestInfo, + vid: VolumeId, + allow_local_redirect: bool, +) -> Response { + // Look up volume locations from master + let locations = match lookup_volume( + &state.http_client, + &state.outgoing_http_scheme, + &state.master_url, + vid.0, + ) + .await + { + Ok(locs) => locs, + Err(e) => { + tracing::warn!("volume lookup failed for {}: {}", vid.0, e); + return StatusCode::NOT_FOUND.into_response(); + } + }; + + if locations.is_empty() { + return StatusCode::NOT_FOUND.into_response(); + } + + // Filter out self, then shuffle remaining + let mut candidates: Vec<&VolumeLocation> = locations + .iter() + .filter(|loc| !loc.url.contains(&state.self_url)) + .collect(); + + if candidates.is_empty() { + return StatusCode::NOT_FOUND.into_response(); + } + + // Shuffle for load balancing + if candidates.len() >= 2 { + use rand::seq::SliceRandom; + let mut rng = rand::thread_rng(); + candidates.shuffle(&mut rng); + } + + let target = candidates[0]; + + match state.read_mode { + ReadMode::Proxy => proxy_request(state, &info, target).await, + ReadMode::Redirect => redirect_request(&info, target, &state.outgoing_http_scheme), + ReadMode::Local if allow_local_redirect => { + redirect_request(&info, target, &state.outgoing_http_scheme) + } + ReadMode::Local => unreachable!(), + } +} + +/// Proxy the request to the target volume server. +async fn proxy_request( + state: &VolumeServerState, + info: &ProxyRequestInfo, + target: &VolumeLocation, +) -> Response { + // Build target URL, adding proxied=true query param + let path = info.path.trim_start_matches('/'); + + let raw_target = if info.original_query.is_empty() { + format!("{}/{}?proxied=true", target.url, path) + } else { + format!( + "{}/{}?{}&proxied=true", + target.url, path, info.original_query + ) + }; + let target_url = match normalize_outgoing_http_url(&state.outgoing_http_scheme, &raw_target) { + Ok(url) => url, + Err(e) => { + tracing::warn!("proxy target url {} invalid: {}", raw_target, e); + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); + } + }; + + // Build the proxy request + let mut req_builder = state.http_client.get(&target_url); + + // Forward all original headers + for (name, value) in &info.original_headers { + if let Ok(v) = value.to_str() { + req_builder = req_builder.header(name.as_str(), v); + } + } + + let resp = match req_builder.send().await { + Ok(r) => r, + Err(e) => { + tracing::warn!("proxy request to {} failed: {}", target_url, e); + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); + } + }; + + // Build response, copying headers and body from remote + let status = + StatusCode::from_u16(resp.status().as_u16()).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR); + let mut response_headers = HeaderMap::new(); + for (name, value) in resp.headers() { + if name.as_str().eq_ignore_ascii_case("server") { + continue; + } + response_headers.insert(name.clone(), value.clone()); + } + + // Stream the proxy response body instead of buffering it entirely + let byte_stream = resp.bytes_stream(); + let body = Body::from_stream(byte_stream); + + let mut response = Response::new(body); + *response.status_mut() = status; + *response.headers_mut() = response_headers; + response +} + +/// Return a redirect response to the target volume server. +fn redirect_request(info: &ProxyRequestInfo, target: &VolumeLocation, scheme: &str) -> Response { + // Build query string: preserve collection, add proxied=true, drop readDeleted (Go parity) + let mut query_params = Vec::new(); + if !info.original_query.is_empty() { + for param in info.original_query.split('&') { + if let Some((key, value)) = param.split_once('=') { + if key == "collection" { + query_params.push(format!("collection={}", value)); + } + // Intentionally drop readDeleted and other params (Go parity) + } + } + } + query_params.push("proxied=true".to_string()); + let query = query_params.join("&"); + + let raw_target = format!( + "{}/{},{}?{}", + target.url, &info.vid_str, &info.fid_str, query + ); + let location = match normalize_outgoing_http_url(scheme, &raw_target) { + Ok(url) => url, + Err(_) => return StatusCode::INTERNAL_SERVER_ERROR.into_response(), + }; + + Response::builder() + .status(StatusCode::MOVED_PERMANENTLY) + .header("Location", &location) + .header("Content-Type", "text/html; charset=utf-8") + .body(Body::from(format!( + "Moved Permanently.\n\n", + location + ))) + .unwrap_or_else(|_| StatusCode::INTERNAL_SERVER_ERROR.into_response()) +} + +// ============================================================================ +// Query parameters +// ============================================================================ + +#[derive(Deserialize, Default)] +pub struct ReadQueryParams { + #[serde(rename = "response-content-type")] + pub response_content_type: Option, + #[serde(rename = "response-cache-control")] + pub response_cache_control: Option, + pub dl: Option, + #[serde(rename = "readDeleted")] + pub read_deleted: Option, + /// cm=false disables chunk manifest expansion (returns raw manifest JSON). + pub cm: Option, + /// Image resize width + pub width: Option, + /// Image resize height + pub height: Option, + /// Image resize mode: "fit" or "fill" + pub mode: Option, + /// Image crop parameters + pub crop_x1: Option, + pub crop_y1: Option, + pub crop_x2: Option, + pub crop_y2: Option, + /// S3 response passthrough headers + #[serde(rename = "response-content-encoding")] + pub response_content_encoding: Option, + #[serde(rename = "response-expires")] + pub response_expires: Option, + #[serde(rename = "response-content-language")] + pub response_content_language: Option, + #[serde(rename = "response-content-disposition")] + pub response_content_disposition: Option, + /// Pretty print JSON response + pub pretty: Option, + /// JSONP callback function name + pub callback: Option, +} + +// ============================================================================ +// Read Handler (GET/HEAD) +// ============================================================================ + +/// Called from the method-dispatching store handler with a full Request. +pub async fn get_or_head_handler_from_request( + State(state): State>, + request: Request, +) -> Response { + let uri = request.uri().clone(); + let headers = request.headers().clone(); + + // Parse query params manually from URI + let query_params: ReadQueryParams = uri + .query() + .and_then(|q| serde_urlencoded::from_str(q).ok()) + .unwrap_or_default(); + + get_or_head_handler_inner(state, headers, query_params, request).await +} + +pub async fn get_or_head_handler( + State(state): State>, + headers: HeaderMap, + query: Query, + request: Request, +) -> Response { + get_or_head_handler_inner(state, headers, query.0, request).await +} + +async fn get_or_head_handler_inner( + state: Arc, + headers: HeaderMap, + query: ReadQueryParams, + request: Request, +) -> Response { + let path = request.uri().path().to_string(); + let raw_query = request.uri().query().map(|q| q.to_string()); + let method = request.method().clone(); + + // JWT check for reads — must happen BEFORE path parsing to match Go behavior. + // Go's GetOrHeadHandler calls maybeCheckJwtAuthorization before NewVolumeId, + // so invalid paths with JWT enabled return 401, not 400. + let file_id = extract_file_id(&path); + let token = extract_jwt(&headers, request.uri()); + if let Err(_) = + state + .guard + .read() + .unwrap() + .check_jwt_for_file(token.as_deref(), &file_id, false) + { + let body = serde_json::json!({"error": "wrong jwt"}); + return Response::builder() + .status(StatusCode::UNAUTHORIZED) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(serde_json::to_string(&body).unwrap())) + .unwrap(); + } + + let (vid, needle_id, cookie) = match parse_url_path(&path) { + Some(parsed) => parsed, + None => return StatusCode::BAD_REQUEST.into_response(), + }; + + // Check if volume exists locally; if not, proxy/redirect based on read_mode. + // This mirrors Go's hasVolume + hasEcVolume check in GetOrHeadHandler. + // NOTE: The RwLockReadGuard must be dropped before any .await to keep the future Send. + let has_volume = state.store.read().unwrap().has_volume(vid); + let has_ec_volume = state.store.read().unwrap().has_ec_volume(vid); + + if !has_volume && !has_ec_volume { + // Check if already proxied (loop prevention) + let query_string = request.uri().query().unwrap_or("").to_string(); + let is_proxied = query_string.contains("proxied=true"); + + if is_proxied || state.read_mode == ReadMode::Local || state.master_url.is_empty() { + return StatusCode::NOT_FOUND.into_response(); + } + + // For redirect, fid must be stripped of extension (Go parity: parseURLPath returns raw fid). + let info = match build_proxy_request_info(&path, request.headers(), &query_string) { + Some(info) => info, + None => return StatusCode::NOT_FOUND.into_response(), + }; + + return proxy_or_redirect_to_target(&state, info, vid, false).await; + } + + // Download throttling — matches Go's checkDownloadLimit + waitForDownloadSlot + let download_guard = if state.concurrent_download_limit > 0 { + let timeout = state.inflight_download_data_timeout; + let deadline = tokio::time::Instant::now() + timeout; + let query_string = request.uri().query().unwrap_or("").to_string(); + + let current = state.inflight_download_bytes.load(Ordering::Relaxed); + if current > state.concurrent_download_limit { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::DOWNLOAD_LIMIT_COND]) + .inc(); + + // Go tries proxy to replica ONCE before entering the blocking wait + // loop (checkDownloadLimit L65). It does NOT retry on each wakeup. + let should_try_replica = + !query_string.contains("proxied=true") && !state.master_url.is_empty() && { + let store = state.store.read().unwrap(); + store.find_volume(vid).map_or(false, |(_, vol)| { + vol.super_block.replica_placement.get_copy_count() > 1 + }) + }; + if should_try_replica { + if let Some(info) = + build_proxy_request_info(&path, request.headers(), &query_string) + { + return proxy_or_redirect_to_target(&state, info, vid, true).await; + } + } + + // Blocking wait loop (Go's waitForDownloadSlot) + loop { + if tokio::time::timeout_at(deadline, state.download_notify.notified()) + .await + .is_err() + { + return json_error_with_query( + StatusCode::TOO_MANY_REQUESTS, + "download limit exceeded", + raw_query.as_deref(), + ); + } + let current = state.inflight_download_bytes.load(Ordering::Relaxed); + if current <= state.concurrent_download_limit { + break; + } + } + } + // We'll set the actual bytes after reading the needle (once we know the size) + Some(state.clone()) + } else { + None + }; + + // Read needle — branching between regular volume and EC volume paths. + // EC volumes always do a full read (no streaming/meta-only). + let mut n = Needle { + id: needle_id, + cookie, + ..Needle::default() + }; + + let read_deleted = query.read_deleted.as_deref() == Some("true"); + let has_range = headers.contains_key(header::RANGE); + let ext = extract_extension_from_path(&path); + // Go checks resize and crop extensions separately: resize supports .webp, crop does not. + let has_resize_ops = + is_image_resize_ext(&ext) && (query.width.unwrap_or(0) > 0 || query.height.unwrap_or(0) > 0); + // Go's shouldCropImages (L410) requires x2 > x1 && y2 > y1 (x1/y1 default 0). + // Only disable streaming when a real crop will actually happen. + let has_crop_ops = is_image_crop_ext(&ext) && { + let x1 = query.crop_x1.unwrap_or(0); + let y1 = query.crop_y1.unwrap_or(0); + let x2 = query.crop_x2.unwrap_or(0); + let y2 = query.crop_y2.unwrap_or(0); + x2 > x1 && y2 > y1 + }; + let has_image_ops = has_resize_ops || has_crop_ops; + + // Stream info is only available for regular volumes, not EC volumes. + let stream_info; + let bypass_cm; + let track_download; + let can_stream; + let can_handle_head_from_meta; + let can_handle_range_from_source; + + if has_ec_volume && !has_volume { + // ---- EC volume read path (always full read, no streaming) ---- + let store = state.store.read().unwrap(); + match store.find_ec_volume(vid) { + Some(ecv) => match ecv.read_ec_shard_needle(needle_id) { + Ok(Some(ec_needle)) => { + n = ec_needle; + } + Ok(None) => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_NOT_FOUND]) + .inc(); + return StatusCode::NOT_FOUND.into_response(); + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_NOT_FOUND]) + .inc(); + return StatusCode::NOT_FOUND.into_response(); + } + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_INTERNAL]) + .inc(); + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("read ec error: {}", e), + ) + .into_response(); + } + }, + None => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_NOT_FOUND]) + .inc(); + return StatusCode::NOT_FOUND.into_response(); + } + } + drop(store); + + // Validate cookie (matches Go behavior after ReadEcShardNeedle) + if n.cookie != cookie { + return StatusCode::NOT_FOUND.into_response(); + } + + // EC volumes: no streaming support + stream_info = None; + bypass_cm = query.cm.as_deref() == Some("false"); + track_download = download_guard.is_some(); + can_stream = false; + can_handle_head_from_meta = false; + can_handle_range_from_source = false; + } else { + // ---- Regular volume read path (with streaming support) ---- + + // Try meta-only read first for potential streaming + let store = state.store.read().unwrap(); + let si_result = store.read_volume_needle_stream_info(vid, &mut n, read_deleted); + stream_info = match si_result { + Ok(info) => Some(info), + Err(crate::storage::volume::VolumeError::StreamingUnsupported) => None, + Err(crate::storage::volume::VolumeError::NotFound) => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_NOT_FOUND]) + .inc(); + return StatusCode::NOT_FOUND.into_response(); + } + Err(crate::storage::volume::VolumeError::Deleted) => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_NOT_FOUND]) + .inc(); + return StatusCode::NOT_FOUND.into_response(); + } + Err(e) => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_INTERNAL]) + .inc(); + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("read error: {}", e), + ) + .into_response(); + } + }; + drop(store); + + // Validate cookie + if n.cookie != cookie { + return StatusCode::NOT_FOUND.into_response(); + } + + bypass_cm = query.cm.as_deref() == Some("false"); + track_download = download_guard.is_some(); + let can_direct_source_read = stream_info.is_some() + && !n.is_compressed() + && !(n.is_chunk_manifest() && !bypass_cm) + && !has_image_ops; + + // Determine if we can stream (large, direct-source eligible, no range) + can_stream = can_direct_source_read + && n.data_size > STREAMING_THRESHOLD + && !has_range + && method != Method::HEAD; + + // Go uses meta-only reads for all HEAD requests, regardless of compression/chunked files. + can_handle_head_from_meta = stream_info.is_some() && method == Method::HEAD; + can_handle_range_from_source = can_direct_source_read && has_range; + + // For chunk manifest or any non-streaming path, we need the full data. + // If we can't stream, do a full read now. + if !can_stream && !can_handle_head_from_meta && !can_handle_range_from_source { + // Re-read with full data + let mut n_full = Needle { + id: needle_id, + cookie, + ..Needle::default() + }; + let store = state.store.read().unwrap(); + match store.read_volume_needle_opt(vid, &mut n_full, read_deleted) { + Ok(count) => { + if count < 0 { + return StatusCode::NOT_FOUND.into_response(); + } + } + Err(crate::storage::volume::VolumeError::NotFound) => { + return StatusCode::NOT_FOUND.into_response(); + } + Err(crate::storage::volume::VolumeError::Deleted) => { + return StatusCode::NOT_FOUND.into_response(); + } + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("read error: {}", e), + ) + .into_response(); + } + } + drop(store); + // Use the full needle from here (it has the same metadata + data) + n = n_full; + } + } + + // Build ETag and Last-Modified BEFORE conditional checks and chunk manifest expansion + // (matches Go order: conditional checks first, then chunk manifest) + let etag = format!("\"{}\"", n.etag()); + + // Build Last-Modified header (RFC 1123 format) — must be done before conditional checks + let last_modified_str = if n.last_modified > 0 { + use chrono::{TimeZone, Utc}; + if let Some(dt) = Utc.timestamp_opt(n.last_modified as i64, 0).single() { + Some(dt.format("%a, %d %b %Y %H:%M:%S GMT").to_string()) + } else { + None + } + } else { + None + }; + + // Check If-Modified-Since FIRST (Go checks this before If-None-Match) + if n.last_modified > 0 { + if let Some(ims_header) = headers.get(header::IF_MODIFIED_SINCE) { + if let Ok(ims_str) = ims_header.to_str() { + // Parse HTTP date format: "Mon, 02 Jan 2006 15:04:05 GMT" + if let Ok(ims_time) = + chrono::NaiveDateTime::parse_from_str(ims_str, "%a, %d %b %Y %H:%M:%S GMT") + { + if (n.last_modified as i64) <= ims_time.and_utc().timestamp() { + let mut resp = StatusCode::NOT_MODIFIED.into_response(); + if let Some(ref lm) = last_modified_str { + resp.headers_mut() + .insert(header::LAST_MODIFIED, lm.parse().unwrap()); + } + // Go sets ETag AFTER the 304 return paths (L235), so 304 does NOT include ETag + return resp; + } + } + } + } + } + + // Check If-None-Match SECOND + if let Some(if_none_match) = headers.get(header::IF_NONE_MATCH) { + if let Ok(inm) = if_none_match.to_str() { + if inm == etag { + let mut resp = StatusCode::NOT_MODIFIED.into_response(); + if let Some(ref lm) = last_modified_str { + resp.headers_mut() + .insert(header::LAST_MODIFIED, lm.parse().unwrap()); + } + // Go sets ETag AFTER the 304 return paths (L235), so 304 does NOT include ETag + return resp; + } + } + } + + // Chunk manifest expansion (needs full data) — after conditional checks, before response + // Pass ETag so chunk manifest responses include it (matches Go: ETag is set on the + // response writer before tryHandleChunkedFile runs). + if n.is_chunk_manifest() && !bypass_cm { + if let Some(resp) = try_expand_chunk_manifest( + &state, + &n, + &headers, + &method, + &path, + &query, + &etag, + &last_modified_str, + ) { + return resp; + } + // If manifest expansion fails (invalid JSON etc.), fall through to raw data + } + + let mut response_headers = HeaderMap::new(); + response_headers.insert(header::ETAG, etag.parse().unwrap()); + + // H1: Emit pairs as response headers + if n.has_pairs() && !n.pairs.is_empty() { + if let Ok(pair_map) = + serde_json::from_slice::>(&n.pairs) + { + for (k, v) in &pair_map { + if let (Ok(hname), Ok(hval)) = ( + axum::http::HeaderName::from_bytes(k.as_bytes()), + axum::http::HeaderValue::from_str(v), + ) { + response_headers.insert(hname, hval); + } + } + } + } + + // H8: Use needle stored name when URL path has no filename (only vid,fid) + let mut filename = extract_filename_from_path(&path); + let mut ext = ext; + if n.name_size > 0 && filename.is_empty() { + filename = String::from_utf8_lossy(&n.name).to_string(); + if ext.is_empty() { + if let Some(dot_pos) = filename.rfind('.') { + ext = filename[dot_pos..].to_lowercase(); + } + } + } + + // H6: Determine Content-Type: filter application/octet-stream, use mime_guess + // For chunk manifests, skip extension-based MIME override — use stored MIME as-is (Go parity) + let content_type = if let Some(ref ct) = query.response_content_type { + Some(ct.clone()) + } else if n.is_chunk_manifest() { + // Chunk manifests: use stored MIME but filter application/octet-stream (Go L334) + if !n.mime.is_empty() { + let mt = String::from_utf8_lossy(&n.mime).to_string(); + if mt.starts_with("application/octet-stream") { + None + } else { + Some(mt) + } + } else { + None + } + } else { + // Get MIME from needle, but filter out application/octet-stream + let needle_mime = if !n.mime.is_empty() { + let mt = String::from_utf8_lossy(&n.mime).to_string(); + if mt.starts_with("application/octet-stream") { + String::new() + } else { + mt + } + } else { + String::new() + }; + + if !needle_mime.is_empty() { + Some(needle_mime) + } else { + // Fall through to extension-based detection + let detect_ext = if !ext.is_empty() { + ext.clone() + } else if !filename.is_empty() { + if let Some(dot_pos) = filename.rfind('.') { + filename[dot_pos..].to_lowercase() + } else { + String::new() + } + } else { + String::new() + }; + if !detect_ext.is_empty() { + mime_guess::from_ext(detect_ext.trim_start_matches('.')) + .first() + .map(|m| m.to_string()) + } else { + None // Omit Content-Type entirely + } + } + }; + if let Some(ref ct) = content_type { + response_headers.insert(header::CONTENT_TYPE, ct.parse().unwrap()); + } + + // Cache-Control override from query param + if let Some(ref cc) = query.response_cache_control { + response_headers.insert(header::CACHE_CONTROL, cc.parse().unwrap()); + } + + // S3 response passthrough headers + if let Some(ref ce) = query.response_content_encoding { + response_headers.insert(header::CONTENT_ENCODING, ce.parse().unwrap()); + } + if let Some(ref exp) = query.response_expires { + response_headers.insert(header::EXPIRES, exp.parse().unwrap()); + } + if let Some(ref cl) = query.response_content_language { + response_headers.insert("Content-Language", cl.parse().unwrap()); + } + if let Some(ref cd) = query.response_content_disposition { + response_headers.insert(header::CONTENT_DISPOSITION, cd.parse().unwrap()); + } + + // Last-Modified + if let Some(ref lm) = last_modified_str { + response_headers.insert(header::LAST_MODIFIED, lm.parse().unwrap()); + } + + // H7: Content-Disposition — inline by default, attachment only when dl is truthy + // Only set if not already set by response-content-disposition query param + if !response_headers.contains_key(header::CONTENT_DISPOSITION) && !filename.is_empty() { + let disposition_type = if let Some(ref dl_val) = query.dl { + if parse_go_bool(dl_val).unwrap_or(false) { + "attachment" + } else { + "inline" + } + } else { + "inline" + }; + let disposition = format_content_disposition(disposition_type, &filename); + if let Ok(hval) = disposition.parse() { + response_headers.insert(header::CONTENT_DISPOSITION, hval); + } + } + + // ---- Streaming path: large uncompressed files ---- + if can_stream { + if let Some(info) = stream_info { + response_headers.insert(header::ACCEPT_RANGES, "bytes".parse().unwrap()); + response_headers.insert( + header::CONTENT_LENGTH, + info.data_size.to_string().parse().unwrap(), + ); + + let tracked_bytes = info.data_size as i64; + let tracking_state = if download_guard.is_some() { + let new_val = state + .inflight_download_bytes + .fetch_add(tracked_bytes, Ordering::Relaxed) + + tracked_bytes; + metrics::INFLIGHT_DOWNLOAD_SIZE.set(new_val); + Some(state.clone()) + } else { + None + }; + + let streaming = StreamingBody { + source: info.source, + data_offset: info.data_file_offset, + data_size: info.data_size, + pos: 0, + chunk_size: streaming_chunk_size( + state.read_buffer_size_bytes, + info.data_size as usize, + ), + _held_read_lease: if state.has_slow_read { + None + } else { + Some(info.data_file_access_control.read_lock()) + }, + data_file_access_control: info.data_file_access_control, + hold_read_lock_for_stream: !state.has_slow_read, + pending: None, + state: tracking_state, + tracked_bytes, + server_state: state.clone(), + volume_id: info.volume_id, + needle_id: info.needle_id, + compaction_revision: info.compaction_revision, + }; + + let body = Body::new(streaming); + let mut resp = Response::new(body); + *resp.status_mut() = StatusCode::OK; + *resp.headers_mut() = response_headers; + return resp; + } + } + + if can_handle_head_from_meta { + if let Some(info) = stream_info { + response_headers.insert( + header::CONTENT_LENGTH, + info.data_size.to_string().parse().unwrap(), + ); + return (StatusCode::OK, response_headers).into_response(); + } + } + + if can_handle_range_from_source { + if let (Some(range_header), Some(info)) = (headers.get(header::RANGE), stream_info) { + if let Ok(range_str) = range_header.to_str() { + return handle_range_request_from_source( + range_str, + info, + response_headers, + track_download.then(|| state.clone()), + ); + } + } + } + + // ---- Buffered path: small files, compressed, images, range requests ---- + + // Handle compressed data: if needle is compressed, either pass through or decompress + let is_compressed = n.is_compressed(); + let mut data = n.data; + + // Check if image operations are needed — must decompress first regardless of Accept-Encoding + // Go checks resize (.webp OK) and crop (.webp NOT OK) separately. + let needs_image_ops = has_resize_ops || has_crop_ops; + + if is_compressed { + if needs_image_ops { + // Always decompress for image operations (Go decompresses before resize/crop) + use flate2::read::GzDecoder; + use std::io::Read as _; + let mut decoder = GzDecoder::new(&data[..]); + let mut decompressed = Vec::new(); + if decoder.read_to_end(&mut decompressed).is_ok() { + data = decompressed; + } + } else { + let accept_encoding = headers + .get(header::ACCEPT_ENCODING) + .and_then(|v| v.to_str().ok()) + .unwrap_or(""); + if accept_encoding.contains("gzip") + && data.len() >= 2 + && data[0] == 0x1f + && data[1] == 0x8b + { + // Go checks IsGzippedContent (magic bytes 0x1f 0x8b) before + // setting Content-Encoding: gzip + response_headers.insert(header::CONTENT_ENCODING, "gzip".parse().unwrap()); + } else { + // Decompress for client + use flate2::read::GzDecoder; + use std::io::Read as _; + let mut decoder = GzDecoder::new(&data[..]); + let mut decompressed = Vec::new(); + if decoder.read_to_end(&mut decompressed).is_ok() { + data = decompressed; + } + } + } + } + + // Image crop and resize — Go checks extensions separately per operation. + // Crop: .png .jpg .jpeg .gif (no .webp). Resize: .png .jpg .jpeg .gif .webp. + if is_image_crop_ext(&ext) { + data = maybe_crop_image(&data, &ext, &query); + } + if is_image_resize_ext(&ext) { + data = maybe_resize_image(&data, &ext, &query); + } + + // Accept-Ranges + response_headers.insert(header::ACCEPT_RANGES, "bytes".parse().unwrap()); + + // Check Range header + if let Some(range_header) = headers.get(header::RANGE) { + if let Ok(range_str) = range_header.to_str() { + return handle_range_request( + range_str, + &data, + response_headers, + track_download.then(|| state.clone()), + ); + } + } + + if method == Method::HEAD { + response_headers.insert( + header::CONTENT_LENGTH, + data.len().to_string().parse().unwrap(), + ); + return (StatusCode::OK, response_headers).into_response(); + } + + finalize_bytes_response( + StatusCode::OK, + response_headers, + data, + track_download.then(|| state.clone()), + ) +} + +/// Handle HTTP Range requests. Returns 206 Partial Content or 416 Range Not Satisfiable. +#[derive(Clone, Copy)] +struct HttpRange { + start: i64, + length: i64, +} + +fn parse_range_header(s: &str, size: i64) -> Result, &'static str> { + if s.is_empty() { + return Ok(Vec::new()); + } + const PREFIX: &str = "bytes="; + if !s.starts_with(PREFIX) { + return Err("invalid range"); + } + let mut ranges = Vec::new(); + for part in s[PREFIX.len()..].split(',') { + let part = part.trim(); + if part.is_empty() { + continue; + } + let Some(pos) = part.find('-') else { + return Err("invalid range"); + }; + let start_str = part[..pos].trim(); + let end_str = part[pos + 1..].trim(); + let mut r = HttpRange { + start: 0, + length: 0, + }; + if start_str.is_empty() { + let mut i = end_str.parse::().map_err(|_| "invalid range")?; + if i > size { + i = size; + } + r.start = size - i; + r.length = size - r.start; + } else { + let i = start_str.parse::().map_err(|_| "invalid range")?; + if i > size || i < 0 { + return Err("invalid range"); + } + r.start = i; + if end_str.is_empty() { + r.length = size - r.start; + } else { + let mut i = end_str.parse::().map_err(|_| "invalid range")?; + if r.start > i { + return Err("invalid range"); + } + if i >= size { + i = size - 1; + } + r.length = i - r.start + 1; + } + } + ranges.push(r); + } + Ok(ranges) +} + +fn sum_ranges_size(ranges: &[HttpRange]) -> i64 { + ranges.iter().map(|r| r.length).sum() +} + +fn range_content_range(r: HttpRange, total: i64) -> String { + format!("bytes {}-{}/{}", r.start, r.start + r.length - 1, total) +} + +fn range_error_response(mut headers: HeaderMap, msg: &str) -> Response { + if !headers.contains_key(header::CONTENT_TYPE) { + headers.insert( + header::CONTENT_TYPE, + "text/plain; charset=utf-8".parse().unwrap(), + ); + } + let mut response = Response::new(Body::from(msg.to_string())); + *response.status_mut() = StatusCode::RANGE_NOT_SATISFIABLE; + *response.headers_mut() = headers; + response +} + +fn handle_range_request( + range_str: &str, + data: &[u8], + mut headers: HeaderMap, + state: Option>, +) -> Response { + let total = data.len() as i64; + let ranges = match parse_range_header(range_str, total) { + Ok(r) => r, + Err(msg) => return range_error_response(headers, msg), + }; + + // Go's ProcessRangeRequest returns nil (empty body) for empty or oversized ranges + if ranges.is_empty() { + return (StatusCode::OK, headers).into_response(); + } + + if sum_ranges_size(&ranges) > total { + return (StatusCode::OK, headers).into_response(); + } + + if ranges.len() == 1 { + let r = ranges[0]; + headers.insert( + "Content-Range", + range_content_range(r, total).parse().unwrap(), + ); + headers.insert( + header::CONTENT_LENGTH, + r.length.max(0).to_string().parse().unwrap(), + ); + if r.length <= 0 { + return (StatusCode::PARTIAL_CONTENT, headers).into_response(); + } + let start = r.start as usize; + let end = (r.start + r.length) as usize; + let slice = &data[start..end]; + finalize_bytes_response(StatusCode::PARTIAL_CONTENT, headers, slice.to_vec(), state) + } else { + // Multi-range: build multipart/byteranges response + let boundary = "SeaweedFSBoundary"; + let content_type = headers + .get(header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or("application/octet-stream") + .to_string(); + + let mut body = Vec::new(); + for (i, r) in ranges.iter().enumerate() { + // First boundary has no leading CRLF per RFC 2046 + if i == 0 { + body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes()); + } else { + body.extend_from_slice(format!("\r\n--{}\r\n", boundary).as_bytes()); + } + body.extend_from_slice(format!("Content-Type: {}\r\n", content_type).as_bytes()); + body.extend_from_slice( + format!("Content-Range: {}\r\n\r\n", range_content_range(*r, total)).as_bytes(), + ); + if r.length > 0 { + let start = r.start as usize; + let end = (r.start + r.length) as usize; + body.extend_from_slice(&data[start..end]); + } + } + body.extend_from_slice(format!("\r\n--{}--\r\n", boundary).as_bytes()); + + headers.insert( + header::CONTENT_TYPE, + format!("multipart/byteranges; boundary={}", boundary) + .parse() + .unwrap(), + ); + if !headers.contains_key(header::CONTENT_ENCODING) { + headers.insert( + header::CONTENT_LENGTH, + body.len().to_string().parse().unwrap(), + ); + } + finalize_bytes_response(StatusCode::PARTIAL_CONTENT, headers, body, state) + } +} + +fn handle_range_request_from_source( + range_str: &str, + info: crate::storage::volume::NeedleStreamInfo, + mut headers: HeaderMap, + state: Option>, +) -> Response { + let total = info.data_size as i64; + let ranges = match parse_range_header(range_str, total) { + Ok(r) => r, + Err(msg) => return range_error_response(headers, msg), + }; + + if ranges.is_empty() { + return (StatusCode::OK, headers).into_response(); + } + + if sum_ranges_size(&ranges) > total { + return (StatusCode::OK, headers).into_response(); + } + + let read_slice = |start: i64, length: i64| -> Result, std::io::Error> { + if length <= 0 { + return Ok(Vec::new()); + } + let mut buf = vec![0u8; length as usize]; + info.source + .read_exact_at(&mut buf, info.data_file_offset + start as u64)?; + Ok(buf) + }; + + if ranges.len() == 1 { + let r = ranges[0]; + let slice = match read_slice(r.start, r.length) { + Ok(slice) => slice, + Err(err) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("range read error: {}", err), + ) + .into_response() + } + }; + headers.insert( + "Content-Range", + range_content_range(r, total).parse().unwrap(), + ); + headers.insert( + header::CONTENT_LENGTH, + slice.len().to_string().parse().unwrap(), + ); + return finalize_bytes_response(StatusCode::PARTIAL_CONTENT, headers, slice, state); + } + + let boundary = "SeaweedFSBoundary"; + let content_type = headers + .get(header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or("application/octet-stream") + .to_string(); + + let mut body = Vec::new(); + for (i, r) in ranges.iter().enumerate() { + let slice = match read_slice(r.start, r.length) { + Ok(slice) => slice, + Err(err) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("range read error: {}", err), + ) + .into_response() + } + }; + if i == 0 { + body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes()); + } else { + body.extend_from_slice(format!("\r\n--{}\r\n", boundary).as_bytes()); + } + body.extend_from_slice(format!("Content-Type: {}\r\n", content_type).as_bytes()); + body.extend_from_slice( + format!("Content-Range: {}\r\n\r\n", range_content_range(*r, total)).as_bytes(), + ); + body.extend_from_slice(&slice); + } + body.extend_from_slice(format!("\r\n--{}--\r\n", boundary).as_bytes()); + + headers.insert( + header::CONTENT_TYPE, + format!("multipart/byteranges; boundary={}", boundary) + .parse() + .unwrap(), + ); + if !headers.contains_key(header::CONTENT_ENCODING) { + headers.insert( + header::CONTENT_LENGTH, + body.len().to_string().parse().unwrap(), + ); + } + finalize_bytes_response(StatusCode::PARTIAL_CONTENT, headers, body, state) +} + +/// Extract filename from URL path like "/vid/fid/filename.ext" +fn extract_filename_from_path(path: &str) -> String { + let parts: Vec<&str> = path.trim_start_matches('/').split('/').collect(); + if parts.len() >= 3 { + parts[2].to_string() + } else { + String::new() + } +} + +fn path_base(path: &str) -> String { + let trimmed = path.trim_end_matches('/'); + trimmed + .rsplit('/') + .find(|s| !s.is_empty()) + .unwrap_or("") + .to_string() +} + +fn parse_go_bool(value: &str) -> Option { + match value { + "1" | "t" | "T" | "TRUE" | "True" | "true" => Some(true), + "0" | "f" | "F" | "FALSE" | "False" | "false" => Some(false), + _ => None, + } +} + +/// Format Content-Disposition header value per RFC 6266. +/// +/// Matches Go's `mime.FormatMediaType(dispositionType, map[string]string{"filename": filename})`: +/// - Simple ASCII names (alphanumeric, hyphen, underscore, dot): `attachment; filename=file.txt` +/// - ASCII names with spaces/special chars: `attachment; filename="my file.txt"` +/// - Non-ASCII names: `attachment; filename*=utf-8''percent-encoded-name` +fn format_content_disposition(disposition_type: &str, filename: &str) -> String { + let is_ascii = filename.bytes().all(|b| b.is_ascii()); + if is_ascii { + // Check if the filename is a simple "token" (no quoting needed). + // RFC 2616 token chars: any CHAR except CTLs or separators. + // Go's mime.FormatMediaType uses needsQuoting which checks for non-token chars. + let is_token = !filename.is_empty() + && filename.bytes().all(|b| { + b > 0x20 + && b < 0x7f + && !matches!( + b, + b'(' | b')' + | b'<' + | b'>' + | b'@' + | b',' + | b';' + | b':' + | b'\\' + | b'"' + | b'/' + | b'[' + | b']' + | b'?' + | b'=' + | b' ' + ) + }); + if is_token { + format!("{}; filename={}", disposition_type, filename) + } else { + // Quote the filename, escaping backslashes and quotes + let escaped = filename.replace('\\', "\\\\").replace('"', "\\\""); + format!("{}; filename=\"{}\"", disposition_type, escaped) + } + } else { + // Non-ASCII: use RFC 2231 encoding with filename* parameter + let encoded = percent_encode_rfc2231(filename); + format!("{}; filename*=utf-8''{}", disposition_type, encoded) + } +} + +/// Percent-encode a string for RFC 2231 filename* parameter. +/// Encodes all bytes except unreserved chars (ALPHA / DIGIT / "-" / "." / "_" / "~"). +fn percent_encode_rfc2231(s: &str) -> String { + let mut out = String::with_capacity(s.len() * 3); + for byte in s.bytes() { + if byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'.' | b'_' | b'~') { + out.push(byte as char); + } else { + out.push('%'); + out.push(char::from(HEX_UPPER[byte as usize >> 4])); + out.push(char::from(HEX_UPPER[byte as usize & 0x0f])); + } + } + out +} + +const HEX_UPPER: [u8; 16] = *b"0123456789ABCDEF"; + +// ============================================================================ +// Image processing helpers +// ============================================================================ + +fn is_image_resize_ext(ext: &str) -> bool { + matches!(ext, ".png" | ".jpg" | ".jpeg" | ".gif" | ".webp") +} + +/// Go's shouldCropImages only supports these four formats (no .webp). +fn is_image_crop_ext(ext: &str) -> bool { + matches!(ext, ".png" | ".jpg" | ".jpeg" | ".gif") +} + +fn extract_extension_from_path(path: &str) -> String { + let parts: Vec<&str> = path.trim_start_matches('/').split('/').collect(); + if parts.len() >= 3 { + // 3-segment path: /vid/fid/filename.ext + let filename = parts[2]; + if let Some(dot_pos) = filename.rfind('.') { + return filename[dot_pos..].to_lowercase(); + } + } else if parts.len() >= 1 { + // 2-segment path: /vid,fid.ext or /vid/fid.ext + // Go's parseURLPath extracts ext from the full path for all formats + let last = parts[parts.len() - 1]; + if let Some(dot_pos) = last.rfind('.') { + return last[dot_pos..].to_lowercase(); + } + } + String::new() +} + +fn maybe_resize_image(data: &[u8], ext: &str, query: &ReadQueryParams) -> Vec { + let width = query.width.unwrap_or(0); + let height = query.height.unwrap_or(0); + if width == 0 && height == 0 { + return data.to_vec(); + } + + let img = match image::load_from_memory(data) { + Ok(img) => img, + Err(_) => return data.to_vec(), + }; + + let (src_w, src_h) = (img.width(), img.height()); + // Only resize if source is larger than target + if (width == 0 || src_w <= width) && (height == 0 || src_h <= height) { + return data.to_vec(); + } + + let mode = query.mode.as_deref().unwrap_or(""); + let resized = match mode { + "fit" => img.resize(width, height, image::imageops::FilterType::Lanczos3), + "fill" => img.resize_to_fill(width, height, image::imageops::FilterType::Lanczos3), + _ => { + if width > 0 && height > 0 && width == height && src_w != src_h { + img.resize_to_fill(width, height, image::imageops::FilterType::Lanczos3) + } else { + img.resize(width, height, image::imageops::FilterType::Lanczos3) + } + } + }; + + encode_image(&resized, ext).unwrap_or_else(|| data.to_vec()) +} + +fn maybe_crop_image(data: &[u8], ext: &str, query: &ReadQueryParams) -> Vec { + let (x1, y1, x2, y2) = match (query.crop_x2, query.crop_y2) { + (Some(x2), Some(y2)) => { + let x1 = query.crop_x1.unwrap_or(0); + let y1 = query.crop_y1.unwrap_or(0); + if x2 > x1 && y2 > y1 { + (x1, y1, x2, y2) + } else { + return data.to_vec(); + } + } + _ => return data.to_vec(), + }; + + let img = match image::load_from_memory(data) { + Ok(img) => img, + Err(_) => return data.to_vec(), + }; + + let (src_w, src_h) = (img.width(), img.height()); + if x2 > src_w || y2 > src_h { + return data.to_vec(); + } + + let cropped = img.crop_imm(x1, y1, x2 - x1, y2 - y1); + encode_image(&cropped, ext).unwrap_or_else(|| data.to_vec()) +} + +fn encode_image(img: &image::DynamicImage, ext: &str) -> Option> { + use std::io::Cursor; + let mut buf = Cursor::new(Vec::new()); + let format = match ext { + ".png" => image::ImageFormat::Png, + ".jpg" | ".jpeg" => image::ImageFormat::Jpeg, + ".gif" => image::ImageFormat::Gif, + ".webp" => image::ImageFormat::WebP, + _ => return None, + }; + img.write_to(&mut buf, format).ok()?; + Some(buf.into_inner()) +} + +// ============================================================================ +// Write Handler (POST/PUT) +// ============================================================================ + +#[derive(Serialize)] +struct UploadResult { + #[serde(skip_serializing_if = "String::is_empty")] + name: String, + #[serde(skip_serializing_if = "is_zero_u32")] + size: u32, + #[serde(rename = "eTag", skip_serializing_if = "String::is_empty")] + etag: String, + #[serde(skip_serializing_if = "String::is_empty")] + mime: String, + #[serde(rename = "contentMd5", skip_serializing_if = "Option::is_none")] + content_md5: Option, +} + +fn is_zero_u32(v: &u32) -> bool { + *v == 0 +} + +pub async fn post_handler( + State(state): State>, + request: Request, +) -> Response { + let path = request.uri().path().to_string(); + let query = request.uri().query().unwrap_or("").to_string(); + let method = request.method().clone(); + let headers = request.headers().clone(); + let query_fields: Vec<(String, String)> = match serde_urlencoded::from_str(&query) { + Ok(fields) => fields, + Err(e) => { + // Go's r.ParseForm() returns 400 on malformed query strings + return json_error_with_query( + StatusCode::BAD_REQUEST, + &format!("form parse error: {}", e), + Some(&query), + ); + } + }; + + let (vid, needle_id, cookie) = match parse_url_path(&path) { + Some(parsed) => parsed, + None => { + return json_error_with_query(StatusCode::BAD_REQUEST, "invalid URL path", Some(&query)) + } + }; + + // JWT check for writes + let file_id = extract_file_id(&path); + let token = extract_jwt(&headers, request.uri()); + if let Err(_) = state + .guard + .read() + .unwrap() + .check_jwt_for_file(token.as_deref(), &file_id, true) + { + return json_error_with_query(StatusCode::UNAUTHORIZED, "wrong jwt", Some(&query)); + } + + // Upload throttling: check inflight bytes against limit + let is_replicate = query.split('&').any(|p| p == "type=replicate"); + let content_length = headers + .get(header::CONTENT_LENGTH) + .and_then(|v| v.to_str().ok()) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0); + + if !is_replicate && state.concurrent_upload_limit > 0 { + // Wait for inflight bytes to drop below limit, or timeout + let timeout = if state.inflight_upload_data_timeout.is_zero() { + std::time::Duration::from_secs(2) + } else { + state.inflight_upload_data_timeout + }; + let deadline = tokio::time::Instant::now() + timeout; + + loop { + let current = state.inflight_upload_bytes.load(Ordering::Relaxed); + if current <= state.concurrent_upload_limit { + break; + } + // Go increments UploadLimitCond on every loop iteration (L184), + // not just on timeout. + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::UPLOAD_LIMIT_COND]) + .inc(); + // Wait for notification or timeout + if tokio::time::timeout_at(deadline, state.upload_notify.notified()) + .await + .is_err() + { + return json_error_with_query( + StatusCode::TOO_MANY_REQUESTS, + "upload limit exceeded", + Some(&query), + ); + } + } + let new_val = state + .inflight_upload_bytes + .fetch_add(content_length, Ordering::Relaxed) + + content_length; + metrics::INFLIGHT_UPLOAD_SIZE.set(new_val); + } + + // RAII guard to release upload throttle on any exit path + let _upload_guard = if !is_replicate && state.concurrent_upload_limit > 0 { + Some(InflightGuard { + counter: &state.inflight_upload_bytes, + bytes: content_length, + notify: &state.upload_notify, + metric: &metrics::INFLIGHT_UPLOAD_SIZE, + }) + } else { + None + }; + + let content_type_str = headers + .get(header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + + // Go only parses multipart form-data for POST requests with form-data content type. + let should_parse_multipart = method == Method::POST && content_type_str.contains("form-data"); + + // Validate multipart/form-data has a boundary + if should_parse_multipart && !content_type_str.contains("boundary=") { + return json_error_with_query( + StatusCode::BAD_REQUEST, + "no multipart boundary param in Content-Type", + Some(&query), + ); + } + + let content_md5 = headers + .get("Content-MD5") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + // Read body + let body = match axum::body::to_bytes(request.into_body(), usize::MAX).await { + Ok(b) => b, + Err(e) => { + return json_error_with_query( + StatusCode::BAD_REQUEST, + format!("read body: {}", e), + Some(&query), + ) + } + }; + + // H5: Multipart form-data parsing + let ( + body_data_raw, + parsed_filename, + parsed_content_type, + parsed_content_encoding, + parsed_content_md5, + multipart_form_fields, + ) = if should_parse_multipart { + // Extract boundary from Content-Type + let boundary = content_type_str + .split(';') + .find_map(|part| { + let part = part.trim(); + if let Some(val) = part.strip_prefix("boundary=") { + Some(val.trim_matches('"').to_string()) + } else { + None + } + }) + .unwrap_or_default(); + + let mut multipart = multer::Multipart::new( + futures::stream::once(async { Ok::<_, std::io::Error>(body.clone()) }), + boundary, + ); + + let mut file_data: Option> = None; + let mut first_part_data: Option> = None; + let mut file_name: Option = None; + let mut file_content_type: Option = None; + let mut file_content_encoding: Option = None; + let mut file_content_md5: Option = None; + let mut form_fields = std::collections::HashMap::new(); + + while let Ok(Some(field)) = multipart.next_field().await { + let field_name = field.name().map(|s| s.to_string()); + let fname = field.file_name().map(clean_windows_path_base); + let fct = field.content_type().map(|m| m.to_string()); + let field_headers = field.headers().clone(); + let fce = field_headers + .get(header::CONTENT_ENCODING) + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + let fmd5 = field_headers + .get("Content-MD5") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + if let Ok(data) = field.bytes().await { + // Go reads the first part's data unconditionally, then looks for + // a part with a filename. If no part has a filename, Go uses the + // first part's data (with empty filename). + if first_part_data.is_none() { + first_part_data = Some(data.to_vec()); + } + if file_data.is_none() && fname.is_some() { + // Found a file field — use this part's data + file_data = Some(data.to_vec()); + file_name = fname; + file_content_type = fct; + file_content_encoding = fce; + file_content_md5 = fmd5; + } else if let Some(name) = field_name { + form_fields + .entry(name) + .or_insert_with(|| String::from_utf8_lossy(&data).to_string()); + } + } + } + + if let Some(data) = file_data { + ( + data, + file_name.unwrap_or_default(), + file_content_type, + file_content_encoding, + file_content_md5, + form_fields, + ) + } else if let Some(data) = first_part_data { + // No file field found, use first part's data (matching Go behavior) + (data, String::new(), None, None, None, form_fields) + } else { + // No parts at all + (Vec::new(), String::new(), None, None, None, form_fields) + } + } else { + ( + body.to_vec(), + String::new(), + None, + None, + None, + std::collections::HashMap::new(), + ) + }; + + let form_value = |name: &str| { + query_fields + .iter() + .find_map(|(k, v)| if k == name { Some(v.clone()) } else { None }) + .or_else(|| multipart_form_fields.get(name).cloned()) + }; + + // Check for chunk manifest flag. + // Go uses r.FormValue("cm"), which falls back to multipart fields when present. + let is_chunk_manifest = matches!( + form_value("cm").as_deref(), + Some("1" | "t" | "T" | "TRUE" | "True" | "true") + ); + + // Check file size limit (matches Go: "file over the limited %d bytes") + if state.file_size_limit_bytes > 0 && body_data_raw.len() as i64 > state.file_size_limit_bytes { + return json_error_with_query( + StatusCode::BAD_REQUEST, + format!( + "file over the limited {} bytes", + state.file_size_limit_bytes + ), + Some(&query), + ); + } + + // Check if upload is pre-compressed + let is_gzipped = if should_parse_multipart { + parsed_content_encoding.as_deref() == Some("gzip") + } else { + headers + .get(header::CONTENT_ENCODING) + .and_then(|v| v.to_str().ok()) + .map(|s| s == "gzip") + .unwrap_or(false) + }; + + let uncompressed_data = if is_gzipped { + maybe_decompress_gzip(&body_data_raw).unwrap_or_else(|| body_data_raw.clone()) + } else { + body_data_raw.clone() + }; + let original_data_size = uncompressed_data.len() as u32; + + // Only compute and validate Content-MD5 when the client provided one + // (Go only computes MD5 when Content-MD5 header/field is present) + let content_md5 = content_md5.or(parsed_content_md5); + let original_content_md5 = if content_md5.is_some() { + Some(compute_md5_base64(&uncompressed_data)) + } else { + None + }; + if let (Some(ref expected_md5), Some(ref actual_md5)) = (&content_md5, &original_content_md5) { + if expected_md5 != actual_md5 { + return json_error_with_query( + StatusCode::BAD_REQUEST, + format!( + "Content-MD5 did not match md5 of file data expected [{}] received [{}] size {}", + expected_md5, actual_md5, original_data_size + ), + Some(&query), + ); + } + } + + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + // Parse custom timestamp from query param + let ts_str = form_value("ts").unwrap_or_default(); + let last_modified = if !ts_str.is_empty() { + ts_str.parse::().unwrap_or(now) + } else { + now + }; + + // Prefer the multipart filename before deriving MIME and other metadata. + let filename = if !parsed_filename.is_empty() { + parsed_filename + } else if !should_parse_multipart { + headers + .get(header::CONTENT_DISPOSITION) + .and_then(|v| v.to_str().ok()) + .and_then(parse_content_disposition_filename) + .unwrap_or_else(|| path_base(&path)) + } else { + extract_filename_from_path(&path) + }; + + // Extract MIME type: prefer multipart-parsed content type, else from Content-Type header + let mime_type = if let Some(ref pct) = parsed_content_type { + pct.clone() + } else { + let multipart_fallback = + if should_parse_multipart && !filename.is_empty() && !is_chunk_manifest { + mime_guess::from_path(&filename) + .first() + .map(|m| m.to_string()) + .unwrap_or_default() + } else { + String::new() + }; + headers + .get(header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .map(|ct| { + if should_parse_multipart && ct.starts_with("multipart/") { + multipart_fallback.clone() + } else { + ct.to_string() + } + }) + .unwrap_or(multipart_fallback) + }; + + // Parse TTL from query param (matches Go's r.FormValue("ttl")) + let ttl_str = form_value("ttl").unwrap_or_default(); + let ttl = if !ttl_str.is_empty() { + crate::storage::needle::TTL::read(&ttl_str).ok() + } else { + None + }; + + // Extract Seaweed-* custom metadata headers (pairs) + // Go's net/http canonicalizes header names to Title-Case, so after stripping + // the "Seaweed-" prefix, keys are Title-Case (e.g., "Foo-Bar"). Rust's http + // crate lowercases all header names, so we must convert the stripped key to + // Title-Case to match Go's behavior. + fn to_title_case(s: &str) -> String { + let mut result = String::with_capacity(s.len()); + let mut capitalize_next = true; + for c in s.chars() { + if c == '-' { + result.push('-'); + capitalize_next = true; + } else if capitalize_next { + for uc in c.to_uppercase() { + result.push(uc); + } + capitalize_next = false; + } else { + result.push(c); + } + } + result + } + let pair_map: std::collections::HashMap = headers + .iter() + .filter_map(|(k, v)| { + let key = k.as_str(); + if key.len() > 8 && key[..8].eq_ignore_ascii_case("seaweed-") { + if let Ok(val) = v.to_str() { + // Store with the prefix stripped and Title-Cased (matching Go's trimmedPairMap) + Some((to_title_case(&key[8..]), val.to_string())) + } else { + None + } + } else { + None + } + }) + .collect(); + + // Fix JPEG orientation from EXIF data before storing (matches Go behavior). + let body_data = if state.fix_jpg_orientation && crate::images::is_jpeg(&mime_type, &path) { + crate::images::fix_jpg_orientation(&body_data_raw) + } else { + body_data_raw + }; + + // Auto-compress compressible file types (matches Go's IsCompressableFileType). + // Only compress if not already gzipped and compression saves >10%. + // Go uses filepath.Base(pu.FileName) for extension detection (not the URL path). + let (final_data, final_is_gzipped) = if !is_gzipped && !is_chunk_manifest { + let ext = { + let dot_pos = filename.rfind('.'); + dot_pos + .map(|p| filename[p..].to_lowercase()) + .unwrap_or_default() + }; + if is_compressible_file_type(&ext, &mime_type) { + if let Some(compressed) = try_gzip_data(&body_data) { + if compressed.len() * 10 < body_data.len() * 9 { + (compressed, true) + } else { + (body_data, false) + } + } else { + (body_data, false) + } + } else { + (body_data, false) + } + } else { + (body_data, is_gzipped) + }; + + let mut n = Needle { + id: needle_id, + cookie, + data_size: final_data.len() as u32, + data: final_data, + last_modified: last_modified, + ..Needle::default() + }; + n.set_has_last_modified_date(); + if is_chunk_manifest { + n.set_is_chunk_manifest(); + } + if final_is_gzipped { + n.set_is_compressed(); + } + + // Go sets HasMime even for empty MIME types: if len(pu.MimeType) < 256 + if mime_type.len() < 256 { + n.mime = mime_type.as_bytes().to_vec(); + n.set_has_mime(); + } + + // Set TTL on needle + if let Some(ref t) = ttl { + if !t.is_empty() { + n.ttl = Some(*t); + n.set_has_ttl(); + } + } + + // Set pairs on needle + if !pair_map.is_empty() { + if let Ok(pairs_json) = serde_json::to_vec(&pair_map) { + if pairs_json.len() < 65536 { + n.pairs_size = pairs_json.len() as u16; + n.pairs = pairs_json; + n.set_has_pairs(); + } + } + } + + // Set filename on needle (matches Go: if len(pu.FileName) < 256) + // Go sets HasName even for empty filenames + if filename.len() < 256 { + n.name = filename.as_bytes().to_vec(); + n.name_size = filename.len() as u8; + n.set_has_name(); + } + + let write_result = if let Some(wq) = state.write_queue.get() { + wq.submit(vid, n.clone()).await + } else { + let mut store = state.store.write().unwrap(); + store.write_volume_needle(vid, &mut n) + }; + + // Replicate to remote volume servers if this volume has replicas. + // Matches Go's GetWritableRemoteReplications: skip if copy_count == 1. + if !is_replicate && write_result.is_ok() && !state.master_url.is_empty() { + let needs_replication = { + let store = state.store.read().unwrap(); + store.find_volume(vid).map_or(false, |(_, v)| { + v.super_block.replica_placement.get_copy_count() > 1 + }) + }; + if needs_replication { + let state_clone = state.clone(); + let path_clone = path.clone(); + let query_clone = query.clone(); + let headers_clone = headers.clone(); + let body_clone = body.clone(); + let replication = tokio::spawn(async move { + do_replicated_request( + &state_clone, + vid.0, + Method::POST, + &path_clone, + &query_clone, + &headers_clone, + Some(body_clone), + ) + .await + }); + let replication_result = replication + .await + .map_err(|e| format!("replication task failed: {}", e)) + .and_then(|result| result); + if let Err(e) = replication_result { + tracing::error!("replicated write failed: {}", e); + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("replication failed: {}", e), + Some(&query), + ); + } + } + } + + let resp = match write_result { + Ok((_offset, _size, is_unchanged)) => { + if is_unchanged { + let etag = format!("\"{}\"", n.etag()); + (StatusCode::NO_CONTENT, [(header::ETAG, etag)]).into_response() + } else { + // Go only includes contentMd5 when the client provided Content-MD5 + let result = UploadResult { + name: if n.has_name() { + filename.clone() + } else { + String::new() + }, + size: original_data_size, // H3: use original size, not compressed + etag: n.etag(), + mime: mime_type.clone(), + content_md5: original_content_md5.clone(), + }; + let etag = n.etag(); + let etag_header = if etag.starts_with('"') { + etag.clone() + } else { + format!("\"{}\"", etag) + }; + let mut resp = json_result_with_query(StatusCode::CREATED, &result, &query); + resp.headers_mut() + .insert(header::ETAG, etag_header.parse().unwrap()); + if let Some(ref md5_value) = original_content_md5 { + resp.headers_mut() + .insert("Content-MD5", md5_value.parse().unwrap()); + } + resp + } + } + Err(e) => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_WRITE_TO_LOCAL_DISK]) + .inc(); + json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("{}", e), + Some(&query), + ) + } + }; + + // _upload_guard drops here, releasing inflight bytes + resp +} + +// ============================================================================ +// Delete Handler +// ============================================================================ + +#[derive(Serialize)] +struct DeleteResult { + size: i64, +} + +pub async fn delete_handler( + State(state): State>, + request: Request, +) -> Response { + let path = request.uri().path().to_string(); + let del_query = request.uri().query().unwrap_or("").to_string(); + let del_params: ReadQueryParams = serde_urlencoded::from_str(&del_query).unwrap_or_default(); + let headers = request.headers().clone(); + + let (vid, needle_id, cookie) = match parse_url_path(&path) { + Some(parsed) => parsed, + None => { + return json_error_with_query( + StatusCode::BAD_REQUEST, + "invalid URL path", + Some(&del_query), + ) + } + }; + + // JWT check for writes (deletes use write key) + let file_id = extract_file_id(&path); + let token = extract_jwt(&headers, request.uri()); + if let Err(_) = state + .guard + .read() + .unwrap() + .check_jwt_for_file(token.as_deref(), &file_id, true) + { + return json_error_with_query(StatusCode::UNAUTHORIZED, "wrong jwt", Some(&del_query)); + } + + // Check for EC volume first (Go checks hasEcVolume before regular volume in DeleteHandler). + // Go's flow: FindEcVolume -> DeleteEcShardNeedle(ecVolume, n, cookie) -> writeDeleteResult + // DeleteEcShardNeedle: reads needle (for size + cookie validation), validates cookie, journals delete. + { + let has_ec = state.store.read().unwrap().has_ec_volume(vid); + if has_ec { + // Step 1: Read the EC needle to get its size and validate cookie + let ec_read_result = { + let store = state.store.read().unwrap(); + store + .find_ec_volume(vid) + .map(|ecv| ecv.read_ec_shard_needle(needle_id)) + }; + match ec_read_result { + Some(Ok(Some(ec_needle))) => { + // Step 2: Validate cookie (Go: cookie != 0 && cookie != n.Cookie) + if cookie.0 != 0 && ec_needle.cookie != cookie { + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Deletion Failed: unexpected cookie {:x}", cookie.0), + Some(&del_query), + ); + } + let count = ec_needle.data_size as i64; + // Step 3: Journal the delete + let mut store = state.store.write().unwrap(); + if let Some(ecv) = store.find_ec_volume_mut(vid) { + if let Err(e) = ecv.journal_delete(needle_id) { + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Deletion Failed: {}", e), + Some(&del_query), + ); + } + } + let result = DeleteResult { size: count }; + return json_response_with_params( + StatusCode::ACCEPTED, + &result, + Some(&del_params), + ); + } + Some(Ok(None)) => { + // Needle not found in EC volume + let result = DeleteResult { size: 0 }; + return json_response_with_params( + StatusCode::NOT_FOUND, + &result, + Some(&del_params), + ); + } + Some(Err(e)) => { + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Deletion Failed: {}", e), + Some(&del_query), + ); + } + None => { + // EC volume disappeared between has_ec check and find + let result = DeleteResult { size: 0 }; + return json_response_with_params( + StatusCode::NOT_FOUND, + &result, + Some(&del_params), + ); + } + } + } + } + + // H9: Parse custom timestamp from query param; default to now (not 0) + let del_ts_str = del_query + .split('&') + .find_map(|p| p.strip_prefix("ts=")) + .unwrap_or(""); + let del_last_modified = if !del_ts_str.is_empty() { + del_ts_str.parse::().unwrap_or_else(|_| { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + }) + } else { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + }; + + let mut n = Needle { + id: needle_id, + cookie, + ..Needle::default() + }; + + // Read needle first to validate cookie (matching Go behavior) + let original_cookie = cookie; + { + let store = state.store.read().unwrap(); + match store.read_volume_needle(vid, &mut n) { + Ok(_) => {} + Err(_) => { + let result = DeleteResult { size: 0 }; + return json_response_with_params( + StatusCode::NOT_FOUND, + &result, + Some(&del_params), + ); + } + } + } + if n.cookie != original_cookie { + return json_error_with_query( + StatusCode::BAD_REQUEST, + "File Random Cookie does not match.", + Some(&del_query), + ); + } + + // Apply custom timestamp (always set — defaults to now per H9) + n.last_modified = del_last_modified; + n.set_has_last_modified_date(); + + let mut delete_size_override = None; + + // If this is a chunk manifest, delete child chunks first + if n.is_chunk_manifest() { + let manifest_data = if n.is_compressed() { + use flate2::read::GzDecoder; + use std::io::Read as _; + let mut decoder = GzDecoder::new(&n.data[..]); + let mut decompressed = Vec::new(); + if decoder.read_to_end(&mut decompressed).is_ok() { + decompressed + } else { + n.data.clone() + } + } else { + n.data.clone() + }; + + let manifest = match serde_json::from_slice::(&manifest_data) { + Ok(manifest) => manifest, + Err(e) => { + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Load chunks manifest error: {}", e), + Some(&del_query), + ); + } + }; + + let child_fids: Vec = manifest + .chunks + .iter() + .map(|chunk| chunk.fid.clone()) + .collect(); + if let Err(e) = batch_delete_file_ids(&state, &child_fids).await { + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Delete chunks error: {}", e), + Some(&del_query), + ); + } + delete_size_override = Some(manifest.size as i64); + } + + let delete_result = { + let mut store = state.store.write().unwrap(); + store.delete_volume_needle(vid, &mut n) + }; + + let is_replicate = del_query.split('&').any(|p| p == "type=replicate"); + if !is_replicate && delete_result.is_ok() && !state.master_url.is_empty() { + let needs_replication = { + let store = state.store.read().unwrap(); + store.find_volume(vid).map_or(false, |(_, v)| { + v.super_block.replica_placement.get_copy_count() > 1 + }) + }; + if needs_replication { + if let Err(e) = do_replicated_request( + &state, + vid.0, + Method::DELETE, + &path, + &del_query, + &headers, + None, + ) + .await + { + tracing::error!("replicated delete failed: {}", e); + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("replication failed: {}", e), + Some(&del_query), + ); + } + } + } + + match delete_result { + Ok(size) => { + let result = DeleteResult { + size: delete_size_override.unwrap_or(size.0 as i64), + }; + json_response_with_params(StatusCode::ACCEPTED, &result, Some(&del_params)) + } + Err(crate::storage::volume::VolumeError::NotFound) => { + let result = DeleteResult { size: 0 }; + json_response_with_params(StatusCode::NOT_FOUND, &result, Some(&del_params)) + } + Err(e) => json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Deletion Failed: {}", e), + Some(&del_query), + ), + } +} + +// ============================================================================ +// Status Handler +// ============================================================================ + +pub async fn status_handler( + Query(params): Query, + State(state): State>, +) -> Response { + let store = state.store.read().unwrap(); + let mut volumes = Vec::new(); + + for loc in &store.locations { + for (_vid, vol) in loc.volumes() { + let mut vol_info = serde_json::Map::new(); + vol_info.insert("Id".to_string(), serde_json::Value::from(vol.id.0)); + vol_info.insert( + "Collection".to_string(), + serde_json::Value::from(vol.collection.clone()), + ); + vol_info.insert( + "Size".to_string(), + serde_json::Value::from(vol.content_size()), + ); + vol_info.insert( + "FileCount".to_string(), + serde_json::Value::from(vol.file_count()), + ); + vol_info.insert( + "DeleteCount".to_string(), + serde_json::Value::from(vol.deleted_count()), + ); + vol_info.insert( + "DeletedByteCount".to_string(), + serde_json::Value::from(vol.deleted_size()), + ); + vol_info.insert( + "ReadOnly".to_string(), + serde_json::Value::from(vol.is_read_only()), + ); + vol_info.insert( + "Version".to_string(), + serde_json::Value::from(vol.version().0), + ); + vol_info.insert( + "CompactRevision".to_string(), + serde_json::Value::from(vol.super_block.compaction_revision), + ); + vol_info.insert( + "ModifiedAtSecond".to_string(), + serde_json::Value::from(vol.last_modified_ts()), + ); + vol_info.insert( + "DiskType".to_string(), + serde_json::Value::from(loc.disk_type.to_string()), + ); + + let replica = &vol.super_block.replica_placement; + let mut replica_value = serde_json::Map::new(); + if replica.diff_data_center_count > 0 { + replica_value.insert( + "dc".to_string(), + serde_json::Value::from(replica.diff_data_center_count), + ); + } + if replica.diff_rack_count > 0 { + replica_value.insert( + "rack".to_string(), + serde_json::Value::from(replica.diff_rack_count), + ); + } + if replica.same_rack_count > 0 { + replica_value.insert( + "node".to_string(), + serde_json::Value::from(replica.same_rack_count), + ); + } + vol_info.insert( + "ReplicaPlacement".to_string(), + serde_json::Value::Object(replica_value), + ); + + let ttl = vol.super_block.ttl; + let mut ttl_value = serde_json::Map::new(); + if ttl.count > 0 { + ttl_value.insert("Count".to_string(), serde_json::Value::from(ttl.count)); + } + if ttl.unit > 0 { + ttl_value.insert("Unit".to_string(), serde_json::Value::from(ttl.unit)); + } + vol_info.insert("Ttl".to_string(), serde_json::Value::Object(ttl_value)); + + let (remote_storage_name, remote_storage_key) = vol.remote_storage_name_key(); + vol_info.insert( + "RemoteStorageName".to_string(), + serde_json::Value::from(remote_storage_name), + ); + vol_info.insert( + "RemoteStorageKey".to_string(), + serde_json::Value::from(remote_storage_key), + ); + volumes.push(serde_json::Value::Object(vol_info)); + } + } + volumes.sort_by(|a, b| { + let left = a.get("Id").and_then(|v| v.as_u64()).unwrap_or_default(); + let right = b.get("Id").and_then(|v| v.as_u64()).unwrap_or_default(); + left.cmp(&right) + }); + + let mut m = serde_json::Map::new(); + m.insert( + "Version".to_string(), + serde_json::Value::from(crate::version::version()), + ); + m.insert("Volumes".to_string(), serde_json::Value::Array(volumes)); + m.insert( + "DiskStatuses".to_string(), + serde_json::Value::Array(build_disk_statuses(&store)), + ); + json_response_with_params(StatusCode::OK, &serde_json::Value::Object(m), Some(¶ms)) +} + +// ============================================================================ +// Health Check Handler +// ============================================================================ + +pub async fn healthz_handler(State(state): State>) -> Response { + // Go's healthzHandler returns only status codes with no body text. + let is_stopping = *state.is_stopping.read().unwrap(); + if is_stopping { + return StatusCode::SERVICE_UNAVAILABLE.into_response(); + } + // If not heartbeating, return 503 (matches Go health check behavior) + if !state.is_heartbeating.load(Ordering::Relaxed) { + return StatusCode::SERVICE_UNAVAILABLE.into_response(); + } + StatusCode::OK.into_response() +} + +// ============================================================================ +// Metrics Handler +// ============================================================================ + +pub async fn metrics_handler() -> Response { + let body = metrics::gather_metrics(); + ( + StatusCode::OK, + [( + header::CONTENT_TYPE, + "text/plain; version=0.0.4; charset=utf-8", + )], + body, + ) + .into_response() +} + +// ============================================================================ +// Stats Handlers +// ============================================================================ + +pub async fn stats_counter_handler(Query(params): Query) -> Response { + let payload = serde_json::json!({ + "Version": crate::version::version(), + "Counters": super::server_stats::snapshot(), + }); + json_response_with_params(StatusCode::OK, &payload, Some(¶ms)) +} + +pub async fn stats_memory_handler(Query(params): Query) -> Response { + let mem = super::memory_status::collect_mem_status(); + let payload = serde_json::json!({ + "Version": crate::version::version(), + "Memory": { + "goroutines": mem.goroutines, + "all": mem.all, + "used": mem.used, + "free": mem.free, + "self": mem.self_, + "heap": mem.heap, + "stack": mem.stack, + }, + }); + json_response_with_params(StatusCode::OK, &payload, Some(¶ms)) +} + +pub async fn stats_disk_handler( + Query(params): Query, + State(state): State>, +) -> Response { + let store = state.store.read().unwrap(); + let payload = serde_json::json!({ + "Version": crate::version::version(), + "DiskStatuses": build_disk_statuses(&store), + }); + json_response_with_params(StatusCode::OK, &payload, Some(¶ms)) +} + +// ============================================================================ +// Static Asset Handlers +// ============================================================================ + +pub async fn favicon_handler() -> Response { + let asset = super::ui::favicon_asset(); + ( + StatusCode::OK, + [(header::CONTENT_TYPE, asset.content_type)], + asset.bytes, + ) + .into_response() +} + +pub async fn static_asset_handler(Path(path): Path) -> Response { + match super::ui::lookup_static_asset(&path) { + Some(asset) => ( + StatusCode::OK, + [(header::CONTENT_TYPE, asset.content_type)], + asset.bytes, + ) + .into_response(), + None => StatusCode::NOT_FOUND.into_response(), + } +} + +pub async fn ui_handler(State(state): State>) -> Response { + let html = super::ui::render_volume_server_html(&state); + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "text/html; charset=utf-8")], + html, + ) + .into_response() +} + +// ============================================================================ +// Chunk Manifest +// ============================================================================ + +#[derive(Deserialize)] +#[allow(dead_code)] +struct ChunkManifest { + #[serde(default)] + name: String, + #[serde(default)] + mime: String, + #[serde(default)] + size: i64, + #[serde(default)] + chunks: Vec, +} + +#[derive(Deserialize)] +struct ChunkInfo { + fid: String, + offset: i64, + #[allow(dead_code)] + size: i64, +} + +/// Try to expand a chunk manifest needle. Returns None if manifest can't be parsed. +fn try_expand_chunk_manifest( + state: &Arc, + n: &Needle, + _headers: &HeaderMap, + method: &Method, + path: &str, + query: &ReadQueryParams, + etag: &str, + last_modified_str: &Option, +) -> Option { + let data = if n.is_compressed() { + use flate2::read::GzDecoder; + use std::io::Read as _; + let mut decoder = GzDecoder::new(&n.data[..]); + let mut decompressed = Vec::new(); + if decoder.read_to_end(&mut decompressed).is_err() { + return None; + } + decompressed + } else { + n.data.clone() + }; + + let manifest: ChunkManifest = match serde_json::from_slice(&data) { + Ok(m) => m, + Err(_) => return None, + }; + + // Read and concatenate all chunks + let mut result = vec![0u8; manifest.size as usize]; + let store = state.store.read().unwrap(); + for chunk in &manifest.chunks { + let (chunk_vid, chunk_nid, chunk_cookie) = match parse_url_path(&chunk.fid) { + Some(p) => p, + None => { + return Some( + ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("invalid chunk fid: {}", chunk.fid), + ) + .into_response(), + ) + } + }; + let mut chunk_needle = Needle { + id: chunk_nid, + cookie: chunk_cookie, + ..Needle::default() + }; + match store.read_volume_needle(chunk_vid, &mut chunk_needle) { + Ok(_) => {} + Err(e) => { + return Some( + ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("read chunk {}: {}", chunk.fid, e), + ) + .into_response(), + ) + } + } + let chunk_data = if chunk_needle.is_compressed() { + use flate2::read::GzDecoder; + use std::io::Read as _; + let mut decoder = GzDecoder::new(&chunk_needle.data[..]); + let mut decompressed = Vec::new(); + if decoder.read_to_end(&mut decompressed).is_ok() { + decompressed + } else { + chunk_needle.data.clone() + } + } else { + chunk_needle.data.clone() + }; + let offset = chunk.offset as usize; + let end = std::cmp::min(offset + chunk_data.len(), result.len()); + let copy_len = end - offset; + if copy_len > 0 { + result[offset..offset + copy_len].copy_from_slice(&chunk_data[..copy_len]); + } + } + + // Determine filename: URL path filename, then manifest name + // (Go's tryHandleChunkedFile does NOT fall back to needle name) + let mut filename = extract_filename_from_path(path); + if filename.is_empty() && !manifest.name.is_empty() { + filename = manifest.name.clone(); + } + + // Determine MIME type: manifest mime, but fall back to extension detection + // if empty or application/octet-stream (matching Go behavior) + let content_type = { + let mime_str = &manifest.mime; + if !mime_str.is_empty() && !mime_str.starts_with("application/octet-stream") { + mime_str.clone() + } else { + // Try to detect from filename extension + let ext = if !filename.is_empty() { + if let Some(dot_pos) = filename.rfind('.') { + filename[dot_pos..].to_lowercase() + } else { + String::new() + } + } else { + String::new() + }; + if !ext.is_empty() { + mime_guess::from_ext(ext.trim_start_matches('.')) + .first() + .map(|m| m.to_string()) + .unwrap_or_else(|| "application/octet-stream".to_string()) + } else if !mime_str.is_empty() { + mime_str.clone() + } else { + "application/octet-stream".to_string() + } + } + }; + + let mut response_headers = HeaderMap::new(); + // Preserve ETag from the needle (matches Go: ETag is set before tryHandleChunkedFile) + if let Ok(etag_val) = etag.parse() { + response_headers.insert(header::ETAG, etag_val); + } + response_headers.insert(header::CONTENT_TYPE, content_type.parse().unwrap()); + response_headers.insert("X-File-Store", "chunked".parse().unwrap()); + response_headers.insert(header::ACCEPT_RANGES, "bytes".parse().unwrap()); + + // Last-Modified — Go sets this on the response writer before tryHandleChunkedFile + if let Some(ref lm) = last_modified_str { + if let Ok(hval) = lm.parse() { + response_headers.insert(header::LAST_MODIFIED, hval); + } + } + + // Pairs — Go sets needle pairs on the response writer before tryHandleChunkedFile + if n.has_pairs() && !n.pairs.is_empty() { + if let Ok(pair_map) = + serde_json::from_slice::>(&n.pairs) + { + for (k, v) in &pair_map { + if let (Ok(hname), Ok(hval)) = ( + axum::http::HeaderName::from_bytes(k.as_bytes()), + axum::http::HeaderValue::from_str(v), + ) { + response_headers.insert(hname, hval); + } + } + } + } + + // S3 response passthrough headers — Go sets these via AdjustPassthroughHeaders + if let Some(ref cc) = query.response_cache_control { + if let Ok(hval) = cc.parse() { + response_headers.insert(header::CACHE_CONTROL, hval); + } + } + if let Some(ref ce) = query.response_content_encoding { + if let Ok(hval) = ce.parse() { + response_headers.insert(header::CONTENT_ENCODING, hval); + } + } + if let Some(ref exp) = query.response_expires { + if let Ok(hval) = exp.parse() { + response_headers.insert(header::EXPIRES, hval); + } + } + if let Some(ref cl) = query.response_content_language { + if let Ok(hval) = cl.parse() { + response_headers.insert("Content-Language", hval); + } + } + if let Some(ref cd) = query.response_content_disposition { + if let Ok(hval) = cd.parse() { + response_headers.insert(header::CONTENT_DISPOSITION, hval); + } + } + + // Content-Disposition + if !filename.is_empty() { + let disposition_type = if let Some(ref dl_val) = query.dl { + if parse_go_bool(dl_val).unwrap_or(false) { + "attachment" + } else { + "inline" + } + } else { + "inline" + }; + let disposition = format_content_disposition(disposition_type, &filename); + if let Ok(hval) = disposition.parse() { + response_headers.insert(header::CONTENT_DISPOSITION, hval); + } + } + + // Go's tryHandleChunkedFile applies crop then resize to expanded chunk data + // (L344-345: conditionallyCropImages, conditionallyResizeImages). + let cm_ext = if !filename.is_empty() { + if let Some(dot_pos) = filename.rfind('.') { + filename[dot_pos..].to_lowercase() + } else { + String::new() + } + } else { + String::new() + }; + let mut result = result; + if is_image_crop_ext(&cm_ext) { + result = maybe_crop_image(&result, &cm_ext, query); + } + if is_image_resize_ext(&cm_ext) { + result = maybe_resize_image(&result, &cm_ext, query); + } + + if *method == Method::HEAD { + response_headers.insert( + header::CONTENT_LENGTH, + result.len().to_string().parse().unwrap(), + ); + return Some((StatusCode::OK, response_headers).into_response()); + } + + Some((StatusCode::OK, response_headers, result).into_response()) +} + +// ============================================================================ +// Helpers +// ============================================================================ + +fn absolute_display_path(path: &str) -> String { + let p = std::path::Path::new(path); + if p.is_absolute() { + return path.to_string(); + } + std::env::current_dir() + .map(|cwd| cwd.join(p).to_string_lossy().to_string()) + .unwrap_or_else(|_| path.to_string()) +} + +fn build_disk_statuses(store: &crate::storage::store::Store) -> Vec { + let mut disk_statuses = Vec::new(); + for loc in &store.locations { + let resolved_dir = absolute_display_path(&loc.directory); + let (all, free) = crate::storage::disk_location::get_disk_stats(&resolved_dir); + let used = all.saturating_sub(free); + let percent_free = if all > 0 { + (free as f64 / all as f64) * 100.0 + } else { + 0.0 + }; + let percent_used = if all > 0 { + (used as f64 / all as f64) * 100.0 + } else { + 0.0 + }; + + // Match Go encoding/json on protobuf struct (snake_case json tags) + disk_statuses.push(serde_json::json!({ + "dir": resolved_dir, + "all": all, + "used": used, + "free": free, + "percent_free": percent_free, + "percent_used": percent_used, + "disk_type": loc.disk_type.to_string(), + })); + } + disk_statuses +} + +/// Serialize to JSON with 1-space indent (matches Go's `json.MarshalIndent(obj, "", " ")`). +fn to_pretty_json(value: &T) -> String { + let mut buf = Vec::new(); + let formatter = serde_json::ser::PrettyFormatter::with_indent(b" "); + let mut ser = serde_json::Serializer::with_formatter(&mut buf, formatter); + value.serialize(&mut ser).unwrap(); + String::from_utf8(buf).unwrap() +} + +fn json_response_with_params( + status: StatusCode, + body: &T, + params: Option<&ReadQueryParams>, +) -> Response { + let is_pretty = params + .and_then(|params| params.pretty.as_ref()) + .is_some_and(|value| !value.is_empty()); + let callback = params + .and_then(|params| params.callback.as_ref()) + .filter(|value| !value.is_empty()) + .cloned(); + + let json_body = if is_pretty { + to_pretty_json(body) + } else { + serde_json::to_string(body).unwrap() + }; + + if let Some(callback) = callback { + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/javascript") + .body(Body::from(format!("{}({})", callback, json_body))) + .unwrap() + } else { + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(json_body)) + .unwrap() + } +} + +/// Return a JSON error response with optional query string for pretty/JSONP support. +/// Supports `?pretty=` for pretty-printed JSON and `?callback=fn` for JSONP, +/// matching Go's writeJsonError behavior. +pub(super) fn json_error_with_query( + status: StatusCode, + msg: impl Into, + query: Option<&str>, +) -> Response { + let body = serde_json::json!({"error": msg.into()}); + + let (is_pretty, callback) = if let Some(q) = query { + let pretty = q + .split('&') + .any(|p| p.starts_with("pretty=") && p.len() > "pretty=".len()); + let cb = q + .split('&') + .find_map(|p| p.strip_prefix("callback=")) + .map(|s| s.to_string()); + (pretty, cb) + } else { + (false, None) + }; + + let json_body = if is_pretty { + to_pretty_json(&body) + } else { + serde_json::to_string(&body).unwrap() + }; + + if let Some(cb) = callback { + let jsonp = format!("{}({})", cb, json_body); + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/javascript") + .body(Body::from(jsonp)) + .unwrap() + } else { + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(json_body)) + .unwrap() + } +} + +/// Return a JSON response with optional pretty/JSONP support from raw query string. +/// Matches Go's writeJsonQuiet behavior for write success responses. +fn json_result_with_query(status: StatusCode, body: &T, query: &str) -> Response { + let (is_pretty, callback) = { + let pretty = query + .split('&') + .any(|p| p.starts_with("pretty=") && p.len() > "pretty=".len()); + let cb = query + .split('&') + .find_map(|p| p.strip_prefix("callback=")) + .map(|s| s.to_string()); + (pretty, cb) + }; + + let json_body = if is_pretty { + to_pretty_json(body) + } else { + serde_json::to_string(body).unwrap() + }; + + if let Some(cb) = callback { + let jsonp = format!("{}({})", cb, json_body); + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/javascript") + .body(Body::from(jsonp)) + .unwrap() + } else { + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(json_body)) + .unwrap() + } +} + +/// Extract JWT token from query param, Authorization header, or Cookie. +/// Query param takes precedence over header, header over cookie. +fn extract_jwt(headers: &HeaderMap, uri: &axum::http::Uri) -> Option { + // 1. Check ?jwt= query parameter + if let Some(query) = uri.query() { + for pair in query.split('&') { + if let Some(value) = pair.strip_prefix("jwt=") { + if !value.is_empty() { + return Some(value.to_string()); + } + } + } + } + + // 2. Check Authorization: Bearer (case-insensitive prefix) + if let Some(auth) = headers.get(header::AUTHORIZATION) { + if let Ok(auth_str) = auth.to_str() { + if auth_str.len() > 7 && auth_str[..7].eq_ignore_ascii_case("bearer ") { + return Some(auth_str[7..].to_string()); + } + } + } + + // 3. Check Cookie + if let Some(cookie_header) = headers.get(header::COOKIE) { + if let Ok(cookie_str) = cookie_header.to_str() { + for cookie in cookie_str.split(';') { + let cookie = cookie.trim(); + if let Some(value) = cookie.strip_prefix("AT=") { + if !value.is_empty() { + return Some(value.to_string()); + } + } + } + } + } + + None +} + +// ============================================================================ +// Auto-compression helpers (matches Go's util.IsCompressableFileType) +// ============================================================================ + +/// Check if a file type should be compressed based on extension and MIME type. +/// Returns true only when we are sure the type is compressible. +fn is_compressible_file_type(ext: &str, mtype: &str) -> bool { + // text/* + if mtype.starts_with("text/") { + return true; + } + // Compressible image/audio formats + match ext { + ".svg" | ".bmp" | ".wav" => return true, + _ => {} + } + // Most image/* formats are already compressed + if mtype.starts_with("image/") { + return false; + } + // By file extension + match ext { + ".zip" | ".rar" | ".gz" | ".bz2" | ".xz" | ".zst" | ".br" => return false, + ".pdf" | ".txt" | ".html" | ".htm" | ".css" | ".js" | ".json" => return true, + ".php" | ".java" | ".go" | ".rb" | ".c" | ".cpp" | ".h" | ".hpp" => return true, + ".png" | ".jpg" | ".jpeg" => return false, + _ => {} + } + // By MIME type + if mtype.starts_with("application/") { + if mtype.ends_with("zstd") { + return false; + } + if mtype.ends_with("xml") { + return true; + } + if mtype.ends_with("script") { + return true; + } + if mtype.ends_with("vnd.rar") { + return false; + } + } + if mtype.starts_with("audio/") { + let sub = mtype.strip_prefix("audio/").unwrap_or(""); + if matches!(sub, "wave" | "wav" | "x-wav" | "x-pn-wav") { + return true; + } + } + false +} + +/// Try to gzip data. Returns None on error. +fn try_gzip_data(data: &[u8]) -> Option> { + use flate2::write::GzEncoder; + use flate2::Compression; + use std::io::Write; + let mut encoder = GzEncoder::new(Vec::new(), Compression::default()); + encoder.write_all(data).ok()?; + encoder.finish().ok() +} + +fn maybe_decompress_gzip(data: &[u8]) -> Option> { + use flate2::read::GzDecoder; + use std::io::Read; + let mut decoder = GzDecoder::new(data); + let mut decompressed = Vec::new(); + decoder.read_to_end(&mut decompressed).ok()?; + Some(decompressed) +} + +fn compute_md5_base64(data: &[u8]) -> String { + use base64::Engine; + use md5::{Digest, Md5}; + let mut hasher = Md5::new(); + hasher.update(data); + base64::engine::general_purpose::STANDARD.encode(hasher.finalize()) +} + +fn clean_windows_path_base(value: &str) -> String { + let cleaned = value.replace('\\', "/"); + cleaned.rsplit('/').next().unwrap_or(&cleaned).to_string() +} + +fn parse_content_disposition_filename(value: &str) -> Option { + let mut filename: Option = None; + let mut name: Option = None; + + for segment in value.split(';') { + let segment = segment.trim(); + if segment.is_empty() { + continue; + } + let lower = segment.to_ascii_lowercase(); + if lower.starts_with("filename=") { + let raw = segment[9..].trim(); + let trimmed = raw + .strip_prefix('\"') + .and_then(|s| s.strip_suffix('\"')) + .unwrap_or(raw); + filename = Some(clean_windows_path_base(trimmed)); + } else if lower.starts_with("name=") { + let raw = segment[5..].trim(); + let trimmed = raw + .strip_prefix('\"') + .and_then(|s| s.strip_suffix('\"')) + .unwrap_or(raw); + name = Some(clean_windows_path_base(trimmed)); + } + } + + let candidate = filename.or(name); + candidate.filter(|s| !s.is_empty()) +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_url_path_comma() { + let (vid, nid, cookie) = parse_url_path("/3,01637037d6").unwrap(); + assert_eq!(vid, VolumeId(3)); + assert_eq!(nid, NeedleId(0x01)); + assert_eq!(cookie, Cookie(0x637037d6)); + } + + #[test] + fn test_parse_url_path_with_ext() { + let (vid, _, _) = parse_url_path("/3,01637037d6.jpg").unwrap(); + assert_eq!(vid, VolumeId(3)); + } + + #[test] + fn test_parse_url_path_slash() { + let result = parse_url_path("3/01637037d6"); + assert!(result.is_some()); + } + + #[test] + fn test_parse_url_path_slash_with_filename() { + let result = parse_url_path("3/01637037d6/report.txt"); + assert!(result.is_some()); + let (vid, _, _) = result.unwrap(); + assert_eq!(vid, VolumeId(3)); + } + + #[test] + fn test_parse_url_path_invalid() { + assert!(parse_url_path("/invalid").is_none()); + assert!(parse_url_path("").is_none()); + } + + #[test] + fn test_extract_jwt_bearer() { + let mut headers = HeaderMap::new(); + headers.insert(header::AUTHORIZATION, "Bearer abc123".parse().unwrap()); + let uri: axum::http::Uri = "/test".parse().unwrap(); + assert_eq!(extract_jwt(&headers, &uri), Some("abc123".to_string())); + } + + #[test] + fn test_extract_jwt_query_param() { + let headers = HeaderMap::new(); + let uri: axum::http::Uri = "/test?jwt=mytoken".parse().unwrap(); + assert_eq!(extract_jwt(&headers, &uri), Some("mytoken".to_string())); + } + + #[test] + fn test_extract_jwt_query_over_header() { + let mut headers = HeaderMap::new(); + headers.insert( + header::AUTHORIZATION, + "Bearer header_token".parse().unwrap(), + ); + let uri: axum::http::Uri = "/test?jwt=query_token".parse().unwrap(); + assert_eq!(extract_jwt(&headers, &uri), Some("query_token".to_string())); + } + + #[test] + fn test_extract_jwt_none() { + let headers = HeaderMap::new(); + let uri: axum::http::Uri = "/test".parse().unwrap(); + assert_eq!(extract_jwt(&headers, &uri), None); + } + + #[test] + fn test_handle_range_single() { + let data = b"hello world"; + let headers = HeaderMap::new(); + let resp = handle_range_request("bytes=0-4", data, headers, None); + assert_eq!(resp.status(), StatusCode::PARTIAL_CONTENT); + } + + #[test] + fn test_handle_range_invalid() { + let data = b"hello"; + let headers = HeaderMap::new(); + let resp = handle_range_request("bytes=999-1000", data, headers, None); + assert_eq!(resp.status(), StatusCode::RANGE_NOT_SATISFIABLE); + } + + #[tokio::test] + async fn test_stats_memory_handler_matches_go_memstatus_shape() { + let response = stats_memory_handler(Query(ReadQueryParams::default())).await; + assert_eq!(response.status(), StatusCode::OK); + + let body = axum::body::to_bytes(response.into_body(), usize::MAX) + .await + .unwrap(); + let payload: serde_json::Value = serde_json::from_slice(&body).unwrap(); + let memory = payload.get("Memory").unwrap(); + + for key in ["goroutines", "all", "used", "free", "self", "heap", "stack"] { + assert!(memory.get(key).is_some(), "missing key {}", key); + } + } + + #[tokio::test] + async fn test_stats_counter_handler_matches_go_json_shape() { + super::super::server_stats::reset_for_tests(); + super::super::server_stats::record_read_request(); + + let response = stats_counter_handler(Query(ReadQueryParams::default())).await; + assert_eq!(response.status(), StatusCode::OK); + + let body = axum::body::to_bytes(response.into_body(), usize::MAX) + .await + .unwrap(); + let payload: serde_json::Value = serde_json::from_slice(&body).unwrap(); + + assert_eq!( + payload.get("Version").and_then(|value| value.as_str()), + Some(crate::version::version()) + ); + let counters = payload.get("Counters").unwrap(); + assert!(counters.get("ReadRequests").is_some()); + assert!(counters.get("Requests").is_some()); + } + + #[test] + fn test_is_compressible_file_type() { + // Text types + assert!(is_compressible_file_type("", "text/html")); + assert!(is_compressible_file_type("", "text/plain")); + assert!(is_compressible_file_type("", "text/css")); + + // Compressible by extension + assert!(is_compressible_file_type(".svg", "")); + assert!(is_compressible_file_type(".bmp", "")); + assert!(is_compressible_file_type(".js", "")); + assert!(is_compressible_file_type(".json", "")); + assert!(is_compressible_file_type(".html", "")); + assert!(is_compressible_file_type(".css", "")); + assert!(is_compressible_file_type(".c", "")); + assert!(is_compressible_file_type(".go", "")); + + // Already compressed — should NOT compress + assert!(!is_compressible_file_type(".zip", "")); + assert!(!is_compressible_file_type(".gz", "")); + assert!(!is_compressible_file_type(".jpg", "")); + assert!(!is_compressible_file_type(".png", "")); + assert!(!is_compressible_file_type("", "image/jpeg")); + assert!(!is_compressible_file_type("", "image/png")); + + // Application subtypes + assert!(is_compressible_file_type("", "application/xml")); + assert!(is_compressible_file_type("", "application/javascript")); + assert!(!is_compressible_file_type("", "application/zstd")); + assert!(!is_compressible_file_type("", "application/vnd.rar")); + + // Audio + assert!(is_compressible_file_type(".wav", "audio/wav")); + assert!(!is_compressible_file_type("", "audio/mpeg")); + + // Unknown + assert!(!is_compressible_file_type( + ".xyz", + "application/octet-stream" + )); + } + + #[test] + fn test_try_gzip_data() { + let data = b"hello world hello world hello world"; + let compressed = try_gzip_data(data); + assert!(compressed.is_some()); + let compressed = compressed.unwrap(); + // Compressed data should be different from original + assert!(!compressed.is_empty()); + + // Verify we can decompress it + use flate2::read::GzDecoder; + use std::io::Read; + let mut decoder = GzDecoder::new(&compressed[..]); + let mut decompressed = Vec::new(); + decoder.read_to_end(&mut decompressed).unwrap(); + assert_eq!(decompressed, data); + } + + #[test] + fn test_maybe_decompress_gzip() { + let data = b"gzip me"; + let compressed = try_gzip_data(data).unwrap(); + let decompressed = maybe_decompress_gzip(&compressed).unwrap(); + assert_eq!(decompressed, data); + assert!(maybe_decompress_gzip(data).is_none()); + } + + #[test] + fn test_parse_content_disposition_filename() { + assert_eq!( + parse_content_disposition_filename("attachment; filename=\"report.txt\""), + Some("report.txt".to_string()) + ); + assert_eq!( + parse_content_disposition_filename("inline; name=\"hello.txt\""), + Some("hello.txt".to_string()) + ); + assert_eq!( + parse_content_disposition_filename("name=foo.txt"), + Some("foo.txt".to_string()) + ); + assert_eq!( + parse_content_disposition_filename("attachment; filename=\"C:\\\\path\\\\file.jpg\""), + Some("file.jpg".to_string()) + ); + assert_eq!(parse_content_disposition_filename("inline"), None); + } + + #[test] + fn test_streaming_chunk_size_respects_configured_read_buffer() { + assert_eq!( + streaming_chunk_size(4 * 1024 * 1024, 8 * 1024 * 1024), + 4 * 1024 * 1024 + ); + assert_eq!( + streaming_chunk_size(32 * 1024, 512 * 1024), + DEFAULT_STREAMING_CHUNK_SIZE + ); + assert_eq!( + streaming_chunk_size(8 * 1024 * 1024, 128 * 1024), + 128 * 1024 + ); + } + + #[test] + fn test_normalize_outgoing_http_url_rewrites_scheme() { + let url = normalize_outgoing_http_url( + "https", + "http://master.example.com:9333/dir/lookup?volumeId=7", + ) + .unwrap(); + assert_eq!(url, "https://master.example.com:9333/dir/lookup?volumeId=7"); + } + + #[test] + fn test_redirect_request_uses_outgoing_http_scheme() { + let info = ProxyRequestInfo { + original_headers: HeaderMap::new(), + original_query: "collection=photos&readDeleted=true".to_string(), + path: "/3,01637037d6".to_string(), + vid_str: "3".to_string(), + fid_str: "01637037d6".to_string(), + }; + let target = VolumeLocation { + url: "volume.internal:8080".to_string(), + public_url: "volume.public:8080".to_string(), + grpc_port: 18080, + }; + + let response = redirect_request(&info, &target, "https"); + assert_eq!(response.status(), StatusCode::MOVED_PERMANENTLY); + assert_eq!( + response.headers().get(header::LOCATION).unwrap(), + "https://volume.internal:8080/3,01637037d6?collection=photos&proxied=true" + ); + } +} diff --git a/seaweed-volume/src/server/heartbeat.rs b/seaweed-volume/src/server/heartbeat.rs new file mode 100644 index 000000000..6fcfd523c --- /dev/null +++ b/seaweed-volume/src/server/heartbeat.rs @@ -0,0 +1,1576 @@ +//! Heartbeat client: registers the volume server with the master. +//! +//! Implements the bidirectional streaming `SendHeartbeat` RPC to the master, +//! matching Go's `server/volume_grpc_client_to_master.go`. + +use std::collections::HashMap; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::time::Duration; + +use tokio::sync::broadcast; +use tracing::{error, info, warn}; + +use super::grpc_client::{build_grpc_endpoint, GRPC_MAX_MESSAGE_SIZE}; +use super::volume_server::VolumeServerState; +use crate::pb::master_pb; +use crate::pb::master_pb::seaweed_client::SeaweedClient; +use crate::pb::volume_server_pb; +use crate::remote_storage::s3_tier::{S3TierBackend, S3TierConfig}; +use crate::storage::store::Store; +use crate::storage::types::NeedleId; + +const DUPLICATE_UUID_RETRY_MESSAGE: &str = "duplicate UUIDs detected, retrying connection"; +const MAX_DUPLICATE_UUID_RETRIES: u32 = 3; + +/// Configuration for the heartbeat client. +pub struct HeartbeatConfig { + pub ip: String, + pub port: u16, + pub grpc_port: u16, + pub public_url: String, + pub data_center: String, + pub rack: String, + pub master_addresses: Vec, + pub pulse_seconds: u64, +} + +/// Run the heartbeat loop using VolumeServerState. +/// +/// Mirrors Go's `volume_grpc_client_to_master.go` heartbeat(): +/// - On leader redirect: sleep 3s, then connect directly to the new leader +/// - On duplicate UUID error: exponential backoff (2s, 4s, 8s), exit after 3 retries +/// - On other errors: sleep pulse interval, reset to seed master list iteration +pub async fn run_heartbeat_with_state( + config: HeartbeatConfig, + state: Arc, + mut shutdown_rx: broadcast::Receiver<()>, +) { + info!( + "Starting heartbeat to master nodes: {:?}", + config.master_addresses + ); + + let pulse = Duration::from_secs(config.pulse_seconds.max(1)); + let mut new_leader: Option = None; + let mut duplicate_retry_count: u32 = 0; + + loop { + for master_addr in &config.master_addresses { + if is_stopping(&state) { + state.is_heartbeating.store(false, Ordering::Relaxed); + info!("Heartbeat stopping"); + return; + } + if shutdown_rx.try_recv().is_ok() { + state.is_heartbeating.store(false, Ordering::Relaxed); + info!("Heartbeat shutting down"); + return; + } + + // If we have a leader redirect, sleep 3s then connect to the leader + // instead of iterating through the seed list + let target_addr = if let Some(ref leader) = new_leader { + tokio::time::sleep(Duration::from_secs(3)).await; + leader.clone() + } else { + master_addr.clone() + }; + + let grpc_addr = to_grpc_address(&target_addr); + info!("Connecting heartbeat to master {}", grpc_addr); + + // Determine what action to take after the heartbeat attempt. + // We convert the error to a string immediately so the non-Send + // Box is dropped before any .await point. + enum PostAction { + LeaderRedirect(String), + Done, + SleepDuplicate(Duration), + SleepPulse, + } + let action = match do_heartbeat(&config, &state, &grpc_addr, &target_addr, pulse, &mut shutdown_rx) + .await + { + Ok(Some(leader)) => { + info!("Master leader changed to {}", leader); + PostAction::LeaderRedirect(leader) + } + Ok(None) => { + duplicate_retry_count = 0; + PostAction::Done + } + Err(e) => { + let err_msg = e.to_string(); + // Drop `e` (non-Send) before any .await + drop(e); + warn!("Heartbeat to {} error: {}", grpc_addr, err_msg); + + if err_msg.contains(DUPLICATE_UUID_RETRY_MESSAGE) { + if duplicate_retry_count >= MAX_DUPLICATE_UUID_RETRIES { + error!("Shut down Volume Server due to persistent duplicate volume directories after 3 retries"); + error!( + "Please check if another volume server is using the same directory" + ); + std::process::exit(1); + } + let retry_delay = duplicate_uuid_retry_delay(duplicate_retry_count); + duplicate_retry_count += 1; + warn!( + "Waiting {:?} before retrying due to duplicate UUID detection (attempt {}/3)...", + retry_delay, duplicate_retry_count + ); + PostAction::SleepDuplicate(retry_delay) + } else { + duplicate_retry_count = 0; + PostAction::SleepPulse + } + } + }; + + match action { + PostAction::LeaderRedirect(leader) => { + new_leader = Some(leader); + break; + } + PostAction::Done => { + new_leader = None; + } + PostAction::SleepDuplicate(delay) => { + new_leader = None; + tokio::time::sleep(delay).await; + } + PostAction::SleepPulse => { + new_leader = None; + tokio::time::sleep(pulse).await; + } + } + + // If we connected to a leader (not seed list), break out after one attempt + // so we either reconnect to the new leader or fall back to seed list + if new_leader.is_some() { + break; + } + } + + // If we have a leader redirect, skip the sleep and reconnect immediately + if new_leader.is_some() { + continue; + } + + tokio::select! { + _ = tokio::time::sleep(pulse) => {} + _ = shutdown_rx.recv() => { + state.is_heartbeating.store(false, Ordering::Relaxed); + info!("Heartbeat shutting down"); + return; + } + } + } +} + +/// Convert a master address "host:port" to a gRPC host:port target. +/// The Go master uses port + 10000 for gRPC by default. +pub fn to_grpc_address(master_addr: &str) -> String { + if let Some((host, port_str)) = master_addr.rsplit_once(':') { + if let Ok(port) = port_str.parse::() { + let grpc_port = port + 10000; + return format!("{}:{}", host, grpc_port); + } + } + master_addr.to_string() +} + +/// Call GetMasterConfiguration on seed masters before starting the heartbeat loop. +/// Mirrors Go's `checkWithMaster()` in `volume_grpc_client_to_master.go`. +/// Retries across all seed masters with a 1790ms sleep between rounds (matching Go). +/// Stores metrics address/interval from the response into server state. +async fn check_with_master(config: &HeartbeatConfig, state: &Arc) { + loop { + for master_addr in &config.master_addresses { + let grpc_addr = to_grpc_address(master_addr); + match try_get_master_configuration(&grpc_addr, state.outgoing_grpc_tls.as_ref()).await { + Ok(resp) => { + let changed = apply_metrics_push_settings( + state, + &resp.metrics_address, + resp.metrics_interval_seconds, + ); + if changed { + state.metrics_notify.notify_waiters(); + } + apply_storage_backends(state, &resp.storage_backends); + info!( + "Got master configuration from {}: metrics_address={}, metrics_interval={}s", + master_addr, resp.metrics_address, resp.metrics_interval_seconds + ); + return; + } + Err(e) => { + warn!("checkWithMaster {}: {}", master_addr, e); + } + } + } + tokio::time::sleep(Duration::from_millis(1790)).await; + } +} + +pub async fn prime_master_configuration(config: &HeartbeatConfig, state: &Arc) { + check_with_master(config, state).await; +} + +pub async fn try_get_master_configuration( + grpc_addr: &str, + tls: Option<&super::grpc_client::OutgoingGrpcTlsConfig>, +) -> Result> { + let channel = build_grpc_endpoint(grpc_addr, tls)? + .connect_timeout(Duration::from_secs(5)) + .timeout(Duration::from_secs(10)) + .connect() + .await?; + let mut client = SeaweedClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + let resp = client + .get_master_configuration(master_pb::GetMasterConfigurationRequest {}) + .await?; + Ok(resp.into_inner()) +} + +fn is_stopping(state: &VolumeServerState) -> bool { + *state.is_stopping.read().unwrap() +} + +fn duplicate_uuid_retry_delay(retry_count: u32) -> Duration { + Duration::from_secs((1u64 << retry_count) * 2) +} + +fn duplicate_directories(store: &Store, duplicated_uuids: &[String]) -> Vec { + let mut duplicate_dirs = Vec::new(); + for loc in &store.locations { + if duplicated_uuids + .iter() + .any(|uuid| uuid == &loc.directory_uuid) + { + duplicate_dirs.push(loc.directory.clone()); + } + } + duplicate_dirs +} + +fn apply_master_volume_options(store: &Store, hb_resp: &master_pb::HeartbeatResponse) -> bool { + let mut volume_opts_changed = false; + if store.get_preallocate() != hb_resp.preallocate { + store.set_preallocate(hb_resp.preallocate); + volume_opts_changed = true; + } + if hb_resp.volume_size_limit > 0 + && store.volume_size_limit.load(Ordering::Relaxed) != hb_resp.volume_size_limit + { + store + .volume_size_limit + .store(hb_resp.volume_size_limit, Ordering::Relaxed); + volume_opts_changed = true; + } + + volume_opts_changed && store.maybe_adjust_volume_max() +} + +type EcShardDeltaKey = (u32, String, u32, u32); + +fn collect_ec_shard_delta_messages( + store: &Store, +) -> HashMap { + let mut messages = HashMap::new(); + + for (disk_id, loc) in store.locations.iter().enumerate() { + for (_, ec_vol) in loc.ec_volumes() { + for shard in ec_vol.shards.iter().flatten() { + messages.insert( + ( + ec_vol.volume_id.0, + ec_vol.collection.clone(), + disk_id as u32, + shard.shard_id as u32, + ), + master_pb::VolumeEcShardInformationMessage { + id: ec_vol.volume_id.0, + collection: ec_vol.collection.clone(), + ec_index_bits: 1u32 << shard.shard_id, + shard_sizes: vec![shard.file_size()], + disk_type: ec_vol.disk_type.to_string(), + expire_at_sec: ec_vol.expire_at_sec, + disk_id: disk_id as u32, + ..Default::default() + }, + ); + } + } + } + + messages +} + +fn diff_ec_shard_delta_messages( + previous: &HashMap, + current: &HashMap, +) -> ( + Vec, + Vec, +) { + let mut new_ec_shards = Vec::new(); + let mut deleted_ec_shards = Vec::new(); + + for (key, message) in current { + if previous.get(key) != Some(message) { + new_ec_shards.push(message.clone()); + } + } + + for (key, message) in previous { + if !current.contains_key(key) { + let mut deleted = message.clone(); + deleted.shard_sizes = vec![0]; + deleted_ec_shards.push(deleted); + } + } + + (new_ec_shards, deleted_ec_shards) +} + +/// Perform one heartbeat session with a master server. +async fn do_heartbeat( + config: &HeartbeatConfig, + state: &Arc, + grpc_addr: &str, + current_master: &str, + pulse: Duration, + shutdown_rx: &mut broadcast::Receiver<()>, +) -> Result, Box> { + let channel = build_grpc_endpoint(grpc_addr, state.outgoing_grpc_tls.as_ref())? + .connect_timeout(Duration::from_secs(5)) + .timeout(Duration::from_secs(30)) + .connect() + .await?; + + let mut client = SeaweedClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + + let (tx, rx) = tokio::sync::mpsc::channel::(32); + + // Keep track of what we sent, to generate delta updates + let initial_hb = collect_heartbeat(config, state); + let mut last_volumes: HashMap = initial_hb + .volumes + .iter() + .map(|v| (v.id, v.clone())) + .collect(); + let mut last_ec_shards = { + let store = state.store.read().unwrap(); + collect_ec_shard_delta_messages(&store) + }; + + // Send initial heartbeats BEFORE calling send_heartbeat to avoid deadlock: + // the server won't send response headers until it receives the first message, + // but send_heartbeat().await waits for response headers. + tx.send(initial_hb).await?; + tx.send(collect_ec_heartbeat(config, state)).await?; + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + let mut response_stream = client.send_heartbeat(stream).await?.into_inner(); + + info!("Heartbeat stream established with {}", grpc_addr); + if is_stopping(state) { + state.is_heartbeating.store(false, Ordering::Relaxed); + send_deregister_heartbeat(config, state, &tx).await; + info!("Heartbeat stopping"); + return Ok(None); + } + state.is_heartbeating.store(true, Ordering::Relaxed); + + let mut volume_tick = tokio::time::interval(pulse); + let mut ec_tick = tokio::time::interval(pulse * 17); + volume_tick.tick().await; + ec_tick.tick().await; + + loop { + tokio::select! { + resp = response_stream.message() => { + match resp { + Ok(Some(hb_resp)) => { + // Match Go ordering: DuplicatedUuids first, then volume + // options, then leader redirect. + if !hb_resp.duplicated_uuids.is_empty() { + let duplicate_dirs = { + let store = state.store.read().unwrap(); + duplicate_directories(&store, &hb_resp.duplicated_uuids) + }; + error!( + "Master reported duplicate volume directories: {:?}", + duplicate_dirs + ); + return Err(format!( + "{}: {:?}", + DUPLICATE_UUID_RETRY_MESSAGE, duplicate_dirs + ) + .into()); + } + let changed = { + let s = state.store.read().unwrap(); + apply_master_volume_options(&s, &hb_resp) + }; + if changed { + let adjusted_hb = collect_heartbeat(config, state); + last_volumes = + adjusted_hb.volumes.iter().map(|v| (v.id, v.clone())).collect(); + last_ec_shards = { + let store = state.store.read().unwrap(); + collect_ec_shard_delta_messages(&store) + }; + if tx.send(adjusted_hb).await.is_err() { + return Ok(None); + } + } + let metrics_changed = apply_metrics_push_settings( + state, + &hb_resp.metrics_address, + hb_resp.metrics_interval_seconds, + ); + if metrics_changed { + state.metrics_notify.notify_waiters(); + } + // Match Go: only redirect if leader is non-empty AND + // different from the current master we're connected to. + if !hb_resp.leader.is_empty() && current_master != hb_resp.leader { + return Ok(Some(hb_resp.leader)); + } + } + Ok(None) => return Ok(None), + Err(e) => return Err(Box::new(e)), + } + } + + _ = volume_tick.tick() => { + { + let s = state.store.read().unwrap(); + s.maybe_adjust_volume_max(); + } + let current_hb = collect_heartbeat(config, state); + last_volumes = current_hb.volumes.iter().map(|v| (v.id, v.clone())).collect(); + last_ec_shards = { + let store = state.store.read().unwrap(); + collect_ec_shard_delta_messages(&store) + }; + if tx.send(current_hb).await.is_err() { + return Ok(None); + } + } + + _ = ec_tick.tick() => { + let current_ec_hb = collect_ec_heartbeat(config, state); + last_ec_shards = { + let store = state.store.read().unwrap(); + collect_ec_shard_delta_messages(&store) + }; + if tx.send(current_ec_hb).await.is_err() { + return Ok(None); + } + } + + _ = state.volume_state_notify.notified() => { + if is_stopping(state) { + state.is_heartbeating.store(false, Ordering::Relaxed); + send_deregister_heartbeat(config, state, &tx).await; + info!("Heartbeat stopping"); + return Ok(None); + } + let current_hb = collect_heartbeat(config, state); + let current_volumes: HashMap = current_hb.volumes.iter().map(|v| (v.id, v.clone())).collect(); + let current_ec_shards = { + let store = state.store.read().unwrap(); + collect_ec_shard_delta_messages(&store) + }; + + let mut new_vols = Vec::new(); + let mut del_vols = Vec::new(); + + for (id, vol) in ¤t_volumes { + if !last_volumes.contains_key(id) { + new_vols.push(master_pb::VolumeShortInformationMessage { + id: *id, + collection: vol.collection.clone(), + version: vol.version, + replica_placement: vol.replica_placement, + ttl: vol.ttl, + disk_type: vol.disk_type.clone(), + disk_id: vol.disk_id, + }); + } + } + + for (id, vol) in &last_volumes { + if !current_volumes.contains_key(id) { + del_vols.push(master_pb::VolumeShortInformationMessage { + id: *id, + collection: vol.collection.clone(), + version: vol.version, + replica_placement: vol.replica_placement, + ttl: vol.ttl, + disk_type: vol.disk_type.clone(), + disk_id: vol.disk_id, + }); + } + } + + let (new_ec_shards, deleted_ec_shards) = + diff_ec_shard_delta_messages(&last_ec_shards, ¤t_ec_shards); + + // Collect current state for state-only or combined delta heartbeats. + // Mirrors Go's StateUpdateChan case which sends state changes immediately. + let current_state = Some(volume_server_pb::VolumeServerState { + maintenance: state.maintenance.load(Ordering::Relaxed), + version: state.state_version.load(Ordering::Relaxed), + }); + + if !new_vols.is_empty() + || !del_vols.is_empty() + || !new_ec_shards.is_empty() + || !deleted_ec_shards.is_empty() + { + let delta_hb = master_pb::Heartbeat { + ip: config.ip.clone(), + port: config.port as u32, + grpc_port: config.grpc_port as u32, + public_url: config.public_url.clone(), + data_center: config.data_center.clone(), + rack: config.rack.clone(), + new_volumes: new_vols, + deleted_volumes: del_vols, + new_ec_shards, + deleted_ec_shards, + state: current_state, + ..Default::default() + }; + if tx.send(delta_hb).await.is_err() { + return Ok(None); + } + last_volumes = current_volumes; + last_ec_shards = current_ec_shards; + } else { + // State-only heartbeat (e.g., MarkReadonly/MarkWritable changed state + // without adding/removing volumes). Mirrors Go's StateUpdateChan case. + let state_hb = master_pb::Heartbeat { + ip: config.ip.clone(), + port: config.port as u32, + grpc_port: config.grpc_port as u32, + data_center: config.data_center.clone(), + rack: config.rack.clone(), + state: current_state, + ..Default::default() + }; + if tx.send(state_hb).await.is_err() { + return Ok(None); + } + } + } + + _ = shutdown_rx.recv() => { + state.is_heartbeating.store(false, Ordering::Relaxed); + send_deregister_heartbeat(config, state, &tx).await; + info!("Sent deregistration heartbeat"); + return Ok(None); + } + } + } +} + +async fn send_deregister_heartbeat( + config: &HeartbeatConfig, + state: &Arc, + tx: &tokio::sync::mpsc::Sender, +) { + let empty = { + let store = state.store.read().unwrap(); + let (location_uuids, disk_tags) = collect_location_metadata(&store); + master_pb::Heartbeat { + id: store.id.clone(), + ip: config.ip.clone(), + port: config.port as u32, + public_url: config.public_url.clone(), + max_file_key: 0, + data_center: config.data_center.clone(), + rack: config.rack.clone(), + has_no_volumes: true, + has_no_ec_shards: true, + grpc_port: config.grpc_port as u32, + location_uuids, + disk_tags, + ..Default::default() + } + }; + let _ = tx.send(empty).await; + tokio::time::sleep(Duration::from_millis(200)).await; +} + +fn apply_metrics_push_settings( + state: &VolumeServerState, + address: &str, + interval_seconds: u32, +) -> bool { + let mut runtime = state.metrics_runtime.write().unwrap(); + if runtime.push_gateway.address == address + && runtime.push_gateway.interval_seconds == interval_seconds + { + return false; + } + runtime.push_gateway.address = address.to_string(); + runtime.push_gateway.interval_seconds = interval_seconds; + true +} + +fn apply_storage_backends( + state: &VolumeServerState, + storage_backends: &[master_pb::StorageBackend], +) { + if storage_backends.is_empty() { + return; + } + + let mut registry = state.s3_tier_registry.write().unwrap(); + let mut global_registry = crate::remote_storage::s3_tier::global_s3_tier_registry() + .write() + .unwrap(); + for backend in storage_backends { + if backend.r#type != "s3" { + continue; + } + + let properties = &backend.properties; + let config = S3TierConfig { + access_key: properties + .get("aws_access_key_id") + .cloned() + .unwrap_or_default(), + secret_key: properties + .get("aws_secret_access_key") + .cloned() + .unwrap_or_default(), + region: properties.get("region").cloned().unwrap_or_default(), + bucket: properties.get("bucket").cloned().unwrap_or_default(), + endpoint: properties.get("endpoint").cloned().unwrap_or_default(), + storage_class: properties.get("storage_class").cloned().unwrap_or_default(), + force_path_style: parse_bool_property(properties.get("force_path_style")), + }; + + let backend_id = if backend.id.is_empty() { + "default" + } else { + backend.id.as_str() + }; + register_s3_backend(&mut registry, backend, backend_id, &config); + register_s3_backend(&mut global_registry, backend, backend_id, &config); + } +} + +fn register_s3_backend( + registry: &mut crate::remote_storage::s3_tier::S3TierRegistry, + backend: &master_pb::StorageBackend, + backend_id: &str, + config: &S3TierConfig, +) { + let qualified_name = format!("{}.{}", backend.r#type, backend_id); + if registry.get(&qualified_name).is_none() { + registry.register(qualified_name, S3TierBackend::new(config)); + } + if backend_id == "default" && registry.get(&backend.r#type).is_none() { + registry.register(backend.r#type.clone(), S3TierBackend::new(config)); + } +} + +fn parse_bool_property(value: Option<&String>) -> bool { + value + .map(|v| { + matches!( + v.trim().to_ascii_lowercase().as_str(), + "1" | "t" | "true" | "y" | "yes" | "on" + ) + }) + .unwrap_or(true) +} + +/// Collect volume information into a Heartbeat message. +fn collect_heartbeat( + config: &HeartbeatConfig, + state: &Arc, +) -> master_pb::Heartbeat { + let mut store = state.store.write().unwrap(); + let (ec_shards, deleted_ec_shards) = store.delete_expired_ec_volumes(); + build_heartbeat_with_ec_status( + config, + &mut store, + deleted_ec_shards, + ec_shards.is_empty(), + ) +} + +fn collect_location_metadata(store: &Store) -> (Vec, Vec) { + let location_uuids = store + .locations + .iter() + .map(|loc| loc.directory_uuid.clone()) + .collect(); + let disk_tags = store + .locations + .iter() + .enumerate() + .map(|(disk_id, loc)| master_pb::DiskTag { + disk_id: disk_id as u32, + tags: loc.tags.clone(), + }) + .collect(); + (location_uuids, disk_tags) +} + +#[cfg(test)] +fn build_heartbeat(config: &HeartbeatConfig, store: &mut Store) -> master_pb::Heartbeat { + let has_no_ec_shards = collect_live_ec_shards(store, false).is_empty(); + build_heartbeat_with_ec_status(config, store, Vec::new(), has_no_ec_shards) +} + +fn build_heartbeat_with_ec_status( + config: &HeartbeatConfig, + store: &mut Store, + deleted_ec_shards: Vec, + has_no_ec_shards: bool, +) -> master_pb::Heartbeat { + const MAX_TTL_VOLUME_REMOVAL_DELAY: u32 = 10; + + #[derive(Default)] + struct ReadOnlyCounts { + is_read_only: u32, + no_write_or_delete: u32, + no_write_can_delete: u32, + is_disk_space_low: u32, + } + + let mut volumes = Vec::new(); + let mut max_file_key = NeedleId(0); + let mut max_volume_counts: HashMap = HashMap::new(); + + // Collect per-collection disk size and read-only counts for metrics + let mut disk_sizes: HashMap = HashMap::new(); // (normal, deleted) + let mut ro_counts: HashMap = HashMap::new(); + + let volume_size_limit = store.volume_size_limit.load(Ordering::Relaxed); + + for (disk_id, loc) in store.locations.iter_mut().enumerate() { + let disk_type_str = loc.disk_type.to_string(); + let mut effective_max_count = loc.max_volume_count.load(Ordering::Relaxed); + if loc.is_disk_space_low.load(Ordering::Relaxed) { + let used_slots = loc.volumes_len() as i32 + + ((loc.ec_shard_count() + + crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT + - 1) + / crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT) + as i32; + effective_max_count = used_slots; + } + if effective_max_count < 0 { + effective_max_count = 0; + } + *max_volume_counts.entry(disk_type_str).or_insert(0) += effective_max_count as u32; + + let mut delete_vids = Vec::new(); + for (_, vol) in loc.iter_volumes() { + let cur_max = vol.max_file_key(); + if cur_max > max_file_key { + max_file_key = cur_max; + } + + let volume_size = vol.dat_file_size().unwrap_or(0); + let mut should_delete_volume = false; + + if vol.last_io_error().is_some() { + delete_vids.push(vol.id); + should_delete_volume = true; + } else if !vol.is_expired(volume_size, volume_size_limit) { + let (remote_storage_name, remote_storage_key) = vol.remote_storage_name_key(); + volumes.push(master_pb::VolumeInformationMessage { + id: vol.id.0, + size: volume_size, + collection: vol.collection.clone(), + file_count: vol.file_count() as u64, + delete_count: vol.deleted_count() as u64, + deleted_byte_count: vol.deleted_size(), + read_only: vol.is_read_only(), + replica_placement: vol.super_block.replica_placement.to_byte() as u32, + version: vol.super_block.version.0 as u32, + ttl: vol.super_block.ttl.to_u32(), + compact_revision: vol.last_compact_revision() as u32, + modified_at_second: vol.last_modified_ts() as i64, + disk_type: loc.disk_type.to_string(), + disk_id: disk_id as u32, + remote_storage_name, + remote_storage_key, + ..Default::default() + }); + } else if vol.is_expired_long_enough(MAX_TTL_VOLUME_REMOVAL_DELAY) { + delete_vids.push(vol.id); + should_delete_volume = true; + } + + // Track disk size by collection + let entry = disk_sizes.entry(vol.collection.clone()).or_insert((0, 0)); + if !should_delete_volume { + entry.0 += volume_size; + entry.1 += vol.deleted_size(); + } + + let read_only = ro_counts.entry(vol.collection.clone()).or_default(); + if !should_delete_volume && vol.is_read_only() { + read_only.is_read_only += 1; + if vol.is_no_write_or_delete() { + read_only.no_write_or_delete += 1; + } + if vol.is_no_write_can_delete() { + read_only.no_write_can_delete += 1; + } + if loc.is_disk_space_low.load(Ordering::Relaxed) { + read_only.is_disk_space_low += 1; + } + } + + } + + for vid in delete_vids { + let _ = loc.delete_volume(vid, false); + } + } + + // Update disk size and read-only gauges + for (col, (normal, deleted)) in &disk_sizes { + crate::metrics::DISK_SIZE_GAUGE + .with_label_values(&[col, crate::metrics::DISK_SIZE_LABEL_NORMAL]) + .set(*normal as f64); + crate::metrics::DISK_SIZE_GAUGE + .with_label_values(&[col, crate::metrics::DISK_SIZE_LABEL_DELETED_BYTES]) + .set(*deleted as f64); + } + for (col, counts) in &ro_counts { + crate::metrics::READ_ONLY_VOLUME_GAUGE + .with_label_values(&[col, crate::metrics::READ_ONLY_LABEL_IS_READ_ONLY]) + .set(counts.is_read_only as f64); + crate::metrics::READ_ONLY_VOLUME_GAUGE + .with_label_values(&[col, crate::metrics::READ_ONLY_LABEL_NO_WRITE_OR_DELETE]) + .set(counts.no_write_or_delete as f64); + crate::metrics::READ_ONLY_VOLUME_GAUGE + .with_label_values(&[col, crate::metrics::READ_ONLY_LABEL_NO_WRITE_CAN_DELETE]) + .set(counts.no_write_can_delete as f64); + crate::metrics::READ_ONLY_VOLUME_GAUGE + .with_label_values(&[col, crate::metrics::READ_ONLY_LABEL_IS_DISK_SPACE_LOW]) + .set(counts.is_disk_space_low as f64); + } + // Update max volumes gauge + let total_max: i64 = max_volume_counts.values().map(|v| *v as i64).sum(); + crate::metrics::MAX_VOLUMES.set(total_max); + + let has_no_volumes = volumes.is_empty(); + let (location_uuids, disk_tags) = collect_location_metadata(store); + + master_pb::Heartbeat { + id: store.id.clone(), + ip: config.ip.clone(), + port: config.port as u32, + public_url: config.public_url.clone(), + max_file_key: max_file_key.0, + data_center: config.data_center.clone(), + rack: config.rack.clone(), + admin_port: config.port as u32, + volumes, + deleted_ec_shards, + has_no_volumes, + has_no_ec_shards, + max_volume_counts, + grpc_port: config.grpc_port as u32, + location_uuids, + disk_tags, + ..Default::default() + } +} + +fn collect_live_ec_shards( + store: &Store, + update_metrics: bool, +) -> Vec { + let mut ec_shards = Vec::new(); + let mut ec_sizes: HashMap = HashMap::new(); + + for (disk_id, loc) in store.locations.iter().enumerate() { + for (_, ec_vol) in loc.ec_volumes() { + for message in ec_vol.to_volume_ec_shard_information_messages(disk_id as u32) { + if update_metrics { + let total_size: u64 = message + .shard_sizes + .iter() + .map(|size| (*size).max(0) as u64) + .sum(); + *ec_sizes.entry(message.collection.clone()).or_insert(0) += total_size; + } + ec_shards.push(message); + } + } + } + + if update_metrics { + for (col, size) in &ec_sizes { + crate::metrics::DISK_SIZE_GAUGE + .with_label_values(&[col, crate::metrics::DISK_SIZE_LABEL_EC]) + .set(*size as f64); + } + } + + ec_shards +} + +/// Collect EC shard information into a Heartbeat message. +fn collect_ec_heartbeat(config: &HeartbeatConfig, state: &Arc) -> master_pb::Heartbeat { + let store = state.store.read().unwrap(); + let ec_shards = collect_live_ec_shards(&store, true); + + let has_no = ec_shards.is_empty(); + master_pb::Heartbeat { + ip: config.ip.clone(), + port: config.port as u32, + grpc_port: config.grpc_port as u32, + data_center: config.data_center.clone(), + rack: config.rack.clone(), + ec_shards, + has_no_ec_shards: has_no, + ..Default::default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::MinFreeSpace; + use crate::config::ReadMode; + use crate::metrics::{ + DISK_SIZE_GAUGE, DISK_SIZE_LABEL_DELETED_BYTES, DISK_SIZE_LABEL_EC, + DISK_SIZE_LABEL_NORMAL, READ_ONLY_LABEL_IS_DISK_SPACE_LOW, + READ_ONLY_LABEL_IS_READ_ONLY, READ_ONLY_LABEL_NO_WRITE_CAN_DELETE, + READ_ONLY_LABEL_NO_WRITE_OR_DELETE, READ_ONLY_VOLUME_GAUGE, + }; + use crate::remote_storage::s3_tier::S3TierRegistry; + use crate::security::{Guard, SigningKey}; + use crate::storage::needle_map::NeedleMapKind; + use crate::storage::types::{DiskType, Version, VolumeId}; + use std::sync::atomic::Ordering; + use std::sync::RwLock; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn test_config() -> HeartbeatConfig { + HeartbeatConfig { + ip: "127.0.0.1".to_string(), + port: 8080, + grpc_port: 18080, + public_url: "127.0.0.1:8080".to_string(), + data_center: "dc1".to_string(), + rack: "rack1".to_string(), + master_addresses: Vec::new(), + pulse_seconds: 5, + } + } + + fn test_state_with_store(store: Store) -> Arc { + Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(Guard::new( + &[], + SigningKey(vec![]), + 0, + SigningKey(vec![]), + 0, + )), + is_stopping: RwLock::new(false), + maintenance: std::sync::atomic::AtomicBool::new(false), + state_version: std::sync::atomic::AtomicU32::new(0), + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(60), + inflight_download_data_timeout: std::time::Duration::from_secs(60), + inflight_upload_bytes: std::sync::atomic::AtomicI64::new(0), + inflight_download_bytes: std::sync::atomic::AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: String::new(), + rack: String::new(), + file_size_limit_bytes: 0, + maintenance_byte_per_second: 0, + is_heartbeating: std::sync::atomic::AtomicBool::new(false), + has_master: true, + pre_stop_seconds: 0, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new(S3TierRegistry::new()), + read_mode: ReadMode::Local, + master_url: String::new(), + master_urls: Vec::new(), + self_url: String::new(), + http_client: reqwest::Client::new(), + outgoing_http_scheme: "http".to_string(), + outgoing_grpc_tls: None, + metrics_runtime: std::sync::RwLock::new(Default::default()), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: false, + has_slow_read: true, + read_buffer_size_bytes: 4 * 1024 * 1024, + security_file: String::new(), + cli_white_list: vec![], + state_file_path: String::new(), + }) + } + + #[test] + fn test_build_heartbeat_includes_store_identity_and_disk_metadata() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store.id = "volume-node-a".to_string(); + store + .add_location( + dir, + dir, + 3, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + vec!["fast".to_string(), "ssd".to_string()], + ) + .unwrap(); + store + .add_volume( + VolumeId(7), + "pics", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + + let heartbeat = build_heartbeat(&test_config(), &mut store); + + assert_eq!(heartbeat.id, "volume-node-a"); + assert_eq!(heartbeat.volumes.len(), 1); + assert!(!heartbeat.has_no_volumes); + assert_eq!( + heartbeat.location_uuids, + vec![store.locations[0].directory_uuid.clone()] + ); + assert_eq!(heartbeat.disk_tags.len(), 1); + assert_eq!(heartbeat.disk_tags[0].disk_id, 0); + assert_eq!( + heartbeat.disk_tags[0].tags, + vec!["fast".to_string(), "ssd".to_string()] + ); + } + + #[test] + fn test_build_heartbeat_marks_empty_store_as_has_no_volumes() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store.id = "volume-node-b".to_string(); + store + .add_location( + dir, + dir, + 2, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let heartbeat = build_heartbeat(&test_config(), &mut store); + + assert!(heartbeat.volumes.is_empty()); + assert!(heartbeat.has_no_volumes); + } + + #[test] + fn test_build_heartbeat_tracks_go_read_only_labels_and_disk_id() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store + .add_volume( + VolumeId(17), + "heartbeat_metrics_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + store.locations[0] + .is_disk_space_low + .store(true, Ordering::Relaxed); + + { + let (_, volume) = store.find_volume_mut(VolumeId(17)).unwrap(); + volume.set_read_only().unwrap(); + volume.volume_info.files.push(Default::default()); + volume.refresh_remote_write_mode(); + } + + let heartbeat = build_heartbeat(&test_config(), &mut store); + let collection = "heartbeat_metrics_case"; + let disk_type = store.locations[0].disk_type.to_string(); + + assert_eq!(heartbeat.volumes.len(), 1); + assert_eq!(heartbeat.volumes[0].disk_id, 0); + assert_eq!(heartbeat.max_volume_counts[&disk_type], 1); + assert_eq!( + READ_ONLY_VOLUME_GAUGE + .with_label_values(&[collection, READ_ONLY_LABEL_IS_READ_ONLY]) + .get(), + 1.0 + ); + assert_eq!( + READ_ONLY_VOLUME_GAUGE + .with_label_values(&[collection, READ_ONLY_LABEL_NO_WRITE_OR_DELETE]) + .get(), + 0.0 + ); + assert_eq!( + READ_ONLY_VOLUME_GAUGE + .with_label_values(&[collection, READ_ONLY_LABEL_NO_WRITE_CAN_DELETE]) + .get(), + 1.0 + ); + assert_eq!( + READ_ONLY_VOLUME_GAUGE + .with_label_values(&[collection, READ_ONLY_LABEL_IS_DISK_SPACE_LOW]) + .get(), + 1.0 + ); + assert_eq!( + DISK_SIZE_GAUGE + .with_label_values(&[collection, DISK_SIZE_LABEL_NORMAL]) + .get(), + crate::storage::super_block::SUPER_BLOCK_SIZE as f64 + ); + assert_eq!( + DISK_SIZE_GAUGE + .with_label_values(&[collection, DISK_SIZE_LABEL_DELETED_BYTES]) + .get(), + 0.0 + ); + } + + #[test] + fn test_collect_ec_heartbeat_sets_go_metadata_and_ec_metrics() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let shard_path = format!("{}/ec_metrics_case_27.ec00", dir); + std::fs::write(&shard_path, b"ec-shard").unwrap(); + store.locations[0] + .mount_ec_shards(VolumeId(27), "ec_metrics_case", &[0]) + .unwrap(); + + let state = test_state_with_store(store); + let heartbeat = collect_ec_heartbeat(&test_config(), &state); + + assert_eq!(heartbeat.ec_shards.len(), 1); + assert!(!heartbeat.has_no_ec_shards); + assert_eq!(heartbeat.ec_shards[0].disk_id, 0); + assert_eq!( + heartbeat.ec_shards[0].disk_type, + state.store.read().unwrap().locations[0].disk_type.to_string() + ); + assert_eq!(heartbeat.ec_shards[0].ec_index_bits, 1); + assert_eq!(heartbeat.ec_shards[0].shard_sizes, vec![8]); + assert_eq!( + DISK_SIZE_GAUGE + .with_label_values(&["ec_metrics_case", DISK_SIZE_LABEL_EC]) + .get(), + 8.0 + ); + } + + #[test] + fn test_collect_heartbeat_deletes_expired_ec_volumes() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + std::fs::write(format!("{}/expired_heartbeat_ec_31.ec00", dir), b"expired").unwrap(); + store.locations[0] + .mount_ec_shards(VolumeId(31), "expired_heartbeat_ec", &[0]) + .unwrap(); + store + .find_ec_volume_mut(VolumeId(31)) + .unwrap() + .expire_at_sec = 1; + + let state = test_state_with_store(store); + let heartbeat = collect_heartbeat(&test_config(), &state); + + assert!(heartbeat.has_no_ec_shards); + assert_eq!(heartbeat.deleted_ec_shards.len(), 1); + assert_eq!(heartbeat.deleted_ec_shards[0].id, 31); + assert!(!state.store.read().unwrap().has_ec_volume(VolumeId(31))); + } + + #[test] + fn test_collect_heartbeat_excludes_expired_volume_until_removal_delay() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store.volume_size_limit.store(1, Ordering::Relaxed); + store + .add_volume( + VolumeId(41), + "expired_volume_case", + None, + Some(crate::storage::needle::ttl::TTL::read("20m").unwrap()), + 1024, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + let dat_path = { + let (_, volume) = store.find_volume_mut(VolumeId(41)).unwrap(); + volume.set_last_io_error_for_test(None); + volume.set_last_modified_ts_for_test( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + .saturating_sub(21 * 60), + ); + volume.dat_path() + }; + std::fs::OpenOptions::new() + .write(true) + .open(&dat_path) + .unwrap() + .set_len((crate::storage::super_block::SUPER_BLOCK_SIZE + 1) as u64) + .unwrap(); + let volume_size_limit = store.volume_size_limit.load(Ordering::Relaxed); + let (_, volume) = store.find_volume(VolumeId(41)).unwrap(); + assert!(volume.is_expired(volume.dat_file_size().unwrap_or(0), volume_size_limit)); + assert!(!volume.is_expired_long_enough(10)); + + let heartbeat = build_heartbeat(&test_config(), &mut store); + + assert!(heartbeat.volumes.is_empty()); + assert!(store.has_volume(VolumeId(41))); + } + + #[test] + fn test_collect_heartbeat_deletes_io_error_volume() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store + .add_volume( + VolumeId(51), + "io_error_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + let (_, volume) = store.find_volume_mut(VolumeId(51)).unwrap(); + volume.set_last_io_error_for_test(Some("input/output error")); + + let heartbeat = build_heartbeat(&test_config(), &mut store); + + assert!(heartbeat.volumes.is_empty()); + assert!(!store.has_volume(VolumeId(51))); + } + + #[test] + fn test_build_heartbeat_includes_remote_storage_name_and_key() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store + .add_volume( + VolumeId(71), + "remote_volume_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + let (_, volume) = store.find_volume_mut(VolumeId(71)).unwrap(); + volume.volume_info.files.push(crate::storage::volume::PbRemoteFile { + backend_type: "s3".to_string(), + backend_id: "archive".to_string(), + key: "volumes/71.dat".to_string(), + ..Default::default() + }); + volume.refresh_remote_write_mode(); + + let heartbeat = build_heartbeat(&test_config(), &mut store); + + assert_eq!(heartbeat.volumes.len(), 1); + assert_eq!(heartbeat.volumes[0].remote_storage_name, "s3.archive"); + assert_eq!(heartbeat.volumes[0].remote_storage_key, "volumes/71.dat"); + } + + #[test] + fn test_apply_storage_backends_registers_s3_default_aliases() { + let state = test_state_with_store(Store::new(NeedleMapKind::InMemory)); + crate::remote_storage::s3_tier::global_s3_tier_registry() + .write() + .unwrap() + .clear(); + + apply_storage_backends( + &state, + &[master_pb::StorageBackend { + r#type: "s3".to_string(), + id: "default".to_string(), + properties: std::collections::HashMap::from([ + ("aws_access_key_id".to_string(), "access".to_string()), + ("aws_secret_access_key".to_string(), "secret".to_string()), + ("bucket".to_string(), "bucket-a".to_string()), + ("region".to_string(), "us-west-2".to_string()), + ("endpoint".to_string(), "http://127.0.0.1:8333".to_string()), + ("storage_class".to_string(), "STANDARD".to_string()), + ("force_path_style".to_string(), "false".to_string()), + ]), + }], + ); + + let registry = state.s3_tier_registry.read().unwrap(); + assert!(registry.get("s3.default").is_some()); + assert!(registry.get("s3").is_some()); + let global_registry = crate::remote_storage::s3_tier::global_s3_tier_registry() + .read() + .unwrap(); + assert!(global_registry.get("s3.default").is_some()); + assert!(global_registry.get("s3").is_some()); + } + + #[test] + fn test_apply_storage_backends_ignores_unsupported_types() { + let state = test_state_with_store(Store::new(NeedleMapKind::InMemory)); + crate::remote_storage::s3_tier::global_s3_tier_registry() + .write() + .unwrap() + .clear(); + + apply_storage_backends( + &state, + &[master_pb::StorageBackend { + r#type: "rclone".to_string(), + id: "default".to_string(), + properties: std::collections::HashMap::new(), + }], + ); + + let registry = state.s3_tier_registry.read().unwrap(); + assert!(registry.names().is_empty()); + let global_registry = crate::remote_storage::s3_tier::global_s3_tier_registry() + .read() + .unwrap(); + assert!(global_registry.names().is_empty()); + } + + #[test] + fn test_apply_metrics_push_settings_updates_runtime_state() { + let store = Store::new(NeedleMapKind::InMemory); + let state = test_state_with_store(store); + + assert!(apply_metrics_push_settings(&state, "pushgateway:9091", 15,)); + { + let runtime = state.metrics_runtime.read().unwrap(); + assert_eq!(runtime.push_gateway.address, "pushgateway:9091"); + assert_eq!(runtime.push_gateway.interval_seconds, 15); + } + + assert!(!apply_metrics_push_settings(&state, "pushgateway:9091", 15,)); + } + + #[test] + fn test_duplicate_uuid_retry_delay_matches_go_backoff() { + assert_eq!(duplicate_uuid_retry_delay(0), Duration::from_secs(2)); + assert_eq!(duplicate_uuid_retry_delay(1), Duration::from_secs(4)); + assert_eq!(duplicate_uuid_retry_delay(2), Duration::from_secs(8)); + } + + #[test] + fn test_duplicate_directories_maps_master_uuids_to_paths() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 1, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let duplicate_dirs = duplicate_directories( + &store, + &[ + store.locations[0].directory_uuid.clone(), + "missing-uuid".to_string(), + ], + ); + + assert_eq!(duplicate_dirs, vec![dir.to_string()]); + } + + #[test] + fn test_apply_master_volume_options_updates_preallocate_and_size_limit() { + let store = Store::new(NeedleMapKind::InMemory); + store.volume_size_limit.store(1024, Ordering::Relaxed); + + let changed = apply_master_volume_options( + &store, + &master_pb::HeartbeatResponse { + volume_size_limit: 2048, + preallocate: true, + ..Default::default() + }, + ); + + assert!(store.get_preallocate()); + assert_eq!(store.volume_size_limit.load(Ordering::Relaxed), 2048); + assert!(!changed); + } + + #[test] + fn test_diff_ec_shard_delta_messages_reports_mounts_and_unmounts() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let previous = collect_ec_shard_delta_messages(&store); + + std::fs::write(format!("{}/ec_delta_case_81.ec00", dir), b"delta").unwrap(); + store.locations[0] + .mount_ec_shards(VolumeId(81), "ec_delta_case", &[0]) + .unwrap(); + let current = collect_ec_shard_delta_messages(&store); + let (new_ec_shards, deleted_ec_shards) = + diff_ec_shard_delta_messages(&previous, ¤t); + + assert_eq!(new_ec_shards.len(), 1); + assert!(deleted_ec_shards.is_empty()); + assert_eq!(new_ec_shards[0].ec_index_bits, 1); + assert_eq!(new_ec_shards[0].shard_sizes, vec![5]); + + let (new_after_delete, deleted_after_delete) = + diff_ec_shard_delta_messages(¤t, &HashMap::new()); + assert!(new_after_delete.is_empty()); + assert_eq!(deleted_after_delete.len(), 1); + assert_eq!(deleted_after_delete[0].ec_index_bits, 1); + assert_eq!(deleted_after_delete[0].shard_sizes, vec![0]); + } +} diff --git a/seaweed-volume/src/server/memory_status.rs b/seaweed-volume/src/server/memory_status.rs new file mode 100644 index 000000000..92886465f --- /dev/null +++ b/seaweed-volume/src/server/memory_status.rs @@ -0,0 +1,102 @@ +use crate::pb::volume_server_pb; + +pub fn collect_mem_status() -> volume_server_pb::MemStatus { + #[allow(unused_mut)] + let mut mem = volume_server_pb::MemStatus { + goroutines: 1, + ..Default::default() + }; + + #[cfg(target_os = "linux")] + { + if let Some((all, free)) = get_system_memory_linux() { + mem.all = all; + mem.free = free; + mem.used = all.saturating_sub(free); + } + + if let Some(status) = read_process_status_linux() { + if status.threads > 0 { + mem.goroutines = status.threads as i32; + } + if let Some(rss) = status.rss { + mem.self_ = rss; + } + if let Some(heap) = status.data.or(status.rss) { + mem.heap = heap; + } + if let Some(stack) = status.stack { + mem.stack = stack; + } + } + } + + mem +} + +#[cfg(target_os = "linux")] +fn get_system_memory_linux() -> Option<(u64, u64)> { + unsafe { + let mut info: libc::sysinfo = std::mem::zeroed(); + if libc::sysinfo(&mut info) == 0 { + let unit = info.mem_unit as u64; + let total = info.totalram as u64 * unit; + let free = info.freeram as u64 * unit; + return Some((total, free)); + } + } + None +} + +#[cfg(target_os = "linux")] +#[derive(Default)] +struct ProcessStatus { + threads: u64, + rss: Option, + data: Option, + stack: Option, +} + +#[cfg(target_os = "linux")] +fn read_process_status_linux() -> Option { + let status = std::fs::read_to_string("/proc/self/status").ok()?; + let mut out = ProcessStatus::default(); + + for line in status.lines() { + if let Some(value) = line.strip_prefix("Threads:") { + out.threads = value.trim().parse().ok()?; + continue; + } + if let Some(value) = parse_proc_status_kib_field(line, "VmRSS:") { + out.rss = Some(value); + continue; + } + if let Some(value) = parse_proc_status_kib_field(line, "VmData:") { + out.data = Some(value); + continue; + } + if let Some(value) = parse_proc_status_kib_field(line, "VmStk:") { + out.stack = Some(value); + } + } + + Some(out) +} + +#[cfg(target_os = "linux")] +fn parse_proc_status_kib_field(line: &str, prefix: &str) -> Option { + let raw = line.strip_prefix(prefix)?.trim(); + let value = raw.strip_suffix(" kB").unwrap_or(raw).trim(); + value.parse::().ok().map(|kib| kib * 1024) +} + +#[cfg(test)] +mod tests { + use super::collect_mem_status; + + #[test] + fn test_collect_mem_status_reports_live_process_state() { + let mem = collect_mem_status(); + assert!(mem.goroutines > 0); + } +} diff --git a/seaweed-volume/src/server/mod.rs b/seaweed-volume/src/server/mod.rs new file mode 100644 index 000000000..6103b4980 --- /dev/null +++ b/seaweed-volume/src/server/mod.rs @@ -0,0 +1,12 @@ +pub mod debug; +pub mod grpc_client; +pub mod grpc_server; +pub mod handlers; +pub mod heartbeat; +pub mod memory_status; +pub mod profiling; +pub mod request_id; +pub mod server_stats; +pub mod ui; +pub mod volume_server; +pub mod write_queue; diff --git a/seaweed-volume/src/server/profiling.rs b/seaweed-volume/src/server/profiling.rs new file mode 100644 index 000000000..1965d227f --- /dev/null +++ b/seaweed-volume/src/server/profiling.rs @@ -0,0 +1,187 @@ +use std::fs::File; +use std::io::Write; +use std::path::PathBuf; + +use pprof::protos::Message; + +use crate::config::VolumeServerConfig; + +const GO_CPU_PROFILE_FREQUENCY: i32 = 100; +const GO_PPROF_BLOCKLIST: [&str; 4] = ["libc", "libgcc", "pthread", "vdso"]; + +pub struct CpuProfileSession { + output_path: PathBuf, + guard: pprof::ProfilerGuard<'static>, +} + +impl CpuProfileSession { + pub fn start(config: &VolumeServerConfig) -> Result, String> { + if config.cpu_profile.is_empty() { + if !config.mem_profile.is_empty() && !config.pprof { + tracing::warn!( + "--memprofile is not yet supported in the Rust volume server; ignoring '{}'", + config.mem_profile + ); + } + return Ok(None); + } + + if config.pprof { + tracing::info!( + "--pprof is enabled; ignoring --cpuprofile '{}' and --memprofile '{}'", + config.cpu_profile, + config.mem_profile + ); + return Ok(None); + } + + if !config.mem_profile.is_empty() { + tracing::warn!( + "--memprofile is not yet supported in the Rust volume server; only --cpuprofile '{}' will be written", + config.cpu_profile + ); + } + + let guard = pprof::ProfilerGuardBuilder::default() + .frequency(GO_CPU_PROFILE_FREQUENCY) + .blocklist(&GO_PPROF_BLOCKLIST) + .build() + .map_err(|e| { + format!( + "Failed to start CPU profiler '{}': {}", + config.cpu_profile, e + ) + })?; + + Ok(Some(Self { + output_path: PathBuf::from(&config.cpu_profile), + guard, + })) + } + + pub fn finish(self) -> Result<(), String> { + let report = self + .guard + .report() + .build() + .map_err(|e| format!("Failed to build CPU profile report: {}", e))?; + let profile = report + .pprof() + .map_err(|e| format!("Failed to encode CPU profile report: {}", e))?; + + let mut bytes = Vec::new(); + profile + .encode(&mut bytes) + .map_err(|e| format!("Failed to serialize CPU profile report: {}", e))?; + + let mut file = File::create(&self.output_path).map_err(|e| { + format!( + "Failed to create CPU profile '{}': {}", + self.output_path.display(), + e + ) + })?; + file.write_all(&bytes).map_err(|e| { + format!( + "Failed to write CPU profile '{}': {}", + self.output_path.display(), + e + ) + })?; + file.flush().map_err(|e| { + format!( + "Failed to flush CPU profile '{}': {}", + self.output_path.display(), + e + ) + })?; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::CpuProfileSession; + use crate::config::{NeedleMapKind, ReadMode, VolumeServerConfig}; + use crate::security::tls::TlsPolicy; + + fn sample_config() -> VolumeServerConfig { + VolumeServerConfig { + port: 8080, + grpc_port: 18080, + public_port: 8080, + ip: "127.0.0.1".to_string(), + bind_ip: "127.0.0.1".to_string(), + public_url: "127.0.0.1:8080".to_string(), + id: "127.0.0.1:8080".to_string(), + masters: vec![], + pre_stop_seconds: 0, + idle_timeout: 0, + data_center: String::new(), + rack: String::new(), + index_type: NeedleMapKind::InMemory, + disk_type: String::new(), + folders: vec!["/tmp".to_string()], + folder_max_limits: vec![8], + folder_tags: vec![vec![]], + min_free_spaces: vec![], + disk_types: vec![String::new()], + idx_folder: String::new(), + white_list: vec![], + fix_jpg_orientation: false, + read_mode: ReadMode::Local, + cpu_profile: String::new(), + mem_profile: String::new(), + compaction_byte_per_second: 0, + maintenance_byte_per_second: 0, + file_size_limit_bytes: 0, + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(0), + inflight_download_data_timeout: std::time::Duration::from_secs(0), + has_slow_read: false, + read_buffer_size_mb: 4, + ldb_timeout: 0, + pprof: false, + metrics_port: 0, + metrics_ip: String::new(), + debug: false, + debug_port: 0, + ui_enabled: false, + jwt_signing_key: vec![], + jwt_signing_expires_seconds: 0, + jwt_read_signing_key: vec![], + jwt_read_signing_expires_seconds: 0, + https_cert_file: String::new(), + https_key_file: String::new(), + https_ca_file: String::new(), + https_client_enabled: false, + https_client_cert_file: String::new(), + https_client_key_file: String::new(), + https_client_ca_file: String::new(), + grpc_cert_file: String::new(), + grpc_key_file: String::new(), + grpc_ca_file: String::new(), + grpc_allowed_wildcard_domain: String::new(), + grpc_volume_allowed_common_names: vec![], + tls_policy: TlsPolicy::default(), + enable_write_queue: false, + security_file: String::new(), + } + } + + #[test] + fn test_cpu_profile_session_skips_when_disabled() { + let config = sample_config(); + assert!(CpuProfileSession::start(&config).unwrap().is_none()); + } + + #[test] + fn test_cpu_profile_session_skips_when_pprof_enabled() { + let mut config = sample_config(); + config.cpu_profile = "/tmp/cpu.pb".to_string(); + config.pprof = true; + assert!(CpuProfileSession::start(&config).unwrap().is_none()); + } +} diff --git a/seaweed-volume/src/server/request_id.rs b/seaweed-volume/src/server/request_id.rs new file mode 100644 index 000000000..f3e43c560 --- /dev/null +++ b/seaweed-volume/src/server/request_id.rs @@ -0,0 +1,137 @@ +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use hyper::http::{self, HeaderValue}; +use tonic::metadata::MetadataValue; +use tonic::{Request, Status}; +use tower::{Layer, Service}; + +tokio::task_local! { + static CURRENT_REQUEST_ID: String; +} + +#[derive(Clone, Debug, Default)] +pub struct GrpcRequestIdLayer; + +#[derive(Clone, Debug)] +pub struct GrpcRequestIdService { + inner: S, +} + +impl Layer for GrpcRequestIdLayer { + type Service = GrpcRequestIdService; + + fn layer(&self, inner: S) -> Self::Service { + GrpcRequestIdService { inner } + } +} + +impl Service> for GrpcRequestIdService +where + S: Service, Response = http::Response> + Send + 'static, + S::Future: Send + 'static, + B: Send + 'static, +{ + type Response = http::Response; + type Error = S::Error; + type Future = Pin> + Send>>; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.inner.poll_ready(cx) + } + + fn call(&mut self, mut request: http::Request) -> Self::Future { + let request_id = match request.headers().get("x-amz-request-id") { + Some(value) => match value.to_str() { + Ok(value) if !value.is_empty() => value.to_owned(), + _ => generate_grpc_request_id(), + }, + None => generate_grpc_request_id(), + }; + + if let Ok(value) = HeaderValue::from_str(&request_id) { + request.headers_mut().insert("x-amz-request-id", value); + } + + let future = self.inner.call(request); + + Box::pin(async move { + let mut response: http::Response = + scope_request_id(request_id.clone(), future).await?; + if let Ok(value) = HeaderValue::from_str(&request_id) { + response.headers_mut().insert("x-amz-request-id", value); + } + Ok(response) + }) + } +} + +pub async fn scope_request_id(request_id: String, future: F) -> T +where + F: Future, +{ + CURRENT_REQUEST_ID.scope(request_id, future).await +} + +pub fn current_request_id() -> Option { + CURRENT_REQUEST_ID.try_with(Clone::clone).ok() +} + +pub fn outgoing_request_id_interceptor(mut request: Request<()>) -> Result, Status> { + if let Some(request_id) = current_request_id() { + let value = MetadataValue::try_from(request_id.as_str()) + .map_err(|_| Status::internal("invalid scoped request id"))?; + request.metadata_mut().insert("x-amz-request-id", value); + } + Ok(request) +} + +pub fn generate_http_request_id() -> String { + use rand::Rng; + + let nanos = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + let rand_val: u32 = rand::thread_rng().gen(); + format!("{:X}{:08X}", nanos, rand_val) +} + +fn generate_grpc_request_id() -> String { + uuid::Uuid::new_v4().to_string() +} + +#[cfg(test)] +mod tests { + use super::{current_request_id, outgoing_request_id_interceptor, scope_request_id}; + use tonic::Request; + + #[tokio::test] + async fn test_scope_request_id_exposes_current_value() { + let request_id = "req-123".to_string(); + let current = scope_request_id( + request_id.clone(), + async move { current_request_id().unwrap() }, + ) + .await; + assert_eq!(current, request_id); + } + + #[tokio::test] + async fn test_outgoing_request_id_interceptor_propagates_scope() { + let request = scope_request_id("req-456".to_string(), async move { + outgoing_request_id_interceptor(Request::new(())).unwrap() + }) + .await; + assert_eq!( + request + .metadata() + .get("x-amz-request-id") + .unwrap() + .to_str() + .unwrap(), + "req-456" + ); + } +} diff --git a/seaweed-volume/src/server/server_stats.rs b/seaweed-volume/src/server/server_stats.rs new file mode 100644 index 000000000..054b6d907 --- /dev/null +++ b/seaweed-volume/src/server/server_stats.rs @@ -0,0 +1,248 @@ +use chrono::{Datelike, Local, Timelike}; +use serde::Serialize; +use std::sync::{LazyLock, Mutex}; +use std::time::Instant; + +static START_TIME: LazyLock = LazyLock::new(Instant::now); +static SERVER_STATS: LazyLock = LazyLock::new(ServerStats::default); + +#[derive(Default)] +pub struct ServerStats { + inner: Mutex, +} + +#[derive(Default)] +struct ServerStatsInner { + requests: DurationCounter, + connections: DurationCounter, + assign_requests: DurationCounter, + read_requests: DurationCounter, + write_requests: DurationCounter, + delete_requests: DurationCounter, + bytes_in: DurationCounter, + bytes_out: DurationCounter, +} + +#[derive(Clone, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct ServerStatsSnapshot { + pub requests: DurationCounterSnapshot, + pub connections: DurationCounterSnapshot, + pub assign_requests: DurationCounterSnapshot, + pub read_requests: DurationCounterSnapshot, + pub write_requests: DurationCounterSnapshot, + pub delete_requests: DurationCounterSnapshot, + pub bytes_in: DurationCounterSnapshot, + pub bytes_out: DurationCounterSnapshot, +} + +#[derive(Clone, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct DurationCounterSnapshot { + pub minute_counter: RoundRobinCounterSnapshot, + pub hour_counter: RoundRobinCounterSnapshot, + pub day_counter: RoundRobinCounterSnapshot, + pub week_counter: RoundRobinCounterSnapshot, +} + +#[derive(Clone, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct RoundRobinCounterSnapshot { + pub last_index: i32, + pub values: Vec, + pub counts: Vec, +} + +#[derive(Clone)] +struct DurationCounter { + minute_counter: RoundRobinCounter, + hour_counter: RoundRobinCounter, + day_counter: RoundRobinCounter, + week_counter: RoundRobinCounter, +} + +#[derive(Clone)] +struct RoundRobinCounter { + last_index: i32, + values: Vec, + counts: Vec, +} + +impl Default for DurationCounter { + fn default() -> Self { + Self { + minute_counter: RoundRobinCounter::new(60), + hour_counter: RoundRobinCounter::new(60), + day_counter: RoundRobinCounter::new(24), + week_counter: RoundRobinCounter::new(7), + } + } +} + +impl RoundRobinCounter { + fn new(slots: usize) -> Self { + Self { + last_index: -1, + values: vec![0; slots], + counts: vec![0; slots], + } + } + + fn add(&mut self, index: usize, val: i64) { + if index >= self.values.len() { + return; + } + while self.last_index != index as i32 { + self.last_index = (self.last_index + 1).rem_euclid(self.values.len() as i32); + self.values[self.last_index as usize] = 0; + self.counts[self.last_index as usize] = 0; + } + self.values[index] += val; + self.counts[index] += 1; + } + + fn snapshot(&self) -> RoundRobinCounterSnapshot { + RoundRobinCounterSnapshot { + last_index: self.last_index, + values: self.values.clone(), + counts: self.counts.clone(), + } + } +} + +impl DurationCounter { + fn add_now(&mut self, val: i64) { + let now = Local::now(); + self.minute_counter.add(now.second() as usize, val); + self.hour_counter.add(now.minute() as usize, val); + self.day_counter.add(now.hour() as usize, val); + self.week_counter + .add(now.weekday().num_days_from_sunday() as usize, val); + } + + fn snapshot(&self) -> DurationCounterSnapshot { + DurationCounterSnapshot { + minute_counter: self.minute_counter.snapshot(), + hour_counter: self.hour_counter.snapshot(), + day_counter: self.day_counter.snapshot(), + week_counter: self.week_counter.snapshot(), + } + } +} + +impl ServerStatsInner { + fn snapshot(&self) -> ServerStatsSnapshot { + ServerStatsSnapshot { + requests: self.requests.snapshot(), + connections: self.connections.snapshot(), + assign_requests: self.assign_requests.snapshot(), + read_requests: self.read_requests.snapshot(), + write_requests: self.write_requests.snapshot(), + delete_requests: self.delete_requests.snapshot(), + bytes_in: self.bytes_in.snapshot(), + bytes_out: self.bytes_out.snapshot(), + } + } +} + +impl ServerStats { + fn update(&self, update: F) + where + F: FnOnce(&mut ServerStatsInner), + { + let mut inner = self.inner.lock().unwrap(); + update(&mut inner); + } + + fn snapshot(&self) -> ServerStatsSnapshot { + self.inner.lock().unwrap().snapshot() + } +} + +impl RoundRobinCounterSnapshot { + pub fn to_list(&self) -> Vec { + if self.values.is_empty() { + return Vec::new(); + } + let mut ret = Vec::with_capacity(self.values.len()); + let mut index = self.last_index; + let mut step = self.values.len(); + while step > 0 { + step -= 1; + index += 1; + if index >= self.values.len() as i32 { + index = 0; + } + ret.push(self.values[index as usize]); + } + ret + } +} + +pub fn init_process_start() { + LazyLock::force(&START_TIME); + LazyLock::force(&SERVER_STATS); +} + +pub fn uptime_string() -> String { + let secs = START_TIME.elapsed().as_secs(); + let hours = secs / 3600; + let minutes = (secs % 3600) / 60; + let seconds = secs % 60; + let mut out = String::new(); + if hours > 0 { + out.push_str(&format!("{}h", hours)); + } + if hours > 0 || minutes > 0 { + out.push_str(&format!("{}m", minutes)); + } + out.push_str(&format!("{}s", seconds)); + out +} + +pub fn snapshot() -> ServerStatsSnapshot { + SERVER_STATS.snapshot() +} + +pub fn record_request_open() { + SERVER_STATS.update(|inner| inner.requests.add_now(1)); +} + +pub fn record_request_close() { + SERVER_STATS.update(|inner| inner.requests.add_now(-1)); +} + +pub fn record_connection_open() { + SERVER_STATS.update(|inner| inner.connections.add_now(1)); +} + +pub fn record_connection_close() { + SERVER_STATS.update(|inner| inner.connections.add_now(-1)); +} + +pub fn record_read_request() { + SERVER_STATS.update(|inner| inner.read_requests.add_now(1)); +} + +pub fn record_write_request() { + SERVER_STATS.update(|inner| inner.write_requests.add_now(1)); +} + +pub fn record_delete_request() { + SERVER_STATS.update(|inner| inner.delete_requests.add_now(1)); +} + +pub fn record_bytes_in(bytes: i64) { + SERVER_STATS.update(|inner| inner.bytes_in.add_now(bytes)); +} + +pub fn record_bytes_out(bytes: i64) { + SERVER_STATS.update(|inner| inner.bytes_out.add_now(bytes)); +} + +#[cfg(test)] +pub fn reset_for_tests() { + LazyLock::force(&START_TIME); + let mut inner = SERVER_STATS.inner.lock().unwrap(); + *inner = ServerStatsInner::default(); +} diff --git a/seaweed-volume/src/server/ui.rs b/seaweed-volume/src/server/ui.rs new file mode 100644 index 000000000..f1f830a56 --- /dev/null +++ b/seaweed-volume/src/server/ui.rs @@ -0,0 +1,507 @@ +use std::fmt::Write as _; + +use crate::server::server_stats; +use crate::server::volume_server::VolumeServerState; +use crate::storage::store::Store; + +pub struct EmbeddedAsset { + pub content_type: &'static str, + pub bytes: &'static [u8], +} + +struct UiDiskRow { + dir: String, + disk_type: String, + all: u64, + free: u64, + used: u64, +} + +struct UiVolumeRow { + id: u32, + collection: String, + disk_type: String, + size: u64, + file_count: i64, + delete_count: i64, + deleted_byte_count: u64, + ttl: String, + read_only: bool, + version: u32, + remote_storage_name: String, + remote_storage_key: String, +} + +struct UiEcShardRow { + shard_id: u8, + size: u64, +} + +struct UiEcVolumeRow { + volume_id: u32, + collection: String, + size: u64, + shards: Vec, + created_at: String, +} + +pub fn favicon_asset() -> EmbeddedAsset { + EmbeddedAsset { + content_type: "image/x-icon", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/favicon.ico" + )), + } +} + +pub fn lookup_static_asset(path: &str) -> Option { + let path = path.trim_start_matches('/'); + let asset = match path { + "bootstrap/3.3.1/css/bootstrap.min.css" => EmbeddedAsset { + content_type: "text/css; charset=utf-8", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/bootstrap/3.3.1/css/bootstrap.min.css" + )), + }, + "bootstrap/3.3.1/fonts/glyphicons-halflings-regular.eot" => EmbeddedAsset { + content_type: "application/vnd.ms-fontobject", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/bootstrap/3.3.1/fonts/glyphicons-halflings-regular.eot" + )), + }, + "bootstrap/3.3.1/fonts/glyphicons-halflings-regular.svg" => EmbeddedAsset { + content_type: "image/svg+xml", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/bootstrap/3.3.1/fonts/glyphicons-halflings-regular.svg" + )), + }, + "bootstrap/3.3.1/fonts/glyphicons-halflings-regular.ttf" => EmbeddedAsset { + content_type: "font/ttf", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/bootstrap/3.3.1/fonts/glyphicons-halflings-regular.ttf" + )), + }, + "bootstrap/3.3.1/fonts/glyphicons-halflings-regular.woff" => EmbeddedAsset { + content_type: "font/woff", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/bootstrap/3.3.1/fonts/glyphicons-halflings-regular.woff" + )), + }, + "images/folder.gif" => EmbeddedAsset { + content_type: "image/gif", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/images/folder.gif" + )), + }, + "javascript/jquery-3.6.0.min.js" => EmbeddedAsset { + content_type: "application/javascript; charset=utf-8", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/javascript/jquery-3.6.0.min.js" + )), + }, + "javascript/jquery-sparklines/2.1.2/jquery.sparkline.min.js" => EmbeddedAsset { + content_type: "application/javascript; charset=utf-8", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/javascript/jquery-sparklines/2.1.2/jquery.sparkline.min.js" + )), + }, + "seaweed50x50.png" => EmbeddedAsset { + content_type: "image/png", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/seaweed50x50.png" + )), + }, + _ => return None, + }; + Some(asset) +} + +pub fn render_volume_server_html(state: &VolumeServerState) -> String { + let counters = server_stats::snapshot(); + let (disk_rows, volume_rows, remote_volume_rows, ec_volume_rows) = { + let store = state.store.read().unwrap(); + collect_ui_data(&store) + }; + + let masters = if state.master_urls.is_empty() { + "[]".to_string() + } else { + format!("[{}]", state.master_urls.join(" ")) + }; + let uptime = server_stats::uptime_string(); + let read_week = join_i64(&counters.read_requests.week_counter.to_list()); + let read_day = join_i64(&counters.read_requests.day_counter.to_list()); + let read_hour = join_i64(&counters.read_requests.hour_counter.to_list()); + let read_minute = join_i64(&counters.read_requests.minute_counter.to_list()); + + let mut disk_rows_html = String::new(); + for disk in &disk_rows { + let _ = write!( + disk_rows_html, + "{}{}{}{}{:.2}%", + escape_html(&disk.dir), + escape_html(&disk.disk_type), + bytes_to_human_readable(disk.all), + bytes_to_human_readable(disk.free), + percent_from(disk.all, disk.used), + ); + } + + let mut volume_rows_html = String::new(); + for volume in &volume_rows { + let _ = write!( + volume_rows_html, + "{}{}{}{}{}{} / {}{}{}{}", + volume.id, + escape_html(&volume.collection), + escape_html(&volume.disk_type), + bytes_to_human_readable(volume.size), + volume.file_count, + volume.delete_count, + bytes_to_human_readable(volume.deleted_byte_count), + escape_html(&volume.ttl), + volume.read_only, + volume.version, + ); + } + + let remote_section = if remote_volume_rows.is_empty() { + String::new() + } else { + let mut remote_rows_html = String::new(); + for volume in &remote_volume_rows { + let _ = write!( + remote_rows_html, + "{}{}{}{}{} / {}{}{}", + volume.id, + escape_html(&volume.collection), + bytes_to_human_readable(volume.size), + volume.file_count, + volume.delete_count, + bytes_to_human_readable(volume.deleted_byte_count), + escape_html(&volume.remote_storage_name), + escape_html(&volume.remote_storage_key), + ); + } + format!( + r#"
+

Remote Volumes

+ + + + + + + + + + + + + {} +
IdCollectionSizeFilesTrashRemoteKey
+
"#, + remote_rows_html + ) + }; + + let ec_section = if ec_volume_rows.is_empty() { + String::new() + } else { + let mut ec_rows_html = String::new(); + for ec in &ec_volume_rows { + let mut shard_labels = String::new(); + for shard in &ec.shards { + let _ = write!( + shard_labels, + "{}: {}", + shard.shard_id, + bytes_to_human_readable(shard.size) + ); + } + let _ = write!( + ec_rows_html, + "{}{}{}{}{}", + ec.volume_id, + escape_html(&ec.collection), + bytes_to_human_readable(ec.size), + shard_labels, + escape_html(&ec.created_at), + ); + } + format!( + r#"
+

Erasure Coding Shards

+ + + + + + + + + + + {} +
IdCollectionTotal SizeShard DetailsCreatedAt
+
"#, + ec_rows_html + ) + }; + + format!( + r#" + + + SeaweedFS {version} + + + + + + + +
+ + +
+
+

Disk Stats

+ + + + + + + + + + + {disk_rows_html} +
PathDiskTotalFreeUsage
+
+ +
+

System Stats

+ + + + + + + +
Masters{masters}
Weekly # ReadRequests{read_week}
Daily # ReadRequests{read_day}
Hourly # ReadRequests{read_hour}
Last Minute # ReadRequests{read_minute}
Up Time{uptime}
+
+
+ +
+

Volumes

+ + + + + + + + + + + + + + + {volume_rows_html} +
IdCollectionDiskData SizeFilesTrashTTLReadOnlyVersion
+
+ + {remote_section} + {ec_section} +
+ +"#, + version = escape_html(crate::version::version()), + disk_rows_html = disk_rows_html, + masters = escape_html(&masters), + read_week = read_week, + read_day = read_day, + read_hour = read_hour, + read_minute = read_minute, + uptime = escape_html(&uptime), + volume_rows_html = volume_rows_html, + remote_section = remote_section, + ec_section = ec_section, + ) +} + +fn collect_ui_data( + store: &Store, +) -> ( + Vec, + Vec, + Vec, + Vec, +) { + let mut disk_rows = Vec::new(); + let mut volumes = Vec::new(); + let mut remote_volumes = Vec::new(); + let mut ec_volumes = Vec::new(); + + for loc in &store.locations { + let dir = absolute_display_path(&loc.directory); + let (all, free) = crate::storage::disk_location::get_disk_stats(&dir); + disk_rows.push(UiDiskRow { + dir, + disk_type: loc.disk_type.to_string(), + all, + free, + used: all.saturating_sub(free), + }); + + for (_, volume) in loc.volumes() { + let (remote_storage_name, remote_storage_key) = volume.remote_storage_name_key(); + let row = UiVolumeRow { + id: volume.id.0, + collection: volume.collection.clone(), + disk_type: loc.disk_type.to_string(), + size: volume.content_size(), + file_count: volume.file_count(), + delete_count: volume.deleted_count(), + deleted_byte_count: volume.deleted_size(), + ttl: volume.super_block.ttl.to_string(), + read_only: volume.is_read_only(), + version: volume.version().0 as u32, + remote_storage_name, + remote_storage_key, + }; + if row.remote_storage_name.is_empty() { + volumes.push(row); + } else { + remote_volumes.push(row); + } + } + + for (_, ec_volume) in loc.ec_volumes() { + let mut shards = Vec::new(); + let mut total_size = 0u64; + let mut created_at = String::from("-"); + for shard in ec_volume.shards.iter().flatten() { + let shard_size = shard.file_size().max(0) as u64; + total_size = total_size.saturating_add(shard_size); + shards.push(UiEcShardRow { + shard_id: shard.shard_id, + size: shard_size, + }); + if created_at == "-" { + if let Ok(metadata) = std::fs::metadata(shard.file_name()) { + if let Ok(modified) = metadata.modified() { + let ts: chrono::DateTime = modified.into(); + created_at = ts.format("%Y-%m-%d %H:%M").to_string(); + } + } + } + } + let preferred_size = ec_volume.dat_file_size.max(0) as u64; + ec_volumes.push(UiEcVolumeRow { + volume_id: ec_volume.volume_id.0, + collection: ec_volume.collection.clone(), + size: preferred_size.max(total_size), + shards, + created_at, + }); + } + } + + disk_rows.sort_by(|left, right| left.dir.cmp(&right.dir)); + volumes.sort_by_key(|row| row.id); + remote_volumes.sort_by_key(|row| row.id); + ec_volumes.sort_by_key(|row| row.volume_id); + + (disk_rows, volumes, remote_volumes, ec_volumes) +} + +fn absolute_display_path(path: &str) -> String { + let p = std::path::Path::new(path); + if p.is_absolute() { + return path.to_string(); + } + std::env::current_dir() + .map(|cwd| cwd.join(p).to_string_lossy().to_string()) + .unwrap_or_else(|_| path.to_string()) +} + +fn join_i64(values: &[i64]) -> String { + values + .iter() + .map(std::string::ToString::to_string) + .collect::>() + .join(",") +} + +fn percent_from(total: u64, part: u64) -> f64 { + if total == 0 { + return 0.0; + } + (part as f64 / total as f64) * 100.0 +} + +fn bytes_to_human_readable(bytes: u64) -> String { + const UNIT: u64 = 1024; + if bytes < UNIT { + return format!("{} B", bytes); + } + + let mut div = UNIT; + let mut exp = 0usize; + let mut n = bytes / UNIT; + while n >= UNIT { + div *= UNIT; + n /= UNIT; + exp += 1; + } + + format!( + "{:.2} {}iB", + bytes as f64 / div as f64, + ["K", "M", "G", "T", "P", "E"][exp] + ) +} + +fn escape_html(input: &str) -> String { + input + .replace('&', "&") + .replace('<', "<") + .replace('>', ">") + .replace('"', """) +} diff --git a/seaweed-volume/src/server/volume_server.rs b/seaweed-volume/src/server/volume_server.rs new file mode 100644 index 000000000..90436dc01 --- /dev/null +++ b/seaweed-volume/src/server/volume_server.rs @@ -0,0 +1,394 @@ +//! VolumeServer: the main HTTP server for volume operations. +//! +//! Routes: +//! GET/HEAD /{vid},{fid} — read a file +//! POST/PUT /{vid},{fid} — write a file +//! DELETE /{vid},{fid} — delete a file +//! GET /status — server status +//! GET /healthz — health check +//! +//! Matches Go's server/volume_server.go. + +use std::net::SocketAddr; +use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU32, Ordering}; +use std::sync::{Arc, RwLock}; + +use axum::{ + extract::{connect_info::ConnectInfo, Request, State}, + http::{header, HeaderValue, Method, StatusCode}, + middleware::{self, Next}, + response::{IntoResponse, Response}, + routing::{any, get}, + Router, +}; + +use crate::config::ReadMode; +use crate::security::Guard; +use crate::storage::store::Store; + +use super::grpc_client::OutgoingGrpcTlsConfig; +use super::handlers; +use super::write_queue::WriteQueue; + +#[derive(Clone, Debug, Default)] +pub struct RuntimeMetricsConfig { + pub push_gateway: crate::metrics::PushGatewayConfig, +} + +/// Shared state for the volume server. +pub struct VolumeServerState { + pub store: RwLock, + pub guard: RwLock, + pub is_stopping: RwLock, + /// Maintenance mode flag. + pub maintenance: AtomicBool, + /// State version — incremented on each SetState call. + pub state_version: AtomicU32, + /// Throttling: concurrent upload/download limits (in bytes, 0 = disabled). + pub concurrent_upload_limit: i64, + pub concurrent_download_limit: i64, + pub inflight_upload_data_timeout: std::time::Duration, + pub inflight_download_data_timeout: std::time::Duration, + /// Current in-flight upload/download bytes. + pub inflight_upload_bytes: AtomicI64, + pub inflight_download_bytes: AtomicI64, + /// Notify waiters when inflight bytes decrease. + pub upload_notify: tokio::sync::Notify, + pub download_notify: tokio::sync::Notify, + /// Data center name from config. + pub data_center: String, + /// Rack name from config. + pub rack: String, + /// File size limit in bytes (0 = no limit). + pub file_size_limit_bytes: i64, + /// Default IO rate limit for maintenance copy/replication work. + pub maintenance_byte_per_second: i64, + /// Whether the server is connected to master (heartbeat active). + pub is_heartbeating: AtomicBool, + /// Whether master addresses are configured. + pub has_master: bool, + /// Seconds to wait before shutting down servers (graceful drain). + pub pre_stop_seconds: u32, + /// Notify heartbeat to send an immediate update when volume state changes. + pub volume_state_notify: tokio::sync::Notify, + /// Optional batched write queue for improved throughput under load. + pub write_queue: std::sync::OnceLock, + /// Registry of S3 tier backends for tiered storage operations. + pub s3_tier_registry: std::sync::RwLock, + /// Read mode: local, proxy, or redirect for non-local volumes. + pub read_mode: ReadMode, + /// First master address for volume lookups (e.g., "localhost:9333"). + pub master_url: String, + /// Seed master addresses for UI rendering. + pub master_urls: Vec, + /// This server's own address (ip:port) for filtering self from lookup results. + pub self_url: String, + /// HTTP client for proxy requests and master lookups. + pub http_client: reqwest::Client, + /// Scheme used for outgoing master and peer HTTP requests ("http" or "https"). + pub outgoing_http_scheme: String, + /// Optional client TLS material for outgoing gRPC connections. + pub outgoing_grpc_tls: Option, + /// Metrics push settings learned from master heartbeat responses. + pub metrics_runtime: std::sync::RwLock, + pub metrics_notify: tokio::sync::Notify, + /// Whether JPEG uploads should be normalized using EXIF orientation. + pub fix_jpg_orientation: bool, + /// Read tuning flags for large-file streaming. + pub has_slow_read: bool, + pub read_buffer_size_bytes: usize, + /// Path to security.toml — stored for SIGHUP reload. + pub security_file: String, + /// Original CLI whitelist entries — stored for SIGHUP reload. + pub cli_white_list: Vec, + /// Path to state.pb file for persisting VolumeServerState across restarts. + pub state_file_path: String, +} + +impl VolumeServerState { + /// Check if the server is in maintenance mode; return gRPC error if so. + pub fn check_maintenance(&self) -> Result<(), tonic::Status> { + if self.maintenance.load(Ordering::Relaxed) { + let id = self.store.read().unwrap().id.clone(); + return Err(tonic::Status::unavailable(format!( + "volume server {} is in maintenance mode", + id + ))); + } + Ok(()) + } +} + +pub fn build_metrics_router() -> Router { + Router::new().route("/metrics", get(handlers::metrics_handler)) +} + +pub fn normalize_outgoing_http_url(scheme: &str, raw_target: &str) -> Result { + if raw_target.starts_with("http://") || raw_target.starts_with("https://") { + let mut url = reqwest::Url::parse(raw_target) + .map_err(|e| format!("invalid url {}: {}", raw_target, e))?; + url.set_scheme(scheme) + .map_err(|_| format!("invalid scheme {}", scheme))?; + return Ok(url.to_string()); + } + Ok(format!("{}://{}", scheme, raw_target)) +} + +fn request_remote_addr(request: &Request) -> Option { + request + .extensions() + .get::>() + .map(|info| info.0) +} + +fn request_is_whitelisted(state: &VolumeServerState, request: &Request) -> bool { + request_remote_addr(request) + .map(|remote_addr| { + state + .guard + .read() + .unwrap() + .check_whitelist(&remote_addr.to_string()) + }) + .unwrap_or(true) +} + +/// Middleware: set Server header, echo x-amz-request-id, set CORS if Origin present. +async fn common_headers_middleware(request: Request, next: Next) -> Response { + let origin = request.headers().get("origin").cloned(); + let request_id = super::request_id::generate_http_request_id(); + + let mut response = + super::request_id::scope_request_id( + request_id.clone(), + async move { next.run(request).await }, + ) + .await; + + let headers = response.headers_mut(); + if let Ok(val) = HeaderValue::from_str(crate::version::server_header()) { + headers.insert("Server", val); + } + + if let Ok(val) = HeaderValue::from_str(&request_id) { + headers.insert("X-Request-Id", val.clone()); + headers.insert("x-amz-request-id", val); + } + + if origin.is_some() { + headers.insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); + headers.insert( + "Access-Control-Allow-Credentials", + HeaderValue::from_static("true"), + ); + } + + response +} + +/// Admin store handler — dispatches based on HTTP method. +/// Matches Go's privateStoreHandler: GET/HEAD → read, POST/PUT → write, +/// DELETE → delete, OPTIONS → CORS headers, anything else → 400. +async fn admin_store_handler(state: State>, request: Request) -> Response { + let start = std::time::Instant::now(); + let method = request.method().clone(); + let mut method_str = method.as_str().to_string(); + let request_bytes = request + .headers() + .get(header::CONTENT_LENGTH) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse::().ok()) + .filter(|value| *value > 0) + .unwrap_or(0); + super::server_stats::record_request_open(); + crate::metrics::INFLIGHT_REQUESTS_GAUGE + .with_label_values(&[&method_str]) + .inc(); + let whitelist_rejected = matches!(method, Method::POST | Method::PUT | Method::DELETE) + && !request_is_whitelisted(&state, &request); + let response = match method.clone() { + _ if whitelist_rejected => StatusCode::UNAUTHORIZED.into_response(), + Method::GET | Method::HEAD => { + super::server_stats::record_read_request(); + handlers::get_or_head_handler_from_request(state, request).await + } + Method::POST | Method::PUT => { + super::server_stats::record_write_request(); + if request_bytes > 0 { + super::server_stats::record_bytes_in(request_bytes); + } + handlers::post_handler(state, request).await + } + Method::DELETE => { + super::server_stats::record_delete_request(); + handlers::delete_handler(state, request).await + } + Method::OPTIONS => { + super::server_stats::record_read_request(); + admin_options_response() + } + _ => { + let method_name = request.method().to_string(); + let query = request.uri().query().map(|q| q.to_string()); + method_str = "INVALID".to_string(); + handlers::json_error_with_query( + StatusCode::BAD_REQUEST, + format!("unsupported method {}", method_name), + query.as_deref(), + ) + } + }; + if method == Method::GET { + if let Some(response_bytes) = response + .headers() + .get(header::CONTENT_LENGTH) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse::().ok()) + .filter(|value| *value > 0) + { + super::server_stats::record_bytes_out(response_bytes); + } + } + super::server_stats::record_request_close(); + crate::metrics::INFLIGHT_REQUESTS_GAUGE + .with_label_values(&[&method_str]) + .dec(); + crate::metrics::REQUEST_COUNTER + .with_label_values(&[&method_str, response.status().as_str()]) + .inc(); + crate::metrics::REQUEST_DURATION + .with_label_values(&[&method_str]) + .observe(start.elapsed().as_secs_f64()); + response +} + +/// Public store handler — dispatches based on HTTP method. +/// Matches Go's publicReadOnlyHandler: GET/HEAD → read, OPTIONS → CORS, +/// anything else → 200 (passthrough no-op). +async fn public_store_handler(state: State>, request: Request) -> Response { + let start = std::time::Instant::now(); + let method = request.method().clone(); + let method_str = method.as_str().to_string(); + super::server_stats::record_request_open(); + crate::metrics::INFLIGHT_REQUESTS_GAUGE + .with_label_values(&[&method_str]) + .inc(); + let response = match method.clone() { + Method::GET | Method::HEAD => { + super::server_stats::record_read_request(); + handlers::get_or_head_handler_from_request(state, request).await + } + Method::OPTIONS => { + super::server_stats::record_read_request(); + public_options_response() + } + _ => StatusCode::OK.into_response(), + }; + if method == Method::GET { + if let Some(response_bytes) = response + .headers() + .get(header::CONTENT_LENGTH) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse::().ok()) + .filter(|value| *value > 0) + { + super::server_stats::record_bytes_out(response_bytes); + } + } + super::server_stats::record_request_close(); + crate::metrics::INFLIGHT_REQUESTS_GAUGE + .with_label_values(&[&method_str]) + .dec(); + crate::metrics::REQUEST_COUNTER + .with_label_values(&[&method_str, response.status().as_str()]) + .inc(); + crate::metrics::REQUEST_DURATION + .with_label_values(&[&method_str]) + .observe(start.elapsed().as_secs_f64()); + response +} + +/// Build OPTIONS response for admin port. +fn admin_options_response() -> Response { + let mut response = StatusCode::OK.into_response(); + let headers = response.headers_mut(); + headers.insert( + "Access-Control-Allow-Methods", + HeaderValue::from_static("PUT, POST, GET, DELETE, OPTIONS"), + ); + headers.insert( + "Access-Control-Allow-Headers", + HeaderValue::from_static("*"), + ); + response +} + +/// Build OPTIONS response for public port. +fn public_options_response() -> Response { + let mut response = StatusCode::OK.into_response(); + let headers = response.headers_mut(); + headers.insert( + "Access-Control-Allow-Methods", + HeaderValue::from_static("GET, OPTIONS"), + ); + headers.insert( + "Access-Control-Allow-Headers", + HeaderValue::from_static("*"), + ); + response +} + +/// Build the admin (private) HTTP router — supports all operations. +/// UI route is only registered when no signing keys are configured, +/// matching Go's `if signingKey == "" || enableUiAccess` check. +pub fn build_admin_router(state: Arc) -> Router { + let guard = state.guard.read().unwrap(); + // This helper can only derive the default Go behavior from the guard state: + // UI stays enabled when the write signing key is empty. The explicit + // `access.ui` override is handled by `build_admin_router_with_ui(...)`. + let ui_enabled = guard.signing_key.0.is_empty(); + drop(guard); + build_admin_router_with_ui(state, ui_enabled) +} + +/// Build the admin router with an explicit UI exposure flag. +pub fn build_admin_router_with_ui(state: Arc, ui_enabled: bool) -> Router { + let mut router = Router::new() + .route("/status", get(handlers::status_handler)) + .route("/healthz", get(handlers::healthz_handler)) + .route("/favicon.ico", get(handlers::favicon_handler)) + .route( + "/seaweedfsstatic/*path", + get(handlers::static_asset_handler), + ) + .route("/", any(admin_store_handler)) + .route("/:path", any(admin_store_handler)) + .route("/:vid/:fid", any(admin_store_handler)) + .route("/:vid/:fid/:filename", any(admin_store_handler)) + .fallback(admin_store_handler); + if ui_enabled { + // Note: /stats/* endpoints are commented out in Go's volume_server.go (L130-134). + // Only the UI endpoint is registered when UI access is enabled. + router = router.route("/ui/index.html", get(handlers::ui_handler)); + } + router + .layer(middleware::from_fn(common_headers_middleware)) + .with_state(state) +} + +/// Build the public (read-only) HTTP router — only GET/HEAD. +pub fn build_public_router(state: Arc) -> Router { + Router::new() + .route("/favicon.ico", get(handlers::favicon_handler)) + .route( + "/seaweedfsstatic/*path", + get(handlers::static_asset_handler), + ) + .route("/", any(public_store_handler)) + .route("/:path", any(public_store_handler)) + .route("/:vid/:fid", any(public_store_handler)) + .route("/:vid/:fid/:filename", any(public_store_handler)) + .fallback(public_store_handler) + .layer(middleware::from_fn(common_headers_middleware)) + .with_state(state) +} diff --git a/seaweed-volume/src/server/write_queue.rs b/seaweed-volume/src/server/write_queue.rs new file mode 100644 index 000000000..112ae5684 --- /dev/null +++ b/seaweed-volume/src/server/write_queue.rs @@ -0,0 +1,330 @@ +//! Async batched write processing for the volume server. +//! +//! Instead of each upload handler directly calling `write_needle` and syncing, +//! writes are submitted to a queue. A background worker drains the queue in +//! batches (up to 128 entries), groups them by volume ID, processes them +//! together, and syncs once per volume for the entire batch. + +use std::sync::Arc; + +use tokio::sync::{mpsc, oneshot}; +use tracing::debug; + +use crate::storage::needle::needle::Needle; +use crate::storage::types::{Size, VolumeId}; +use crate::storage::volume::VolumeError; + +use super::volume_server::VolumeServerState; + +/// Result of a single write operation: (offset, size, is_unchanged). +pub type WriteResult = Result<(u64, Size, bool), VolumeError>; + +/// A request to write a needle, submitted to the write queue. +pub struct WriteRequest { + pub volume_id: VolumeId, + pub needle: Needle, + pub response_tx: oneshot::Sender, +} + +/// Maximum number of write requests to batch together. +const MAX_BATCH_SIZE: usize = 128; + +/// Maximum bytes to accumulate per batch before breaking (matches Go's 4MB limit). +/// This prevents large writes from accumulating unbounded latency. +const MAX_BATCH_BYTES: usize = 4 * 1024 * 1024; + +/// Handle for submitting write requests to the background worker. +#[derive(Clone)] +pub struct WriteQueue { + tx: mpsc::Sender, +} + +impl WriteQueue { + /// Create a new write queue and spawn the background worker. + /// + /// `capacity` controls the channel buffer size (backpressure kicks in when full). + /// The worker holds a reference to `state` for accessing the store. + pub fn new(state: Arc, capacity: usize) -> Self { + let (tx, rx) = mpsc::channel(capacity); + let worker = WriteQueueWorker { rx, state }; + tokio::spawn(worker.run()); + WriteQueue { tx } + } + + /// Submit a write request and wait for the result. + /// + /// Returns `Err` if the worker has shut down or the response channel was dropped. + pub async fn submit(&self, volume_id: VolumeId, needle: Needle) -> WriteResult { + let (response_tx, response_rx) = oneshot::channel(); + let request = WriteRequest { + volume_id, + needle, + response_tx, + }; + + // Send to queue; this awaits if the channel is full (backpressure). + if self.tx.send(request).await.is_err() { + return Err(VolumeError::Io(std::io::Error::new( + std::io::ErrorKind::BrokenPipe, + "write queue worker has shut down", + ))); + } + + // Wait for the worker to process our request. + match response_rx.await { + Ok(result) => result, + Err(_) => Err(VolumeError::Io(std::io::Error::new( + std::io::ErrorKind::BrokenPipe, + "write queue worker dropped response channel", + ))), + } + } +} + +/// Background worker that drains write requests and processes them in batches. +struct WriteQueueWorker { + rx: mpsc::Receiver, + state: Arc, +} + +impl WriteQueueWorker { + async fn run(mut self) { + debug!("write queue worker started"); + + loop { + // Wait for the first request (blocks until one arrives or channel closes). + let first = match self.rx.recv().await { + Some(req) => req, + None => { + debug!("write queue channel closed, worker exiting"); + return; + } + }; + + // Drain as many additional requests as available, up to MAX_BATCH_SIZE + // or MAX_BATCH_BYTES (matches Go: 128 requests or 4MB, whichever comes first). + let mut batch = Vec::with_capacity(MAX_BATCH_SIZE); + let mut batch_bytes: usize = first.needle.data.len(); + batch.push(first); + + while batch.len() < MAX_BATCH_SIZE && batch_bytes < MAX_BATCH_BYTES { + match self.rx.try_recv() { + Ok(req) => { + batch_bytes += req.needle.data.len(); + batch.push(req); + } + Err(_) => break, + } + } + + let batch_size = batch.len(); + debug!("processing write batch of {} requests", batch_size); + + // Process the batch in spawn_blocking since write_needle does file I/O. + let state = self.state.clone(); + let _ = tokio::task::spawn_blocking(move || { + process_batch(state, batch); + }) + .await; + } + } +} + +/// Process a batch of write requests, grouped by volume ID. +/// +/// Groups writes by volume to minimize the number of store lock acquisitions, +/// then sends results back via each request's oneshot channel. +fn process_batch(state: Arc, batch: Vec) { + // Group requests by volume ID for efficient processing. + // We use a Vec of (VolumeId, Vec<(Needle, Sender)>) to preserve order + // and avoid requiring Hash on VolumeId. + let mut groups: Vec<(VolumeId, Vec<(Needle, oneshot::Sender)>)> = Vec::new(); + + for req in batch { + let vid = req.volume_id; + if let Some(group) = groups.iter_mut().find(|(v, _)| *v == vid) { + group.1.push((req.needle, req.response_tx)); + } else { + groups.push((vid, vec![(req.needle, req.response_tx)])); + } + } + + // Process each volume group under a single store lock. + let mut store = state.store.write().unwrap(); + + for (vid, entries) in groups { + for (mut needle, response_tx) in entries { + let result = store.write_volume_needle(vid, &mut needle); + // Send result back; ignore error if receiver dropped. + let _ = response_tx.send(result); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::types::VolumeId; + + /// Helper to create a minimal VolumeServerState for testing. + fn make_test_state() -> Arc { + use crate::security::{Guard, SigningKey}; + use crate::server::volume_server::RuntimeMetricsConfig; + use crate::storage::needle_map::NeedleMapKind; + use crate::storage::store::Store; + use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU32}; + use std::sync::RwLock; + + let store = Store::new(NeedleMapKind::InMemory); + let guard = Guard::new(&[], SigningKey(vec![]), 0, SigningKey(vec![]), 0); + + Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(guard), + is_stopping: RwLock::new(false), + maintenance: AtomicBool::new(false), + state_version: AtomicU32::new(0), + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::ZERO, + inflight_download_data_timeout: std::time::Duration::ZERO, + inflight_upload_bytes: AtomicI64::new(0), + inflight_download_bytes: AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: String::new(), + rack: String::new(), + file_size_limit_bytes: 0, + maintenance_byte_per_second: 0, + is_heartbeating: AtomicBool::new(false), + has_master: false, + pre_stop_seconds: 0, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new( + crate::remote_storage::s3_tier::S3TierRegistry::new(), + ), + read_mode: crate::config::ReadMode::Local, + master_url: String::new(), + master_urls: Vec::new(), + self_url: String::new(), + http_client: reqwest::Client::new(), + outgoing_http_scheme: "http".to_string(), + outgoing_grpc_tls: None, + metrics_runtime: std::sync::RwLock::new(RuntimeMetricsConfig::default()), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: false, + has_slow_read: true, + read_buffer_size_bytes: 4 * 1024 * 1024, + security_file: String::new(), + cli_white_list: vec![], + state_file_path: String::new(), + }) + } + + #[tokio::test] + async fn test_write_queue_submit_no_volume() { + // Submit a write to a non-existent volume -- should return VolumeError::NotFound. + let state = make_test_state(); + let queue = WriteQueue::new(state, MAX_BATCH_SIZE); + + let needle = Needle { + id: 1.into(), + cookie: 0x12345678.into(), + data: vec![1, 2, 3], + data_size: 3, + ..Needle::default() + }; + + let result = queue.submit(VolumeId(999), needle).await; + assert!(result.is_err()); + match result { + Err(VolumeError::NotFound) => {} // expected + other => panic!("expected NotFound, got {:?}", other), + } + } + + #[tokio::test] + async fn test_write_queue_concurrent_submissions() { + // Submit multiple concurrent writes -- all should complete (with errors since no volume). + let state = make_test_state(); + let queue = WriteQueue::new(state, MAX_BATCH_SIZE); + + let mut handles = Vec::new(); + for i in 0..10u64 { + let q = queue.clone(); + handles.push(tokio::spawn(async move { + let needle = Needle { + id: i.into(), + cookie: 0xABCD.into(), + data: vec![i as u8; 10], + data_size: 10, + ..Needle::default() + }; + q.submit(VolumeId(1), needle).await + })); + } + + for handle in handles { + let result = handle.await.unwrap(); + // All should fail with NotFound since there's no volume 1 + assert!(matches!(result, Err(VolumeError::NotFound))); + } + } + + #[tokio::test] + async fn test_write_queue_batching() { + // Verify that many concurrent writes get processed (testing the batching path). + let state = make_test_state(); + let queue = WriteQueue::new(state, MAX_BATCH_SIZE); + + // Submit MAX_BATCH_SIZE requests concurrently + let mut handles = Vec::new(); + for i in 0..MAX_BATCH_SIZE as u64 { + let q = queue.clone(); + handles.push(tokio::spawn(async move { + let needle = Needle { + id: i.into(), + cookie: 0x1111.into(), + data: vec![0u8; 4], + data_size: 4, + ..Needle::default() + }; + q.submit(VolumeId(42), needle).await + })); + } + + let mut results = Vec::new(); + for handle in handles { + results.push(handle.await.unwrap()); + } + + // All should complete (with NotFound errors since no volume exists) + assert_eq!(results.len(), MAX_BATCH_SIZE); + for r in results { + assert!(matches!(r, Err(VolumeError::NotFound))); + } + } + + #[tokio::test] + async fn test_write_queue_dropped_sender() { + // When the queue is dropped, subsequent submits should fail gracefully. + let state = make_test_state(); + let queue = WriteQueue::new(state, 1); + + // Clone then drop the original -- the worker keeps running via its rx handle. + let queue2 = queue.clone(); + drop(queue); + + // This should still work since the worker is alive. + let needle = Needle { + id: 1.into(), + cookie: 0.into(), + data: vec![], + data_size: 0, + ..Needle::default() + }; + let result = queue2.submit(VolumeId(1), needle).await; + assert!(result.is_err()); // NotFound is fine -- the point is it doesn't panic + } +} diff --git a/seaweed-volume/src/storage/disk_location.rs b/seaweed-volume/src/storage/disk_location.rs new file mode 100644 index 000000000..b336d0dd4 --- /dev/null +++ b/seaweed-volume/src/storage/disk_location.rs @@ -0,0 +1,951 @@ +//! DiskLocation: manages volumes on a single disk/directory. +//! +//! Each DiskLocation represents one storage directory containing .dat + .idx files. +//! A Store contains one or more DiskLocations (one per configured directory). +//! Matches Go's storage/disk_location.go. + +use std::collections::{HashMap, HashSet}; +use std::fs; +use std::io; +use std::sync::atomic::{AtomicBool, AtomicI32, AtomicU64, Ordering}; +use std::sync::Arc; + +use tracing::{info, warn}; + +use crate::config::MinFreeSpace; +use crate::storage::erasure_coding::ec_shard::{ + EcVolumeShard, DATA_SHARDS_COUNT, ERASURE_CODING_LARGE_BLOCK_SIZE, + ERASURE_CODING_SMALL_BLOCK_SIZE, +}; +use crate::storage::erasure_coding::ec_volume::EcVolume; +use crate::storage::needle_map::NeedleMapKind; +use crate::storage::super_block::ReplicaPlacement; +use crate::storage::types::*; +use crate::storage::volume::{remove_volume_files, volume_file_name, Volume, VolumeError}; + +/// A single disk location managing volumes in one directory. +pub struct DiskLocation { + pub directory: String, + pub idx_directory: String, + pub directory_uuid: String, + pub disk_type: DiskType, + pub tags: Vec, + pub max_volume_count: AtomicI32, + pub original_max_volume_count: i32, + volumes: HashMap, + ec_volumes: HashMap, + pub is_disk_space_low: Arc, + pub available_space: AtomicU64, + pub min_free_space: MinFreeSpace, +} + +impl DiskLocation { + const UUID_FILE_NAME: &'static str = "vol_dir.uuid"; + + pub fn new( + directory: &str, + idx_directory: &str, + max_volume_count: i32, + disk_type: DiskType, + min_free_space: MinFreeSpace, + tags: Vec, + ) -> io::Result { + fs::create_dir_all(directory)?; + + let idx_dir = if idx_directory.is_empty() { + directory.to_string() + } else { + fs::create_dir_all(idx_directory)?; + idx_directory.to_string() + }; + let directory_uuid = Self::generate_directory_uuid(directory)?; + + Ok(DiskLocation { + directory: directory.to_string(), + idx_directory: idx_dir, + directory_uuid, + disk_type, + tags, + max_volume_count: AtomicI32::new(max_volume_count), + original_max_volume_count: max_volume_count, + volumes: HashMap::new(), + ec_volumes: HashMap::new(), + is_disk_space_low: Arc::new(AtomicBool::new(false)), + available_space: AtomicU64::new(0), + min_free_space, + }) + } + + fn generate_directory_uuid(directory: &str) -> io::Result { + let path = std::path::Path::new(directory).join(Self::UUID_FILE_NAME); + if path.exists() { + let existing = fs::read_to_string(&path)?; + if !existing.trim().is_empty() { + return Ok(existing); + } + } + + let dir_uuid = uuid::Uuid::new_v4().to_string(); + fs::write(path, &dir_uuid)?; + Ok(dir_uuid) + } + + // ---- Volume management ---- + + /// Load existing volumes from the directory. + /// + /// Matches Go's `loadExistingVolume`: checks for incomplete volumes (.note file), + /// validates EC shards before skipping .dat loading, and cleans up stale + /// compaction temp files (.cpd/.cpx). + pub fn load_existing_volumes(&mut self, needle_map_kind: NeedleMapKind) -> io::Result<()> { + // Ensure directory exists + fs::create_dir_all(&self.directory)?; + if self.directory != self.idx_directory { + fs::create_dir_all(&self.idx_directory)?; + } + + // Scan for .dat files + let entries = fs::read_dir(&self.directory)?; + let mut dat_files: Vec<(String, VolumeId)> = Vec::new(); + let mut seen = HashSet::new(); + + for entry in entries { + let entry = entry?; + let name = entry.file_name().into_string().unwrap_or_default(); + if let Some((collection, vid)) = parse_volume_filename(&name) { + if seen.insert((collection.clone(), vid)) { + dat_files.push((collection, vid)); + } + } + } + + for (collection, vid) in dat_files { + let volume_name = volume_file_name(&self.directory, &collection, vid); + let idx_name = volume_file_name(&self.idx_directory, &collection, vid); + + // Check for incomplete volume (.note file means a VolumeCopy was interrupted) + let note_path = format!("{}.note", volume_name); + if std::path::Path::new(¬e_path).exists() { + let note = fs::read_to_string(¬e_path).unwrap_or_default(); + warn!( + volume_id = vid.0, + "volume was not completed: {}, removing files", note + ); + remove_volume_files(&volume_name); + remove_volume_files(&idx_name); + continue; + } + + // If valid EC shards exist (.ecx file present), skip loading .dat + let ecx_path = format!("{}.ecx", idx_name); + let ecx_exists = if std::path::Path::new(&ecx_path).exists() { + true + } else if self.idx_directory != self.directory { + // .ecx may have been created before -dir.idx was configured + let fallback = format!("{}.ecx", volume_name); + std::path::Path::new(&fallback).exists() + } else { + false + }; + if ecx_exists { + if self.validate_ec_volume(&collection, vid) { + // Valid EC volume — don't load .dat + continue; + } else { + warn!( + volume_id = vid.0, + "EC volume validation failed, removing incomplete EC files" + ); + self.remove_ec_volume_files(&collection, vid); + // Fall through to load .dat file + } + } + + // Clean up stale compaction temp files + let cpd_path = format!("{}.cpd", volume_name); + let cpx_path = format!("{}.cpx", idx_name); + if std::path::Path::new(&cpd_path).exists() { + info!(volume_id = vid.0, "removing stale compaction file .cpd"); + let _ = fs::remove_file(&cpd_path); + } + if std::path::Path::new(&cpx_path).exists() { + info!(volume_id = vid.0, "removing stale compaction file .cpx"); + let _ = fs::remove_file(&cpx_path); + } + + // Skip if already loaded (e.g., from a previous call) + if self.volumes.contains_key(&vid) { + continue; + } + + match Volume::new( + &self.directory, + &self.idx_directory, + &collection, + vid, + needle_map_kind, + None, // replica placement read from superblock + None, // TTL read from superblock + 0, // no preallocate on load + Version::current(), + ) { + Ok(mut v) => { + v.location_disk_space_low = self.is_disk_space_low.clone(); + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&collection, "volume"]) + .inc(); + self.volumes.insert(vid, v); + } + Err(e) => { + warn!(volume_id = vid.0, error = %e, "failed to load volume"); + } + } + } + + Ok(()) + } + + /// Validate EC volume shards: all shards must be same size, and if .dat exists, + /// need at least DATA_SHARDS_COUNT shards with size matching expected. + fn validate_ec_volume(&self, collection: &str, vid: VolumeId) -> bool { + let base = volume_file_name(&self.directory, collection, vid); + let dat_path = format!("{}.dat", base); + + let mut expected_shard_size: Option = None; + let dat_exists = std::path::Path::new(&dat_path).exists(); + + if dat_exists { + if let Ok(meta) = fs::metadata(&dat_path) { + expected_shard_size = Some(calculate_expected_shard_size(meta.len() as i64)); + } else { + return false; + } + } + + let mut shard_count = 0usize; + let mut actual_shard_size: Option = None; + const MAX_SHARD_COUNT: usize = 32; + + for i in 0..MAX_SHARD_COUNT { + let shard_path = format!("{}.ec{:02}", base, i); + match fs::metadata(&shard_path) { + Ok(meta) if meta.len() > 0 => { + let size = meta.len() as i64; + if let Some(prev) = actual_shard_size { + if size != prev { + warn!( + volume_id = vid.0, + shard = i, + size, + expected = prev, + "EC shard size mismatch" + ); + return false; + } + } else { + actual_shard_size = Some(size); + } + shard_count += 1; + } + Err(e) if e.kind() != io::ErrorKind::NotFound => { + warn!( + volume_id = vid.0, + shard = i, + error = %e, + "failed to stat EC shard" + ); + return false; + } + _ => {} // not found or zero size — skip + } + } + + // If .dat exists, validate shard size matches expected + if dat_exists { + if let (Some(actual), Some(expected)) = (actual_shard_size, expected_shard_size) { + if actual != expected { + warn!( + volume_id = vid.0, + actual_shard_size = actual, + expected_shard_size = expected, + "EC shard size doesn't match .dat file" + ); + return false; + } + } + } + + // Distributed EC (no .dat): any shard count is valid + if !dat_exists { + return true; + } + + // With .dat: need at least DATA_SHARDS_COUNT shards + if shard_count < DATA_SHARDS_COUNT { + warn!( + volume_id = vid.0, + shard_count, + required = DATA_SHARDS_COUNT, + "EC volume has .dat but too few shards" + ); + return false; + } + + true + } + + /// Remove all EC-related files for a volume. + fn remove_ec_volume_files(&self, collection: &str, vid: VolumeId) { + let base = volume_file_name(&self.directory, collection, vid); + let idx_base = volume_file_name(&self.idx_directory, collection, vid); + const MAX_SHARD_COUNT: usize = 32; + + // Remove index files from idx directory (.ecx, .ecj) + let _ = fs::remove_file(format!("{}.ecx", idx_base)); + let _ = fs::remove_file(format!("{}.ecj", idx_base)); + // Also try data directory in case .ecx/.ecj were created before -dir.idx was configured + if self.idx_directory != self.directory { + let _ = fs::remove_file(format!("{}.ecx", base)); + let _ = fs::remove_file(format!("{}.ecj", base)); + } + + // Remove all EC shard files (.ec00 ~ .ec31) + for i in 0..MAX_SHARD_COUNT { + let _ = fs::remove_file(format!("{}.ec{:02}", base, i)); + } + } + + /// Find a volume by ID. + pub fn find_volume(&self, vid: VolumeId) -> Option<&Volume> { + self.volumes.get(&vid) + } + + /// Find a volume by ID (mutable). + pub fn find_volume_mut(&mut self, vid: VolumeId) -> Option<&mut Volume> { + self.volumes.get_mut(&vid) + } + + /// Add a volume to this location. + pub fn set_volume(&mut self, vid: VolumeId, volume: Volume) { + let collection = volume.collection.clone(); + self.volumes.insert(vid, volume); + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&collection, "volume"]) + .inc(); + } + + /// Create a new volume in this location. + pub fn create_volume( + &mut self, + vid: VolumeId, + collection: &str, + needle_map_kind: NeedleMapKind, + replica_placement: Option, + ttl: Option, + preallocate: u64, + version: Version, + ) -> Result<(), VolumeError> { + let mut v = Volume::new( + &self.directory, + &self.idx_directory, + collection, + vid, + needle_map_kind, + replica_placement, + ttl, + preallocate, + version, + )?; + v.location_disk_space_low = self.is_disk_space_low.clone(); + crate::metrics::VOLUME_GAUGE + .with_label_values(&[collection, "volume"]) + .inc(); + self.volumes.insert(vid, v); + Ok(()) + } + + /// Remove and close a volume. + pub fn unload_volume(&mut self, vid: VolumeId) -> Option { + if let Some(mut v) = self.volumes.remove(&vid) { + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&v.collection, "volume"]) + .dec(); + v.close(); + Some(v) + } else { + None + } + } + + /// Remove, close, and delete all files for a volume. + pub fn delete_volume(&mut self, vid: VolumeId, only_empty: bool) -> Result<(), VolumeError> { + if let Some(mut v) = self.volumes.remove(&vid) { + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&v.collection, "volume"]) + .dec(); + v.destroy(only_empty)?; + Ok(()) + } else { + Err(VolumeError::NotFound) + } + } + + /// Delete all volumes in a collection. + pub fn delete_collection(&mut self, collection: &str) -> Result<(), VolumeError> { + let vids: Vec = self + .volumes + .iter() + .filter(|(_, v)| v.collection == collection && !v.is_compacting()) + .map(|(vid, _)| *vid) + .collect(); + + for vid in vids { + if let Some(mut v) = self.volumes.remove(&vid) { + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&v.collection, "volume"]) + .dec(); + if let Err(e) = v.destroy(false) { + warn!(volume_id = vid.0, error = %e, "delete collection: failed to destroy volume"); + } + } + } + + let ec_vids: Vec = self + .ec_volumes + .iter() + .filter(|(_, v)| v.collection == collection) + .map(|(vid, _)| *vid) + .collect(); + + for vid in ec_vids { + if let Some(mut ec_vol) = self.ec_volumes.remove(&vid) { + for _ in 0..ec_vol.shard_count() { + crate::metrics::VOLUME_GAUGE + .with_label_values(&[collection, "ec_shards"]) + .dec(); + } + ec_vol.destroy(); + } + } + Ok(()) + } + + // ---- Metrics ---- + + /// Number of volumes on this disk. + pub fn volumes_len(&self) -> usize { + self.volumes.len() + } + + /// Get all volume IDs, sorted. + pub fn volume_ids(&self) -> Vec { + let mut ids: Vec = self.volumes.keys().copied().collect(); + ids.sort(); + ids + } + + /// Iterate over all volumes. + pub fn iter_volumes(&self) -> impl Iterator { + self.volumes.iter() + } + + /// Number of free volume slots. + /// Matches Go's FindFreeLocation formula: + /// free = ((MaxVolumeCount - VolumesLen()) * DataShardsCount - EcShardCount()) / DataShardsCount + pub fn free_volume_count(&self) -> i32 { + use crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT; + let max = self.max_volume_count.load(Ordering::Relaxed); + let free_count = (max as i64 - self.volumes.len() as i64) + * DATA_SHARDS_COUNT as i64 + - self.ec_shard_count() as i64; + let effective_free = free_count / DATA_SHARDS_COUNT as i64; + if effective_free > 0 { + effective_free as i32 + } else { + 0 + } + } + + /// Iterate over all volumes. + pub fn volumes(&self) -> impl Iterator { + self.volumes.iter() + } + + /// Iterate over all volumes (mutable). + pub fn volumes_mut(&mut self) -> impl Iterator { + self.volumes.iter_mut() + } + + /// Sum of unused space in writable volumes (volumeSizeLimit - actual size per volume). + /// Used by auto-max-volume-count to estimate how many more volumes can fit. + pub fn unused_space(&self, volume_size_limit: u64) -> u64 { + let mut unused: u64 = 0; + for vol in self.volumes.values() { + if vol.is_read_only() { + continue; + } + let dat_size = vol.dat_file_size().unwrap_or(0); + let idx_size = vol.idx_file_size(); + let used = dat_size + idx_size; + if volume_size_limit > used { + unused += volume_size_limit - used; + } + } + unused + } + + /// Check disk space against min_free_space and update is_disk_space_low. + pub fn check_disk_space(&self) { + let (total, free) = get_disk_stats(&self.directory); + if total == 0 { + return; + } + let used = total.saturating_sub(free); + let is_low = match &self.min_free_space { + MinFreeSpace::Percent(pct) => { + let free_pct = (free as f64 / total as f64) * 100.0; + free_pct < *pct + } + MinFreeSpace::Bytes(min_bytes) => free < *min_bytes, + }; + self.is_disk_space_low.store(is_low, Ordering::Relaxed); + self.available_space.store(free, Ordering::Relaxed); + + // Update resource gauges + crate::metrics::RESOURCE_GAUGE + .with_label_values(&[&self.directory, "all"]) + .set(total as f64); + crate::metrics::RESOURCE_GAUGE + .with_label_values(&[&self.directory, "used"]) + .set(used as f64); + crate::metrics::RESOURCE_GAUGE + .with_label_values(&[&self.directory, "free"]) + .set(free as f64); + // "avail" is same as "free" for us (Go subtracts reserved blocks but we use statvfs f_bavail) + crate::metrics::RESOURCE_GAUGE + .with_label_values(&[&self.directory, "avail"]) + .set(free as f64); + } + + // ---- EC volume operations ---- + + /// Find an EC volume by ID. + pub fn find_ec_volume(&self, vid: VolumeId) -> Option<&EcVolume> { + self.ec_volumes.get(&vid) + } + + /// Find an EC volume by ID (mutable). + pub fn find_ec_volume_mut(&mut self, vid: VolumeId) -> Option<&mut EcVolume> { + self.ec_volumes.get_mut(&vid) + } + + /// Check if this location has an EC volume. + pub fn has_ec_volume(&self, vid: VolumeId) -> bool { + self.ec_volumes.contains_key(&vid) + } + + /// Remove an EC volume, returning it. + pub fn remove_ec_volume(&mut self, vid: VolumeId) -> Option { + self.ec_volumes.remove(&vid) + } + + /// Mount EC shards for a volume on this location. + pub fn mount_ec_shards( + &mut self, + vid: VolumeId, + collection: &str, + shard_ids: &[u32], + ) -> Result<(), VolumeError> { + let dir = self.directory.clone(); + let idx_dir = self.idx_directory.clone(); + let ec_vol = self + .ec_volumes + .entry(vid) + .or_insert_with(|| EcVolume::new(&dir, &idx_dir, collection, vid).unwrap()); + ec_vol.disk_type = self.disk_type.clone(); + + for &shard_id in shard_ids { + let shard = EcVolumeShard::new(&dir, collection, vid, shard_id as u8); + ec_vol.add_shard(shard).map_err(VolumeError::Io)?; + crate::metrics::VOLUME_GAUGE + .with_label_values(&[collection, "ec_shards"]) + .inc(); + } + Ok(()) + } + + /// Unmount EC shards for a volume on this location. + pub fn unmount_ec_shards(&mut self, vid: VolumeId, shard_ids: &[u32]) { + if let Some(ec_vol) = self.ec_volumes.get_mut(&vid) { + let collection = ec_vol.collection.clone(); + for &shard_id in shard_ids { + ec_vol.remove_shard(shard_id as u8); + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&collection, "ec_shards"]) + .dec(); + } + if ec_vol.shard_count() == 0 { + let mut vol = self.ec_volumes.remove(&vid).unwrap(); + vol.close(); + } + } + } + + /// Total number of EC shards on this location. + pub fn ec_shard_count(&self) -> usize { + self.ec_volumes + .values() + .map(|ecv| ecv.shards.iter().filter(|s| s.is_some()).count()) + .sum() + } + + /// Iterate over all EC volumes. + pub fn ec_volumes(&self) -> impl Iterator { + self.ec_volumes.iter() + } + + /// Close all volumes. + pub fn close(&mut self) { + for (_, v) in self.volumes.iter_mut() { + v.close(); + } + self.volumes.clear(); + for (_, mut ec_vol) in self.ec_volumes.drain() { + ec_vol.close(); + } + } +} + +/// Get total and free disk space for a given path. +/// Returns (total_bytes, free_bytes). +pub fn get_disk_stats(path: &str) -> (u64, u64) { + #[cfg(unix)] + { + use std::ffi::CString; + let c_path = match CString::new(path) { + Ok(p) => p, + Err(_) => return (0, 0), + }; + unsafe { + let mut stat: libc::statvfs = std::mem::zeroed(); + if libc::statvfs(c_path.as_ptr(), &mut stat) == 0 { + let all = stat.f_blocks as u64 * stat.f_frsize as u64; + let free = stat.f_bavail as u64 * stat.f_frsize as u64; + return (all, free); + } + } + (0, 0) + } + #[cfg(not(unix))] + { + let _ = path; + (0, 0) + } +} + +/// Calculate expected EC shard size from .dat file size. +/// Matches Go's `calculateExpectedShardSize`: large blocks (1GB * data_shards) first, +/// then small blocks (1MB * data_shards) for the remainder. +fn calculate_expected_shard_size(dat_file_size: i64) -> i64 { + let large_batch_size = ERASURE_CODING_LARGE_BLOCK_SIZE as i64 * DATA_SHARDS_COUNT as i64; + let num_large_batches = dat_file_size / large_batch_size; + let mut shard_size = num_large_batches * ERASURE_CODING_LARGE_BLOCK_SIZE as i64; + let remaining = dat_file_size - (num_large_batches * large_batch_size); + + if remaining > 0 { + let small_batch_size = ERASURE_CODING_SMALL_BLOCK_SIZE as i64 * DATA_SHARDS_COUNT as i64; + // Ceiling division + let num_small_batches = (remaining + small_batch_size - 1) / small_batch_size; + shard_size += num_small_batches * ERASURE_CODING_SMALL_BLOCK_SIZE as i64; + } + + shard_size +} + +/// Parse a volume filename like "collection_42.dat" or "42.dat" into (collection, VolumeId). +fn parse_volume_filename(filename: &str) -> Option<(String, VolumeId)> { + let stem = filename + .strip_suffix(".dat") + .or_else(|| filename.strip_suffix(".vif")) + .or_else(|| filename.strip_suffix(".idx"))?; + if let Some(pos) = stem.rfind('_') { + let collection = &stem[..pos]; + let id_str = &stem[pos + 1..]; + let id: u32 = id_str.parse().ok()?; + Some((collection.to_string(), VolumeId(id))) + } else { + let id: u32 = stem.parse().ok()?; + Some((String::new(), VolumeId(id))) + } +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_parse_volume_filename() { + assert_eq!( + parse_volume_filename("42.dat"), + Some(("".to_string(), VolumeId(42))) + ); + assert_eq!( + parse_volume_filename("pics_7.dat"), + Some(("pics".to_string(), VolumeId(7))) + ); + assert_eq!( + parse_volume_filename("42.vif"), + Some(("".to_string(), VolumeId(42))) + ); + assert_eq!( + parse_volume_filename("pics_7.idx"), + Some(("pics".to_string(), VolumeId(7))) + ); + assert_eq!(parse_volume_filename("notadat.idx"), None); + assert_eq!(parse_volume_filename("bad.dat"), None); + } + + #[test] + fn test_disk_location_create_volume() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + loc.create_volume( + VolumeId(1), + "", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + assert_eq!(loc.volumes_len(), 1); + assert!(loc.find_volume(VolumeId(1)).is_some()); + assert!(loc.find_volume(VolumeId(99)).is_none()); + assert_eq!(loc.free_volume_count(), 9); + } + + #[test] + fn test_disk_location_load_existing() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Create volumes + { + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + loc.create_volume( + VolumeId(1), + "", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + loc.create_volume( + VolumeId(2), + "test", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + loc.close(); + } + + // Reload + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + loc.load_existing_volumes(NeedleMapKind::InMemory).unwrap(); + assert_eq!(loc.volumes_len(), 2); + + let ids = loc.volume_ids(); + assert!(ids.contains(&VolumeId(1))); + assert!(ids.contains(&VolumeId(2))); + } + + #[test] + fn test_disk_location_delete_volume() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + loc.create_volume( + VolumeId(1), + "", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + loc.create_volume( + VolumeId(2), + "", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + assert_eq!(loc.volumes_len(), 2); + + loc.delete_volume(VolumeId(1), false).unwrap(); + assert_eq!(loc.volumes_len(), 1); + assert!(loc.find_volume(VolumeId(1)).is_none()); + } + + #[test] + fn test_disk_location_delete_collection() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + loc.create_volume( + VolumeId(1), + "pics", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + loc.create_volume( + VolumeId(2), + "pics", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + loc.create_volume( + VolumeId(3), + "docs", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + assert_eq!(loc.volumes_len(), 3); + + loc.delete_collection("pics").unwrap(); + assert_eq!(loc.volumes_len(), 1); + assert!(loc.find_volume(VolumeId(3)).is_some()); + } + + #[test] + fn test_disk_location_delete_collection_removes_ec_volumes() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let shard_path = format!("{}/pics_7.ec00", dir); + std::fs::write(&shard_path, b"ec-shard").unwrap(); + + loc.mount_ec_shards(VolumeId(7), "pics", &[0]).unwrap(); + assert!(loc.has_ec_volume(VolumeId(7))); + assert!(std::path::Path::new(&shard_path).exists()); + assert!(std::path::Path::new(&format!("{}/pics_7.ecj", dir)).exists()); + + loc.delete_collection("pics").unwrap(); + + assert!(!loc.has_ec_volume(VolumeId(7))); + assert!(!std::path::Path::new(&shard_path).exists()); + assert!(!std::path::Path::new(&format!("{}/pics_7.ecj", dir)).exists()); + } + + #[test] + fn test_disk_location_persists_directory_uuid_and_tags() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + vec!["fast".to_string(), "ssd".to_string()], + ) + .unwrap(); + let directory_uuid = loc.directory_uuid.clone(); + assert_eq!(loc.tags, vec!["fast".to_string(), "ssd".to_string()]); + drop(loc); + + let reloaded = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + assert_eq!(reloaded.directory_uuid, directory_uuid); + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/ec_decoder.rs b/seaweed-volume/src/storage/erasure_coding/ec_decoder.rs new file mode 100644 index 000000000..045cd644a --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/ec_decoder.rs @@ -0,0 +1,261 @@ +//! EC decoding: reconstruct a .dat file from EC shards. +//! +//! Rebuilds the original .dat + .idx files from data shards (.ec00-.ec09) +//! and the sorted index (.ecx) + deletion journal (.ecj). + +use std::fs::File; +use std::io::{self, Read, Write}; + +use crate::storage::erasure_coding::ec_shard::*; +use crate::storage::idx; +use crate::storage::needle::needle::get_actual_size; +use crate::storage::super_block::SUPER_BLOCK_SIZE; +use crate::storage::types::*; +use crate::storage::volume::volume_file_name; + +/// Calculate .dat file size from the max offset entry in .ecx. +/// Reads the volume version from the first EC shard (.ec00) superblock, +/// then scans .ecx entries to find the largest (offset + needle_actual_size). +pub fn find_dat_file_size(dir: &str, collection: &str, volume_id: VolumeId) -> io::Result { + let base = volume_file_name(dir, collection, volume_id); + + // Read volume version from .ec00 superblock + let ec00_path = format!("{}.ec00", base); + let mut ec00 = File::open(&ec00_path)?; + let mut sb_buf = [0u8; SUPER_BLOCK_SIZE]; + ec00.read_exact(&mut sb_buf)?; + let version = Version(sb_buf[0]); + + // Start with at least the superblock size + let mut dat_size: i64 = SUPER_BLOCK_SIZE as i64; + + // Scan .ecx entries + let ecx_path = format!("{}.ecx", base); + let ecx_data = std::fs::read(&ecx_path)?; + let entry_count = ecx_data.len() / NEEDLE_MAP_ENTRY_SIZE; + + for i in 0..entry_count { + let start = i * NEEDLE_MAP_ENTRY_SIZE; + let (_, offset, size) = + idx_entry_from_bytes(&ecx_data[start..start + NEEDLE_MAP_ENTRY_SIZE]); + if size.is_deleted() { + continue; + } + let entry_stop = offset.to_actual_offset() + get_actual_size(size, version); + if entry_stop > dat_size { + dat_size = entry_stop; + } + } + + Ok(dat_size) +} + +/// Reconstruct a .dat file from EC data shards. +/// +/// Reads from .ec00-.ec09 and writes a new .dat file. +pub fn write_dat_file_from_shards( + dir: &str, + collection: &str, + volume_id: VolumeId, + dat_file_size: i64, + data_shards: usize, +) -> io::Result<()> { + let base = volume_file_name(dir, collection, volume_id); + let dat_path = format!("{}.dat", base); + + // Open data shards + let mut shards: Vec = (0..data_shards as u8) + .map(|i| EcVolumeShard::new(dir, collection, volume_id, i)) + .collect(); + + for shard in &mut shards { + shard.open()?; + } + + let mut dat_file = File::create(&dat_path)?; + let mut remaining = dat_file_size; + let large_block_size = ERASURE_CODING_LARGE_BLOCK_SIZE; + let small_block_size = ERASURE_CODING_SMALL_BLOCK_SIZE; + let large_row_size = (large_block_size * data_shards) as i64; + + let mut shard_offset: u64 = 0; + + // Read large blocks + while remaining >= large_row_size { + for i in 0..data_shards { + let mut buf = vec![0u8; large_block_size]; + shards[i].read_at(&mut buf, shard_offset)?; + let to_write = large_block_size.min(remaining as usize); + dat_file.write_all(&buf[..to_write])?; + remaining -= to_write as i64; + if remaining <= 0 { + break; + } + } + shard_offset += large_block_size as u64; + } + + // Read small blocks + while remaining > 0 { + for i in 0..data_shards { + let mut buf = vec![0u8; small_block_size]; + shards[i].read_at(&mut buf, shard_offset)?; + let to_write = small_block_size.min(remaining as usize); + dat_file.write_all(&buf[..to_write])?; + remaining -= to_write as i64; + if remaining <= 0 { + break; + } + } + shard_offset += small_block_size as u64; + } + + for shard in &mut shards { + shard.close(); + } + + dat_file.sync_all()?; + Ok(()) +} + +/// Write .idx file from .ecx index + .ecj deletion journal. +/// +/// Copies sorted .ecx entries to .idx, then appends tombstones for +/// deleted needles from .ecj. +pub fn write_idx_file_from_ec_index( + dir: &str, + collection: &str, + volume_id: VolumeId, +) -> io::Result<()> { + let base = volume_file_name(dir, collection, volume_id); + let ecx_path = format!("{}.ecx", base); + let ecj_path = format!("{}.ecj", base); + let idx_path = format!("{}.idx", base); + + // Copy .ecx to .idx + std::fs::copy(&ecx_path, &idx_path)?; + + // Append deletions from .ecj as tombstones + if std::path::Path::new(&ecj_path).exists() { + let ecj_data = std::fs::read(&ecj_path)?; + if !ecj_data.is_empty() { + let mut idx_file = std::fs::OpenOptions::new() + .write(true) + .append(true) + .open(&idx_path)?; + + let count = ecj_data.len() / NEEDLE_ID_SIZE; + for i in 0..count { + let start = i * NEEDLE_ID_SIZE; + let needle_id = NeedleId::from_bytes(&ecj_data[start..start + NEEDLE_ID_SIZE]); + idx::write_index_entry( + &mut idx_file, + needle_id, + Offset::default(), + TOMBSTONE_FILE_SIZE, + )?; + } + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::erasure_coding::ec_encoder; + use crate::storage::needle::needle::Needle; + use crate::storage::needle_map::NeedleMapKind; + use crate::storage::volume::Volume; + use tempfile::TempDir; + + #[test] + fn test_ec_full_round_trip() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Create volume with data + let mut v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + let test_data: Vec<(NeedleId, Vec)> = (1..=3) + .map(|i| { + let data = format!("EC round trip data for needle {}", i); + (NeedleId(i), data.into_bytes()) + }) + .collect(); + + for (id, data) in &test_data { + let mut n = Needle { + id: *id, + cookie: Cookie(id.0 as u32), + data: data.clone(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + v.sync_to_disk().unwrap(); + let original_dat_size = v.dat_file_size().unwrap(); + v.close(); + + // Read original .dat for comparison + let original_dat = std::fs::read(format!("{}/1.dat", dir)).unwrap(); + + // Encode to EC + let data_shards = 10; + let parity_shards = 4; + ec_encoder::write_ec_files(dir, dir, "", VolumeId(1), data_shards, parity_shards).unwrap(); + + // Delete original .dat and .idx + std::fs::remove_file(format!("{}/1.dat", dir)).unwrap(); + std::fs::remove_file(format!("{}/1.idx", dir)).unwrap(); + + // Reconstruct from EC shards + write_dat_file_from_shards(dir, "", VolumeId(1), original_dat_size as i64, data_shards) + .unwrap(); + write_idx_file_from_ec_index(dir, "", VolumeId(1)).unwrap(); + + // Verify reconstructed .dat matches original + let reconstructed_dat = std::fs::read(format!("{}/1.dat", dir)).unwrap(); + assert_eq!( + original_dat[..original_dat_size as usize], + reconstructed_dat[..original_dat_size as usize], + "reconstructed .dat should match original" + ); + + // Verify we can load and read from reconstructed volume + let v2 = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + for (id, expected_data) in &test_data { + let mut n = Needle { + id: *id, + ..Needle::default() + }; + v2.read_needle(&mut n).unwrap(); + assert_eq!(&n.data, expected_data, "needle {} data should match", id); + } + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/ec_encoder.rs b/seaweed-volume/src/storage/erasure_coding/ec_encoder.rs new file mode 100644 index 000000000..b98db9fb0 --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/ec_encoder.rs @@ -0,0 +1,824 @@ +//! EC encoding: convert a .dat file into 10 data + 4 parity shards. +//! +//! Uses Reed-Solomon erasure coding. The .dat file is split into blocks +//! (1GB large, 1MB small) and encoded across 14 shard files. + +use std::fs::File; +use std::io; +#[cfg(not(unix))] +use std::io::{Seek, SeekFrom}; + +use reed_solomon_erasure::galois_8::ReedSolomon; + +use crate::storage::erasure_coding::ec_shard::*; +use crate::storage::idx; +use crate::storage::types::*; +use crate::storage::volume::volume_file_name; + +/// Encode a .dat file into EC shard files. +/// +/// Creates .ec00-.ec13 files in the same directory. +/// Also creates a sorted .ecx index from the .idx file. +pub fn write_ec_files( + dir: &str, + idx_dir: &str, + collection: &str, + volume_id: VolumeId, + data_shards: usize, + parity_shards: usize, +) -> io::Result<()> { + let base = volume_file_name(dir, collection, volume_id); + let dat_path = format!("{}.dat", base); + let idx_base = volume_file_name(idx_dir, collection, volume_id); + let idx_path = format!("{}.idx", idx_base); + + // Create sorted .ecx from .idx + write_sorted_ecx_from_idx(&idx_path, &format!("{}.ecx", base))?; + + // Encode .dat into shards + let dat_file = File::open(&dat_path)?; + let dat_size = dat_file.metadata()?.len() as i64; + + let rs = ReedSolomon::new(data_shards, parity_shards) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("reed-solomon init: {:?}", e)))?; + + // Create shard files + let total_shards = data_shards + parity_shards; + let mut shards: Vec = (0..total_shards as u8) + .map(|i| EcVolumeShard::new(dir, collection, volume_id, i)) + .collect(); + + for shard in &mut shards { + shard.create()?; + } + + // Encode in large blocks, then small blocks + encode_dat_file( + &dat_file, + dat_size, + &rs, + &mut shards, + data_shards, + parity_shards, + )?; + + // Close all shards + for shard in &mut shards { + shard.close(); + } + + Ok(()) +} + +/// Rebuild missing EC shard files from existing shards using Reed-Solomon reconstruct. +/// +/// This does not require the `.dat` file, only the existing `.ecXX` shard files. +pub fn rebuild_ec_files( + dir: &str, + collection: &str, + volume_id: VolumeId, + missing_shard_ids: &[u32], + data_shards: usize, + parity_shards: usize, +) -> io::Result<()> { + if missing_shard_ids.is_empty() { + return Ok(()); + } + + let rs = ReedSolomon::new(data_shards, parity_shards) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("reed-solomon init: {:?}", e)))?; + + let total_shards = data_shards + parity_shards; + let mut shards: Vec = (0..total_shards as u8) + .map(|i| EcVolumeShard::new(dir, collection, volume_id, i)) + .collect(); + + // Determine the exact shard size from the first available existing shard + let mut shard_size = 0; + for (i, shard) in shards.iter_mut().enumerate() { + if !missing_shard_ids.contains(&(i as u32)) { + if let Ok(_) = shard.open() { + let size = shard.file_size(); + if size > shard_size { + shard_size = size; + } + } else { + return Err(io::Error::new( + io::ErrorKind::NotFound, + format!("missing non-rebuild shard {}", i), + )); + } + } + } + + if shard_size == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "all existing shards are empty or cannot find an existing shard to determine size", + )); + } + + // Create the missing shards for writing + for i in missing_shard_ids { + if let Some(shard) = shards.get_mut(*i as usize) { + shard.create()?; + } + } + + let block_size = ERASURE_CODING_SMALL_BLOCK_SIZE; + let mut remaining = shard_size; + let mut offset: u64 = 0; + + // Process all data in blocks + while remaining > 0 { + let to_process = remaining.min(block_size as i64) as usize; + + // Allocate buffers for all shards. Option> is required by rs.reconstruct() + let mut buffers: Vec>> = vec![None; total_shards]; + + // Read available shards + for (i, shard) in shards.iter().enumerate() { + if !missing_shard_ids.contains(&(i as u32)) { + let mut buf = vec![0u8; to_process]; + shard.read_at(&mut buf, offset)?; + buffers[i] = Some(buf); + } + } + + // Reconstruct missing shards + rs.reconstruct(&mut buffers).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("reed-solomon reconstruct: {:?}", e), + ) + })?; + + // Write recovered data into the missing shards + for i in missing_shard_ids { + let idx = *i as usize; + if let Some(buf) = buffers[idx].take() { + shards[idx].write_all(&buf)?; + } + } + + offset += to_process as u64; + remaining -= to_process as i64; + } + + // Close all shards + for shard in &mut shards { + shard.close(); + } + + Ok(()) +} + +/// Verify EC shards by computing parity against the existing data and identifying corrupted shards. +pub fn verify_ec_shards( + dir: &str, + collection: &str, + volume_id: VolumeId, + data_shards: usize, + parity_shards: usize, +) -> io::Result<(Vec, Vec)> { + let rs = ReedSolomon::new(data_shards, parity_shards) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("reed-solomon init: {:?}", e)))?; + + let total_shards = data_shards + parity_shards; + let mut shards: Vec = (0..total_shards as u8) + .map(|i| EcVolumeShard::new(dir, collection, volume_id, i)) + .collect(); + + let mut shard_size = 0; + let mut broken_shards = std::collections::HashSet::new(); + let mut details = Vec::new(); + + for (i, shard) in shards.iter_mut().enumerate() { + if let Ok(_) = shard.open() { + let size = shard.file_size(); + if size > shard_size { + shard_size = size; + } + } else { + broken_shards.insert(i as u32); + details.push(format!("failed to open or missing shard {}", i)); + } + } + + if shard_size == 0 || broken_shards.len() >= parity_shards { + // Can't do much if we don't know the size or have too many missing + return Ok((broken_shards.into_iter().collect(), details)); + } + + let block_size = ERASURE_CODING_SMALL_BLOCK_SIZE; + let mut remaining = shard_size; + let mut offset: u64 = 0; + + while remaining > 0 { + let to_process = remaining.min(block_size as i64) as usize; + let mut buffers = vec![vec![0u8; to_process]; total_shards]; + + let mut read_failed = false; + for i in 0..total_shards { + if !broken_shards.contains(&(i as u32)) { + if let Err(e) = shards[i].read_at(&mut buffers[i], offset) { + broken_shards.insert(i as u32); + details.push(format!("read error shard {}: {}", i, e)); + read_failed = true; + } + } else { + read_failed = true; + } + } + + // Only do verification if all shards were readable + if !read_failed { + // Need to convert Vec> to &[&[u8]] for rs.verify + let slice_ptrs: Vec<&[u8]> = buffers.iter().map(|v| v.as_slice()).collect(); + if let Ok(is_valid) = rs.verify(&slice_ptrs) { + if !is_valid { + // Reed-Solomon verification failed. We cannot easily pinpoint which shard + // is corrupted without recalculating parities or syndromes, so we just + // log that this batch has corruption. Wait, we can test each parity shard! + // Let's re-encode from the first `data_shards` and compare to the actual `parity_shards`. + + let mut verify_buffers = buffers.clone(); + // Clear the parity parts + for i in data_shards..total_shards { + verify_buffers[i].fill(0); + } + if rs.encode(&mut verify_buffers).is_ok() { + for i in 0..total_shards { + if buffers[i] != verify_buffers[i] { + broken_shards.insert(i as u32); + details.push(format!( + "parity mismatch on shard {} at offset {}", + i, offset + )); + } + } + } + } + } + } + + offset += to_process as u64; + remaining -= to_process as i64; + } + + // Close all shards + for shard in &mut shards { + shard.close(); + } + + let mut broken_vec: Vec = broken_shards.into_iter().collect(); + broken_vec.sort_unstable(); + + Ok((broken_vec, details)) +} + +/// Write sorted .ecx index from .idx file. +fn write_sorted_ecx_from_idx(idx_path: &str, ecx_path: &str) -> io::Result<()> { + if !std::path::Path::new(idx_path).exists() { + return Err(io::Error::new( + io::ErrorKind::NotFound, + "idx file not found", + )); + } + + // Read all idx entries + let mut idx_file = File::open(idx_path)?; + let mut entries: Vec<(NeedleId, Offset, Size)> = Vec::new(); + + idx::walk_index_file(&mut idx_file, 0, |key, offset, size| { + entries.push((key, offset, size)); + Ok(()) + })?; + + // Sort by NeedleId, then by actual offset so later entries come last + entries.sort_by_key(|&(key, offset, _)| (key, offset.to_actual_offset())); + + // Remove duplicates (keep last/latest entry for each key). + // dedup_by_key keeps the first in each run, so we reverse first, + // dedup, then reverse back. + entries.reverse(); + entries.dedup_by_key(|entry| entry.0); + entries.reverse(); + + // Write sorted entries to .ecx + let mut ecx_file = File::create(ecx_path)?; + for &(key, offset, size) in &entries { + idx::write_index_entry(&mut ecx_file, key, offset, size)?; + } + + Ok(()) +} + +/// Rebuild the .ecx index file by walking needles in the EC data shards. +/// +/// This is the equivalent of Go's `RebuildEcxFile`. It reads the logical .dat +/// content from the EC data shards, walks through needle headers to extract +/// (needle_id, offset, size) entries, deduplicates them, and writes a sorted +/// .ecx index file. +pub fn rebuild_ecx_file( + dir: &str, + collection: &str, + volume_id: VolumeId, + data_shards: usize, +) -> io::Result<()> { + use crate::storage::needle::needle::get_actual_size; + use crate::storage::super_block::SUPER_BLOCK_SIZE; + + let base = volume_file_name(dir, collection, volume_id); + let ecx_path = format!("{}.ecx", base); + + // Open data shards to read logical .dat content + let mut shards: Vec = (0..data_shards as u8) + .map(|i| EcVolumeShard::new(dir, collection, volume_id, i)) + .collect(); + + for shard in &mut shards { + if let Err(_) = shard.open() { + // If a data shard is missing, we can't rebuild ecx + for s in &mut shards { + s.close(); + } + return Err(io::Error::new( + io::ErrorKind::NotFound, + format!("cannot open data shard for ecx rebuild"), + )); + } + } + + // Determine total logical data size from shard sizes + let shard_size = shards.iter().map(|s| s.file_size()).max().unwrap_or(0); + let total_data_size = shard_size as i64 * data_shards as i64; + + // Read version from superblock (first byte of logical data) + let mut sb_buf = [0u8; SUPER_BLOCK_SIZE]; + read_from_data_shards(&shards, &mut sb_buf, 0, data_shards)?; + let version = Version(sb_buf[0]); + + // Walk needles starting after superblock + let mut offset = SUPER_BLOCK_SIZE as i64; + let header_size = NEEDLE_HEADER_SIZE; + let mut entries: Vec<(NeedleId, Offset, Size)> = Vec::new(); + + while offset + header_size as i64 <= total_data_size { + // Read needle header (cookie + needle_id + size = 16 bytes) + let mut header_buf = [0u8; NEEDLE_HEADER_SIZE]; + if read_from_data_shards(&shards, &mut header_buf, offset as u64, data_shards).is_err() { + break; + } + + let cookie = Cookie::from_bytes(&header_buf[..COOKIE_SIZE]); + let needle_id = NeedleId::from_bytes(&header_buf[COOKIE_SIZE..COOKIE_SIZE + NEEDLE_ID_SIZE]); + let size = Size::from_bytes(&header_buf[COOKIE_SIZE + NEEDLE_ID_SIZE..header_size]); + + // Validate: stop if we hit zero cookie+id (end of data) + if cookie.0 == 0 && needle_id.0 == 0 { + break; + } + + // Validate size is reasonable + if size.0 < 0 && !size.is_deleted() { + break; + } + + let actual_size = get_actual_size(size, version); + if actual_size <= 0 || offset + actual_size > total_data_size { + break; + } + + entries.push((needle_id, Offset::from_actual_offset(offset), size)); + + // Advance to next needle (aligned to NEEDLE_PADDING_SIZE) + offset += actual_size; + let padding_rem = offset % NEEDLE_PADDING_SIZE as i64; + if padding_rem != 0 { + offset += NEEDLE_PADDING_SIZE as i64 - padding_rem; + } + } + + for shard in &mut shards { + shard.close(); + } + + // Sort by NeedleId, then by offset (later entries override earlier) + entries.sort_by_key(|&(key, offset, _)| (key, offset.to_actual_offset())); + + // Deduplicate: keep latest entry per needle_id + entries.reverse(); + entries.dedup_by_key(|entry| entry.0); + entries.reverse(); + + // Write sorted .ecx + let mut ecx_file = File::create(&ecx_path)?; + for &(key, offset, size) in &entries { + idx::write_index_entry(&mut ecx_file, key, offset, size)?; + } + ecx_file.sync_all()?; + + Ok(()) +} + +/// Read bytes from EC data shards at a logical offset in the .dat file. +fn read_from_data_shards( + shards: &[EcVolumeShard], + buf: &mut [u8], + logical_offset: u64, + data_shards: usize, +) -> io::Result<()> { + let small_block = ERASURE_CODING_SMALL_BLOCK_SIZE as u64; + let data_shards_u64 = data_shards as u64; + + let mut bytes_read = 0u64; + let mut remaining = buf.len() as u64; + let mut current_offset = logical_offset; + + while remaining > 0 { + // Determine which shard and at what shard-offset this logical offset maps to. + // The data is interleaved: large blocks first, then small blocks. + // For simplicity, use the small block size for all calculations since + // large blocks are multiples of small blocks. + let row_size = small_block * data_shards_u64; + let row_index = current_offset / row_size; + let row_offset = current_offset % row_size; + let shard_index = (row_offset / small_block) as usize; + let shard_offset = row_index * small_block + (row_offset % small_block); + + if shard_index >= data_shards { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "shard index out of range", + )); + } + + // How many bytes can we read from this position in this shard block + let bytes_left_in_block = small_block - (row_offset % small_block); + let to_read = remaining.min(bytes_left_in_block) as usize; + + let dest = &mut buf[bytes_read as usize..bytes_read as usize + to_read]; + shards[shard_index].read_at(dest, shard_offset)?; + + bytes_read += to_read as u64; + remaining -= to_read as u64; + current_offset += to_read as u64; + } + + Ok(()) +} + +/// Encode the .dat file data into shard files. +/// +/// Uses a two-phase approach matching Go's ec_encoder.go: +/// 1. Process as many large blocks (1GB) as possible +/// 2. Process remaining data with small blocks (1MB) +fn encode_dat_file( + dat_file: &File, + dat_size: i64, + rs: &ReedSolomon, + shards: &mut [EcVolumeShard], + data_shards: usize, + parity_shards: usize, +) -> io::Result<()> { + let mut remaining = dat_size; + let mut offset: u64 = 0; + + // Phase 1: Process large blocks (1GB each) while enough data remains + let large_block_size = ERASURE_CODING_LARGE_BLOCK_SIZE; + let large_row_size = large_block_size * data_shards; + + while remaining >= large_row_size as i64 { + encode_one_batch( + dat_file, + offset, + large_block_size, + rs, + shards, + data_shards, + parity_shards, + )?; + offset += large_row_size as u64; + remaining -= large_row_size as i64; + } + + // Phase 2: Process remaining data with small blocks (1MB each) + let small_block_size = ERASURE_CODING_SMALL_BLOCK_SIZE; + let small_row_size = small_block_size * data_shards; + + while remaining > 0 { + let to_process = remaining.min(small_row_size as i64); + encode_one_batch( + dat_file, + offset, + small_block_size, + rs, + shards, + data_shards, + parity_shards, + )?; + offset += to_process as u64; + remaining -= to_process; + } + + Ok(()) +} + +/// Encode one batch (row) of data. +fn encode_one_batch( + dat_file: &File, + offset: u64, + block_size: usize, + rs: &ReedSolomon, + shards: &mut [EcVolumeShard], + data_shards: usize, + parity_shards: usize, +) -> io::Result<()> { + let total_shards = data_shards + parity_shards; + // Each batch allocates block_size * total_shards bytes. + // With large blocks (1 GiB) this is 14 GiB -- guard against OOM. + let total_alloc = block_size.checked_mul(total_shards).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "block_size * shard count overflows usize", + ) + })?; + // Large-block encoding uses 1 GiB * 14 shards = 14 GiB; allow up to 16 GiB. + const MAX_BATCH_ALLOC: usize = 16 * 1024 * 1024 * 1024; // 16 GiB safety limit + if total_alloc > MAX_BATCH_ALLOC { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!( + "batch allocation too large ({} bytes, limit {} bytes); block_size={} shards={}", + total_alloc, MAX_BATCH_ALLOC, block_size, total_shards, + ), + )); + } + + // Allocate buffers for all shards + let mut buffers: Vec> = (0..total_shards).map(|_| vec![0u8; block_size]).collect(); + + // Read data shards from .dat file + for i in 0..data_shards { + let read_offset = offset + (i * block_size) as u64; + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + dat_file.read_at(&mut buffers[i], read_offset)?; + } + + #[cfg(not(unix))] + { + let mut f = dat_file.try_clone()?; + f.seek(SeekFrom::Start(read_offset))?; + f.read(&mut buffers[i])?; + } + } + + // Encode parity shards + rs.encode(&mut buffers).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("reed-solomon encode: {:?}", e), + ) + })?; + + // Write all shard buffers to files + for (i, buf) in buffers.iter().enumerate() { + shards[i].write_all(buf)?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::needle::needle::Needle; + use crate::storage::needle_map::NeedleMapKind; + use crate::storage::volume::Volume; + use tempfile::TempDir; + + #[test] + fn test_ec_encode_decode_round_trip() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Create a volume with some data + let mut v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + for i in 1..=5 { + let data = format!("test data for needle {}", i); + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: data.as_bytes().to_vec(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + v.sync_to_disk().unwrap(); + v.close(); + + // Encode to EC shards + let data_shards = 10; + let parity_shards = 4; + let total_shards = data_shards + parity_shards; + write_ec_files(dir, dir, "", VolumeId(1), data_shards, parity_shards).unwrap(); + + // Verify shard files exist + for i in 0..total_shards { + let path = format!("{}/{}.ec{:02}", dir, 1, i); + assert!( + std::path::Path::new(&path).exists(), + "shard file {} should exist", + path + ); + } + + // Verify .ecx exists + let ecx_path = format!("{}/1.ecx", dir); + assert!(std::path::Path::new(&ecx_path).exists()); + } + + #[test] + fn test_reed_solomon_basic() { + let data_shards = 10; + let parity_shards = 4; + let total_shards = data_shards + parity_shards; + let rs = ReedSolomon::new(data_shards, parity_shards).unwrap(); + let block_size = 1024; + let mut shards: Vec> = (0..total_shards) + .map(|i| { + if i < data_shards { + vec![(i as u8).wrapping_mul(7); block_size] + } else { + vec![0u8; block_size] + } + }) + .collect(); + + // Encode + rs.encode(&mut shards).unwrap(); + + // Verify parity is non-zero (at least some) + let parity_nonzero: bool = shards[data_shards..] + .iter() + .any(|s| s.iter().any(|&b| b != 0)); + assert!(parity_nonzero); + + // Simulate losing 4 shards and reconstructing + let original_0 = shards[0].clone(); + let original_1 = shards[1].clone(); + + let mut shard_opts: Vec>> = shards.into_iter().map(Some).collect(); + shard_opts[0] = None; + shard_opts[1] = None; + shard_opts[2] = None; + shard_opts[3] = None; + + rs.reconstruct(&mut shard_opts).unwrap(); + + assert_eq!(shard_opts[0].as_ref().unwrap(), &original_0); + assert_eq!(shard_opts[1].as_ref().unwrap(), &original_1); + } + + /// EC encode must read .idx from a separate index directory when configured. + #[test] + fn test_ec_encode_with_separate_idx_dir() { + let dat_tmp = TempDir::new().unwrap(); + let idx_tmp = TempDir::new().unwrap(); + let dat_dir = dat_tmp.path().to_str().unwrap(); + let idx_dir = idx_tmp.path().to_str().unwrap(); + + // Create a volume with separate data and index directories + let mut v = Volume::new( + dat_dir, + idx_dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + for i in 1..=5 { + let data = format!("needle {} payload", i); + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: data.as_bytes().to_vec(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + v.sync_to_disk().unwrap(); + v.close(); + + // Verify .dat is in data dir, .idx is in idx dir + assert!(std::path::Path::new(&format!("{}/1.dat", dat_dir)).exists()); + assert!(!std::path::Path::new(&format!("{}/1.idx", dat_dir)).exists()); + assert!(std::path::Path::new(&format!("{}/1.idx", idx_dir)).exists()); + assert!(!std::path::Path::new(&format!("{}/1.dat", idx_dir)).exists()); + + // EC encode with separate idx dir + let data_shards = 10; + let parity_shards = 4; + let total_shards = data_shards + parity_shards; + write_ec_files( + dat_dir, + idx_dir, + "", + VolumeId(1), + data_shards, + parity_shards, + ) + .unwrap(); + + // Verify all 14 shard files in data dir + for i in 0..total_shards { + let path = format!("{}/1.ec{:02}", dat_dir, i); + assert!( + std::path::Path::new(&path).exists(), + "shard {} should exist in data dir", + path + ); + } + + // Verify .ecx in data dir (not idx dir) + assert!(std::path::Path::new(&format!("{}/1.ecx", dat_dir)).exists()); + assert!(!std::path::Path::new(&format!("{}/1.ecx", idx_dir)).exists()); + + // Verify no shard files leaked into idx dir + for i in 0..total_shards { + let path = format!("{}/1.ec{:02}", idx_dir, i); + assert!( + !std::path::Path::new(&path).exists(), + "shard {} should NOT exist in idx dir", + path + ); + } + } + + /// EC encode should fail gracefully when .idx is only in the data dir + /// but we pass a wrong idx_dir. This guards against regressions where + /// write_ec_files ignores the idx_dir parameter. + #[test] + fn test_ec_encode_fails_with_wrong_idx_dir() { + let dat_tmp = TempDir::new().unwrap(); + let idx_tmp = TempDir::new().unwrap(); + let wrong_tmp = TempDir::new().unwrap(); + let dat_dir = dat_tmp.path().to_str().unwrap(); + let idx_dir = idx_tmp.path().to_str().unwrap(); + let wrong_dir = wrong_tmp.path().to_str().unwrap(); + + let mut v = Volume::new( + dat_dir, + idx_dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(1), + data: b"hello".to_vec(), + data_size: 5, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + v.sync_to_disk().unwrap(); + v.close(); + + // Should fail: .idx is in idx_dir, not wrong_dir + let result = write_ec_files(dat_dir, wrong_dir, "", VolumeId(1), 10, 4); + assert!( + result.is_err(), + "should fail when idx_dir doesn't contain .idx" + ); + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/ec_locate.rs b/seaweed-volume/src/storage/erasure_coding/ec_locate.rs new file mode 100644 index 000000000..4c1f06aa2 --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/ec_locate.rs @@ -0,0 +1,223 @@ +//! EC data location: maps needle offset/size to shard intervals. +//! +//! Determines which shard(s) contain data for a given needle and at what +//! offsets within those shards. Handles both large (1GB) and small (1MB) +//! block sections. + +use crate::storage::erasure_coding::ec_shard::*; +use crate::storage::types::*; + +/// An interval to read from EC shards. +#[derive(Debug, Clone)] +pub struct Interval { + pub block_index: usize, + pub inner_block_offset: i64, + pub size: i64, + pub is_large_block: bool, + pub large_block_rows_count: usize, +} + +impl Interval { + pub fn to_shard_id_and_offset(&self, data_shards: u32) -> (ShardId, i64) { + let data_shards_usize = data_shards as usize; + let shard_id = (self.block_index % data_shards_usize) as ShardId; + let row_index = self.block_index / data_shards_usize; + + let block_size = if self.is_large_block { + ERASURE_CODING_LARGE_BLOCK_SIZE as i64 + } else { + ERASURE_CODING_SMALL_BLOCK_SIZE as i64 + }; + + let mut offset = row_index as i64 * block_size + self.inner_block_offset; + if !self.is_large_block { + // Small blocks come after large blocks in the shard file + offset += self.large_block_rows_count as i64 * ERASURE_CODING_LARGE_BLOCK_SIZE as i64; + } + + (shard_id, offset) + } +} + +/// Locate the EC shard intervals needed to read data at the given offset and size. +/// +/// `shard_size` is the size of a single shard file. +pub fn locate_data(offset: i64, size: Size, shard_size: i64, data_shards: u32) -> Vec { + let mut intervals = Vec::new(); + let data_size = size.0 as i64; + + if data_size <= 0 || shard_size <= 0 { + return intervals; + } + + let large_block_size = ERASURE_CODING_LARGE_BLOCK_SIZE as i64; + let small_block_size = ERASURE_CODING_SMALL_BLOCK_SIZE as i64; + let large_row_size = large_block_size * data_shards as i64; + let small_row_size = small_block_size * data_shards as i64; + + // Number of large block rows + let n_large_block_rows = if shard_size > 0 { + ((shard_size - 1) / large_block_size) as usize + } else { + 0 + }; + let large_section_size = n_large_block_rows as i64 * large_row_size; + + let mut remaining_offset = offset; + let mut remaining_size = data_size; + + // In large block section? + if remaining_offset < large_section_size { + let available_in_large = large_section_size - remaining_offset; + let to_read = remaining_size.min(available_in_large); + + add_intervals( + &mut intervals, + remaining_offset, + to_read, + large_block_size, + large_row_size, + true, + n_large_block_rows, + ); + + remaining_offset += to_read; + remaining_size -= to_read; + } + + // In small block section? + if remaining_size > 0 { + let small_offset = remaining_offset - large_section_size; + add_intervals( + &mut intervals, + small_offset, + remaining_size, + small_block_size, + small_row_size, + false, + n_large_block_rows, + ); + } + + intervals +} + +fn add_intervals( + intervals: &mut Vec, + offset: i64, + size: i64, + block_size: i64, + _row_size: i64, + is_large_block: bool, + large_block_rows_count: usize, +) { + let mut pos = offset; + let end = offset + size; + + while pos < end { + let block_index = (pos / block_size) as usize; + let inner_offset = pos % block_size; + let remaining_in_block = block_size - inner_offset; + let interval_size = remaining_in_block.min(end - pos); + + intervals.push(Interval { + block_index, + inner_block_offset: inner_offset, + size: interval_size, + is_large_block, + large_block_rows_count, + }); + + pos += interval_size; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_interval_to_shard_id() { + let data_shards = 10; + let large_block_size = ERASURE_CODING_LARGE_BLOCK_SIZE as i64; + let _shard_size = 1024 * 1024; // Example shard size + + // Block index 0 → shard 0 + let interval = Interval { + block_index: 0, + inner_block_offset: 100, + size: 50, + is_large_block: true, + large_block_rows_count: 1, + }; + let (shard_id, offset) = interval.to_shard_id_and_offset(data_shards); + assert_eq!(shard_id, 0); + assert_eq!(offset, 100); + + // Block index 5 → shard 5 + let interval = Interval { + block_index: 5, + inner_block_offset: 0, + size: 1024, + is_large_block: true, + large_block_rows_count: 1, + }; + let (shard_id, _offset) = interval.to_shard_id_and_offset(data_shards); + assert_eq!(shard_id, 5); + + // Block index 12 (data_shards=10) → row_index 1, shard_id 2 + let interval = Interval { + block_index: 12, + inner_block_offset: 200, + size: 50, + is_large_block: true, + large_block_rows_count: 5, + }; + let (shard_id, offset) = interval.to_shard_id_and_offset(data_shards); + assert_eq!(shard_id, 2); // 12 % 10 = 2 + assert_eq!(offset, large_block_size + 200); // row 1 offset + inner_block_offset + + // Block index 10 → shard 0 (second row) + let interval = Interval { + block_index: 10, + inner_block_offset: 0, + size: 100, + is_large_block: true, + large_block_rows_count: 2, + }; + let (shard_id, offset) = interval.to_shard_id_and_offset(data_shards); + assert_eq!(shard_id, 0); + assert_eq!(offset, ERASURE_CODING_LARGE_BLOCK_SIZE as i64); // row 1 offset + } + + #[test] + fn test_locate_data_small_file() { + // Small file: 100 bytes at offset 50, shard size = 1MB + let intervals = locate_data(50, Size(100), 1024 * 1024, 10); + assert!(!intervals.is_empty()); + + // Should be a single small block interval (no large block rows for 1MB shard) + assert_eq!(intervals.len(), 1); + assert!(!intervals[0].is_large_block); + } + + #[test] + fn test_locate_data_empty() { + let intervals = locate_data(0, Size(0), 1024 * 1024, 10); + assert!(intervals.is_empty()); + } + + #[test] + fn test_small_block_after_large() { + let interval = Interval { + block_index: 0, + inner_block_offset: 0, + size: 100, + is_large_block: false, + large_block_rows_count: 2, + }; + let (_shard_id, offset) = interval.to_shard_id_and_offset(10); + // Should be after 2 large block rows + assert_eq!(offset, 2 * ERASURE_CODING_LARGE_BLOCK_SIZE as i64); + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/ec_shard.rs b/seaweed-volume/src/storage/erasure_coding/ec_shard.rs new file mode 100644 index 000000000..6a6a8d6ea --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/ec_shard.rs @@ -0,0 +1,225 @@ +//! EcVolumeShard: a single shard file (.ec00-.ec13) of an erasure-coded volume. + +use std::fs::{self, File, OpenOptions}; +use std::io::{self, Write}; + +use crate::storage::types::*; + +pub const DATA_SHARDS_COUNT: usize = 10; +pub const PARITY_SHARDS_COUNT: usize = 4; +pub const TOTAL_SHARDS_COUNT: usize = DATA_SHARDS_COUNT + PARITY_SHARDS_COUNT; +pub const MAX_SHARD_COUNT: usize = 32; +pub const MIN_TOTAL_DISKS: usize = TOTAL_SHARDS_COUNT / PARITY_SHARDS_COUNT + 1; +pub const ERASURE_CODING_LARGE_BLOCK_SIZE: usize = 1024 * 1024 * 1024; // 1GB +pub const ERASURE_CODING_SMALL_BLOCK_SIZE: usize = 1024 * 1024; // 1MB + +pub type ShardId = u8; + +/// A single erasure-coded shard file. +pub struct EcVolumeShard { + pub volume_id: VolumeId, + pub shard_id: ShardId, + pub collection: String, + pub dir: String, + pub disk_type: DiskType, + ecd_file: Option, + ecd_file_size: i64, +} + +impl EcVolumeShard { + /// Create a new shard reference (does not open the file). + pub fn new(dir: &str, collection: &str, volume_id: VolumeId, shard_id: ShardId) -> Self { + EcVolumeShard { + volume_id, + shard_id, + collection: collection.to_string(), + dir: dir.to_string(), + disk_type: DiskType::default(), + ecd_file: None, + ecd_file_size: 0, + } + } + + /// Shard file name, e.g. "dir/collection_42.ec03" + pub fn file_name(&self) -> String { + let base = + crate::storage::volume::volume_file_name(&self.dir, &self.collection, self.volume_id); + format!("{}.ec{:02}", base, self.shard_id) + } + + /// Open the shard file for reading. + pub fn open(&mut self) -> io::Result<()> { + let path = self.file_name(); + let file = File::open(&path)?; + self.ecd_file_size = file.metadata()?.len() as i64; + self.ecd_file = Some(file); + Ok(()) + } + + /// Create the shard file for writing. + pub fn create(&mut self) -> io::Result<()> { + let path = self.file_name(); + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(&path)?; + self.ecd_file = Some(file); + self.ecd_file_size = 0; + Ok(()) + } + + /// Read data at a specific offset. + pub fn read_at(&self, buf: &mut [u8], offset: u64) -> io::Result { + let file = self + .ecd_file + .as_ref() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "shard file not open"))?; + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + file.read_at(buf, offset) + } + + #[cfg(not(unix))] + { + use std::io::{Read, Seek, SeekFrom}; + // File::read_at is unix-only; fall back to seek + read. + // We need a mutable reference for seek/read, so clone the handle. + let mut f = file.try_clone()?; + f.seek(SeekFrom::Start(offset))?; + f.read(buf) + } + } + + /// Write data to the shard file (appends). + pub fn write_all(&mut self, data: &[u8]) -> io::Result<()> { + let file = self + .ecd_file + .as_mut() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "shard file not open"))?; + file.write_all(data)?; + self.ecd_file_size += data.len() as i64; + Ok(()) + } + + pub fn file_size(&self) -> i64 { + self.ecd_file_size + } + + /// Close the shard file. + pub fn close(&mut self) { + if let Some(ref file) = self.ecd_file { + let _ = file.sync_all(); + } + self.ecd_file = None; + } + + /// Delete the shard file from disk. + pub fn destroy(&mut self) { + self.close(); + let _ = fs::remove_file(self.file_name()); + } +} + +/// ShardBits: bitmap tracking which shards are present. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct ShardBits(pub u32); + +impl ShardBits { + pub fn add_shard_id(&mut self, id: ShardId) { + assert!((id as usize) < 32, "shard id {} out of bounds (max 31)", id,); + self.0 |= 1 << id; + } + + pub fn remove_shard_id(&mut self, id: ShardId) { + assert!((id as usize) < 32, "shard id {} out of bounds (max 31)", id,); + self.0 &= !(1 << id); + } + + pub fn has_shard_id(&self, id: ShardId) -> bool { + if (id as usize) >= 32 { + return false; + } + self.0 & (1 << id) != 0 + } + + pub fn shard_id_count(&self) -> usize { + self.0.count_ones() as usize + } + + /// Iterator over present shard IDs. + pub fn shard_ids(&self) -> Vec { + let mut ids = Vec::with_capacity(self.shard_id_count()); + for i in 0..32 { + if self.has_shard_id(i) { + ids.push(i); + } + } + ids + } + + pub fn minus(&self, other: ShardBits) -> ShardBits { + ShardBits(self.0 & !other.0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_shard_bits() { + let mut bits = ShardBits::default(); + assert_eq!(bits.shard_id_count(), 0); + + bits.add_shard_id(0); + bits.add_shard_id(3); + bits.add_shard_id(13); + assert_eq!(bits.shard_id_count(), 3); + assert!(bits.has_shard_id(0)); + assert!(bits.has_shard_id(3)); + assert!(!bits.has_shard_id(1)); + + bits.remove_shard_id(3); + assert!(!bits.has_shard_id(3)); + assert_eq!(bits.shard_id_count(), 2); + } + + #[test] + fn test_shard_bits_ids() { + let mut bits = ShardBits::default(); + bits.add_shard_id(1); + bits.add_shard_id(5); + bits.add_shard_id(9); + assert_eq!(bits.shard_ids(), vec![1, 5, 9]); + } + + #[test] + fn test_shard_bits_minus() { + let mut a = ShardBits::default(); + a.add_shard_id(0); + a.add_shard_id(1); + a.add_shard_id(2); + + let mut b = ShardBits::default(); + b.add_shard_id(1); + + let c = a.minus(b); + assert_eq!(c.shard_ids(), vec![0, 2]); + } + + #[test] + fn test_shard_file_name() { + let shard = EcVolumeShard::new("/data", "pics", VolumeId(42), 3); + assert_eq!(shard.file_name(), "/data/pics_42.ec03"); + } + + #[test] + fn test_shard_file_name_no_collection() { + let shard = EcVolumeShard::new("/data", "", VolumeId(7), 13); + assert_eq!(shard.file_name(), "/data/7.ec13"); + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/ec_volume.rs b/seaweed-volume/src/storage/erasure_coding/ec_volume.rs new file mode 100644 index 000000000..24967c04a --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/ec_volume.rs @@ -0,0 +1,944 @@ +//! EcVolume: an erasure-coded volume with up to 14 shards. +//! +//! Each EcVolume has a sorted index (.ecx) and a deletion journal (.ecj). +//! Shards (.ec00-.ec13) may be distributed across multiple servers. + +use std::collections::HashMap; +use std::fs::{self, File, OpenOptions}; +use std::io::{self, Write}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use crate::pb::master_pb; +use crate::storage::erasure_coding::ec_locate; +use crate::storage::erasure_coding::ec_shard::*; +use crate::storage::needle::needle::{get_actual_size, Needle}; +use crate::storage::types::*; + +/// An erasure-coded volume managing its local shards and index. +pub struct EcVolume { + pub volume_id: VolumeId, + pub collection: String, + pub dir: String, + pub dir_idx: String, + pub version: Version, + pub shards: Vec>, // indexed by ShardId (0..14) + pub dat_file_size: i64, + pub data_shards: u32, + pub parity_shards: u32, + ecx_file: Option, + ecx_file_size: i64, + ecj_file: Option, + pub disk_type: DiskType, + /// Directory where .ecx/.ecj were actually found (may differ from dir_idx after fallback). + ecx_actual_dir: String, + /// Maps shard ID -> list of server addresses where that shard exists. + /// Used for distributed EC reads across the cluster. + pub shard_locations: HashMap>, + /// EC volume expiration time (unix epoch seconds), set during EC encode from TTL. + pub expire_at_sec: u64, +} + +pub fn read_ec_shard_config(dir: &str, collection: &str, volume_id: VolumeId) -> (u32, u32) { + let mut data_shards = crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT as u32; + let mut parity_shards = crate::storage::erasure_coding::ec_shard::PARITY_SHARDS_COUNT as u32; + let base = crate::storage::volume::volume_file_name(dir, collection, volume_id); + let vif_path = format!("{}.vif", base); + if let Ok(vif_content) = std::fs::read_to_string(&vif_path) { + if let Ok(vif_info) = + serde_json::from_str::(&vif_content) + { + if let Some(ec) = vif_info.ec_shard_config { + if ec.data_shards > 0 + && ec.parity_shards > 0 + && (ec.data_shards + ec.parity_shards) <= TOTAL_SHARDS_COUNT as u32 + { + data_shards = ec.data_shards; + parity_shards = ec.parity_shards; + } + } + } + } + (data_shards, parity_shards) +} + +impl EcVolume { + /// Create a new EcVolume. Loads .ecx index and .ecj journal if present. + pub fn new( + dir: &str, + dir_idx: &str, + collection: &str, + volume_id: VolumeId, + ) -> io::Result { + let (data_shards, parity_shards) = read_ec_shard_config(dir, collection, volume_id); + + let total_shards = (data_shards + parity_shards) as usize; + let mut shards = Vec::with_capacity(total_shards); + for _ in 0..total_shards { + shards.push(None); + } + + // Read expire_at_sec and version from .vif if present (matches Go's MaybeLoadVolumeInfo) + let (expire_at_sec, vif_version) = { + let base = crate::storage::volume::volume_file_name(dir, collection, volume_id); + let vif_path = format!("{}.vif", base); + if let Ok(vif_content) = std::fs::read_to_string(&vif_path) { + if let Ok(vif_info) = + serde_json::from_str::(&vif_content) + { + let ver = if vif_info.version > 0 { + Version(vif_info.version as u8) + } else { + Version::current() + }; + (vif_info.expire_at_sec, ver) + } else { + (0, Version::current()) + } + } else { + (0, Version::current()) + } + }; + + let mut vol = EcVolume { + volume_id, + collection: collection.to_string(), + dir: dir.to_string(), + dir_idx: dir_idx.to_string(), + version: vif_version, + shards, + dat_file_size: 0, + data_shards, + parity_shards, + ecx_file: None, + ecx_file_size: 0, + ecj_file: None, + disk_type: DiskType::default(), + ecx_actual_dir: dir_idx.to_string(), + shard_locations: HashMap::new(), + expire_at_sec, + }; + + // Open .ecx file (sorted index) in read/write mode for in-place deletion marking. + // Matches Go which opens ecx for writing via MarkNeedleDeleted. + let ecx_path = vol.ecx_file_name(); + if std::path::Path::new(&ecx_path).exists() { + let file = OpenOptions::new().read(true).write(true).open(&ecx_path)?; + vol.ecx_file_size = file.metadata()?.len() as i64; + vol.ecx_file = Some(file); + } else if dir_idx != dir { + // Fall back to data directory if .ecx was created before -dir.idx was configured + let data_base = crate::storage::volume::volume_file_name(dir, collection, volume_id); + let fallback_ecx = format!("{}.ecx", data_base); + if std::path::Path::new(&fallback_ecx).exists() { + tracing::info!( + volume_id = volume_id.0, + "ecx file not found in idx dir, falling back to data dir" + ); + let file = OpenOptions::new().read(true).write(true).open(&fallback_ecx)?; + vol.ecx_file_size = file.metadata()?.len() as i64; + vol.ecx_file = Some(file); + vol.ecx_actual_dir = dir.to_string(); + } + } + + // Replay .ecj journal into .ecx on startup (matches Go's RebuildEcxFile). + vol.rebuild_ecx_from_journal()?; + + // Open .ecj file (deletion journal) — use ecx_actual_dir for consistency + let ecj_base = + crate::storage::volume::volume_file_name(&vol.ecx_actual_dir, collection, volume_id); + let ecj_path = format!("{}.ecj", ecj_base); + let ecj_file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .append(true) + .open(&ecj_path)?; + vol.ecj_file = Some(ecj_file); + + Ok(vol) + } + + // ---- File names ---- + + #[allow(dead_code)] + fn base_name(&self) -> String { + crate::storage::volume::volume_file_name(&self.dir, &self.collection, self.volume_id) + } + + fn idx_base_name(&self) -> String { + crate::storage::volume::volume_file_name(&self.dir_idx, &self.collection, self.volume_id) + } + + pub fn ecx_file_name(&self) -> String { + format!("{}.ecx", self.idx_base_name()) + } + + pub fn ecj_file_name(&self) -> String { + format!("{}.ecj", self.idx_base_name()) + } + + /// Sync the EC volume's journal and index files to disk (matching Go's ecv.Sync()). + /// Go flushes both .ecj and .ecx to ensure in-place deletion marks are persisted. + pub fn sync_to_disk(&self) -> io::Result<()> { + if let Some(ref ecj_file) = self.ecj_file { + ecj_file.sync_all()?; + } + if let Some(ref ecx_file) = self.ecx_file { + ecx_file.sync_all()?; + } + Ok(()) + } + + // ---- Shard management ---- + + /// Add a shard to this volume. + pub fn add_shard(&mut self, mut shard: EcVolumeShard) -> io::Result<()> { + let id = shard.shard_id as usize; + let total_shards = (self.data_shards + self.parity_shards) as usize; + if id >= total_shards { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid shard id: {} (max {})", id, total_shards - 1), + )); + } + shard.open()?; + self.shards[id] = Some(shard); + Ok(()) + } + + /// Remove and close a shard. + pub fn remove_shard(&mut self, shard_id: ShardId) { + if let Some(ref mut shard) = self.shards[shard_id as usize] { + shard.close(); + } + self.shards[shard_id as usize] = None; + } + + /// Get a ShardBits bitmap of locally available shards. + pub fn shard_bits(&self) -> ShardBits { + let mut bits = ShardBits::default(); + for (i, shard) in self.shards.iter().enumerate() { + if shard.is_some() { + bits.add_shard_id(i as ShardId); + } + } + bits + } + + /// Count of locally available shards. + pub fn shard_count(&self) -> usize { + self.shards.iter().filter(|s| s.is_some()).count() + } + + pub fn is_time_to_destroy(&self) -> bool { + self.expire_at_sec > 0 + && SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + > self.expire_at_sec + } + + pub fn to_volume_ec_shard_information_messages( + &self, + disk_id: u32, + ) -> Vec { + let mut ec_index_bits: u32 = 0; + let mut shard_sizes = Vec::new(); + for shard in self.shards.iter().flatten() { + ec_index_bits |= 1u32 << shard.shard_id; + shard_sizes.push(shard.file_size()); + } + + if ec_index_bits == 0 { + return Vec::new(); + } + + vec![master_pb::VolumeEcShardInformationMessage { + id: self.volume_id.0, + collection: self.collection.clone(), + ec_index_bits, + shard_sizes, + disk_type: self.disk_type.to_string(), + expire_at_sec: self.expire_at_sec, + disk_id, + ..Default::default() + }] + } + + // ---- Shard locations (distributed tracking) ---- + + /// Set the list of server addresses for a given shard ID. + pub fn set_shard_locations(&mut self, shard_id: ShardId, locations: Vec) { + self.shard_locations.insert(shard_id, locations); + } + + /// Get the list of server addresses for a given shard ID. + pub fn get_shard_locations(&self, shard_id: ShardId) -> &[String] { + self.shard_locations + .get(&shard_id) + .map(|v| v.as_slice()) + .unwrap_or(&[]) + } + + // ---- Index operations ---- + + /// Find a needle's offset and size in the sorted .ecx index via binary search. + pub fn find_needle_from_ecx(&self, needle_id: NeedleId) -> io::Result> { + let ecx_file = self + .ecx_file + .as_ref() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "ecx file not open"))?; + + let entry_count = self.ecx_file_size as usize / NEEDLE_MAP_ENTRY_SIZE; + if entry_count == 0 { + return Ok(None); + } + + // Binary search + let mut lo: usize = 0; + let mut hi: usize = entry_count; + let mut entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + + while lo < hi { + let mid = lo + (hi - lo) / 2; + let file_offset = (mid * NEEDLE_MAP_ENTRY_SIZE) as u64; + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + ecx_file.read_exact_at(&mut entry_buf, file_offset)?; + } + + let (key, offset, size) = idx_entry_from_bytes(&entry_buf); + if key == needle_id { + return Ok(Some((offset, size))); + } else if key < needle_id { + lo = mid + 1; + } else { + hi = mid; + } + } + + Ok(None) + } + + /// Locate the EC shard intervals needed to read a needle. + pub fn locate_needle( + &self, + needle_id: NeedleId, + ) -> io::Result)>> { + let (offset, size) = match self.find_needle_from_ecx(needle_id)? { + Some((o, s)) => (o, s), + None => return Ok(None), + }; + + if size.is_deleted() || offset.is_zero() { + return Ok(None); + } + + // Match Go's LocateEcShardNeedleInterval: shardSize = shard.ecdFileSize - 1 + // Shards are usually padded to ErasureCodingSmallBlockSize, so subtract 1 + // to avoid off-by-one in large block row count calculation. + // If datFileSize is known, use datFileSize / DataShards instead. + let shard_size = if self.dat_file_size > 0 { + self.dat_file_size / self.data_shards as i64 + } else { + self.shard_file_size() - 1 + }; + // Pass the actual on-disk size (header+body+checksum+timestamp+padding) + // to locate_data, matching Go: types.Size(needle.GetActualSize(size, version)) + let actual = get_actual_size(size, self.version); + let intervals = ec_locate::locate_data( + offset.to_actual_offset(), + Size(actual as i32), + shard_size, + self.data_shards, + ); + + Ok(Some((offset, size, intervals))) + } + + /// Read a full needle from locally available EC shards. + /// + /// Locates the needle in the .ecx index, determines which shard intervals + /// contain its data, reads from local shards, and parses the result into + /// a fully populated Needle (including last_modified, checksum, ttl). + /// + /// Returns `Ok(None)` if the needle is not found or is deleted. + /// Returns an error if a required shard is not available locally. + pub fn read_ec_shard_needle(&self, needle_id: NeedleId) -> io::Result> { + let (offset, size, intervals) = match self.locate_needle(needle_id)? { + Some(v) => v, + None => return Ok(None), + }; + + if intervals.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "no intervals for needle", + )); + } + + // Compute the total bytes we need to read (full needle on disk) + let actual_size = get_actual_size(size, self.version) as usize; + let mut bytes = Vec::with_capacity(actual_size); + + for interval in &intervals { + let (shard_id, shard_offset) = interval.to_shard_id_and_offset(self.data_shards); + let shard = self + .shards + .get(shard_id as usize) + .and_then(|s| s.as_ref()) + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::NotFound, + format!("ec shard {} not available locally", shard_id), + ) + })?; + + let mut buf = vec![0u8; interval.size as usize]; + shard.read_at(&mut buf, shard_offset as u64)?; + bytes.extend_from_slice(&buf); + } + + // Truncate to exact actual_size (intervals may span more than needed) + bytes.truncate(actual_size); + + if bytes.len() < actual_size { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!( + "read {} bytes but need {} for needle {}", + bytes.len(), + actual_size, + needle_id + ), + )); + } + + let mut n = Needle::default(); + n.read_bytes(&bytes, offset.to_actual_offset(), size, self.version) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, format!("{}", e)))?; + + Ok(Some(n)) + } + + /// Get the size of a single shard (all shards are the same size). + fn shard_file_size(&self) -> i64 { + for shard in &self.shards { + if let Some(s) = shard { + return s.file_size(); + } + } + 0 + } + + /// Walk the .ecx index and return (file_count, file_deleted_count, total_size). + /// total_size sums size.Raw() for all entries (including deleted), matching Go's WalkIndex. + pub fn walk_ecx_stats(&self) -> io::Result<(u64, u64, u64)> { + let ecx_file = match self.ecx_file.as_ref() { + Some(f) => f, + None => return Ok((0, 0, 0)), + }; + + let entry_count = self.ecx_file_size as usize / NEEDLE_MAP_ENTRY_SIZE; + let mut files: u64 = 0; + let mut files_deleted: u64 = 0; + let mut total_size: u64 = 0; + let mut entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + + for i in 0..entry_count { + let file_offset = (i * NEEDLE_MAP_ENTRY_SIZE) as u64; + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + ecx_file.read_exact_at(&mut entry_buf, file_offset)?; + } + let (_key, _offset, size) = idx_entry_from_bytes(&entry_buf); + // Match Go's Size.Raw(): tombstone (-1) returns 0, other negatives return abs + if !size.is_tombstone() { + total_size += size.0.unsigned_abs() as u64; + } + if size.is_deleted() { + files_deleted += 1; + } else { + files += 1; + } + } + + Ok((files, files_deleted, total_size)) + } + + /// ScrubIndex verifies index integrity of an EC volume. + /// Matches Go's `(ev *EcVolume) ScrubIndex()` → `idx.CheckIndexFile()`. + /// Returns (entry_count, errors). + pub fn scrub_index(&self) -> (u64, Vec) { + let ecx_file = match self.ecx_file.as_ref() { + Some(f) => f, + None => { + return ( + 0, + vec![format!( + "no ECX file associated with EC volume {}", + self.volume_id.0 + )], + ) + } + }; + + if self.ecx_file_size == 0 { + return ( + 0, + vec![format!( + "zero-size ECX file for EC volume {}", + self.volume_id.0 + )], + ); + } + + let entry_count = self.ecx_file_size as usize / NEEDLE_MAP_ENTRY_SIZE; + let mut entries: Vec<(usize, NeedleId, i64, Size)> = Vec::with_capacity(entry_count); + let mut errs: Vec = Vec::new(); + let mut entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + + // Walk all entries + for i in 0..entry_count { + let file_offset = (i * NEEDLE_MAP_ENTRY_SIZE) as u64; + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + if let Err(e) = ecx_file.read_exact_at(&mut entry_buf, file_offset) { + errs.push(format!("read ecx entry {}: {}", i, e)); + continue; + } + } + let (key, offset, size) = idx_entry_from_bytes(&entry_buf); + entries.push((i, key, offset.to_actual_offset(), size)); + } + + // Sort by offset, then size + entries.sort_by(|a, b| a.2.cmp(&b.2).then(a.3 .0.cmp(&b.3 .0))); + + // Check for overlapping needles + for i in 1..entries.len() { + let (idx, id, offset, size) = entries[i]; + let (_, last_id, last_offset, last_size) = entries[i - 1]; + + let actual_size = + crate::storage::needle::needle::get_actual_size(size, self.version); + let end = if actual_size != 0 { + offset + actual_size - 1 + } else { + offset + }; + + let last_actual_size = + crate::storage::needle::needle::get_actual_size(last_size, self.version); + let last_end = if last_actual_size != 0 { + last_offset + last_actual_size - 1 + } else { + last_offset + }; + + if offset <= last_end { + errs.push(format!( + "needle {} (#{}) at [{}-{}] overlaps needle {} at [{}-{}]", + id.0, + idx + 1, + offset, + end, + last_id.0, + last_offset, + last_end + )); + } + } + + // Verify file size matches entry count + let expected_size = entry_count as i64 * NEEDLE_MAP_ENTRY_SIZE as i64; + if expected_size != self.ecx_file_size { + errs.push(format!( + "expected an index file of size {}, got {}", + expected_size, self.ecx_file_size + )); + } + + (entries.len() as u64, errs) + } + + // ---- Deletion ---- + + /// Mark a needle as deleted in the .ecx file in-place. + /// Matches Go's MarkNeedleDeleted: binary search the .ecx, then overwrite + /// the size field with TOMBSTONE_FILE_SIZE. + fn mark_needle_deleted_in_ecx(&self, needle_id: NeedleId) -> io::Result { + let ecx_file = match self.ecx_file.as_ref() { + Some(f) => f, + None => return Ok(false), + }; + + let entry_count = self.ecx_file_size as usize / NEEDLE_MAP_ENTRY_SIZE; + if entry_count == 0 { + return Ok(false); + } + + // Binary search for the needle + let mut lo: usize = 0; + let mut hi: usize = entry_count; + let mut entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + + while lo < hi { + let mid = lo + (hi - lo) / 2; + let file_offset = (mid * NEEDLE_MAP_ENTRY_SIZE) as u64; + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + ecx_file.read_exact_at(&mut entry_buf, file_offset)?; + } + + let (key, _offset, _size) = idx_entry_from_bytes(&entry_buf); + if key == needle_id { + // Found — overwrite the size field with TOMBSTONE_FILE_SIZE + let size_offset = file_offset + NEEDLE_ID_SIZE as u64 + OFFSET_SIZE as u64; + let mut size_buf = [0u8; SIZE_SIZE]; + TOMBSTONE_FILE_SIZE.to_bytes(&mut size_buf); + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + ecx_file.write_all_at(&size_buf, size_offset)?; + } + return Ok(true); + } else if key < needle_id { + lo = mid + 1; + } else { + hi = mid; + } + } + + Ok(false) // not found + } + + /// Replay .ecj journal entries into .ecx on startup. + /// Matches Go's RebuildEcxFile: for each needle ID in .ecj, marks it + /// deleted in .ecx, then removes the .ecj file. + fn rebuild_ecx_from_journal(&mut self) -> io::Result<()> { + let ecj_path = self.ecj_file_name(); + if !std::path::Path::new(&ecj_path).exists() { + return Ok(()); + } + + let data = fs::read(&ecj_path)?; + if data.is_empty() { + return Ok(()); + } + + let count = data.len() / NEEDLE_ID_SIZE; + for i in 0..count { + let start = i * NEEDLE_ID_SIZE; + if start + NEEDLE_ID_SIZE > data.len() { + break; + } + let needle_id = NeedleId::from_bytes(&data[start..start + NEEDLE_ID_SIZE]); + // Errors for individual entries are non-fatal (needle may not exist in .ecx) + let _ = self.mark_needle_deleted_in_ecx(needle_id); + } + + // Remove the .ecj file after replay (matches Go) + let _ = fs::remove_file(&ecj_path); + + // Re-create .ecj for future deletions + let ecj_file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .append(true) + .open(&ecj_path)?; + self.ecj_file = Some(ecj_file); + + Ok(()) + } + + // ---- Deletion journal ---- + + /// Append a deleted needle ID to the .ecj journal and mark in .ecx. + /// Matches Go's DeleteNeedleFromEcx: marks in .ecx first, then journals. + pub fn journal_delete(&mut self, needle_id: NeedleId) -> io::Result<()> { + // Mark deleted in .ecx in-place (matches Go's MarkNeedleDeleted) + let _ = self.mark_needle_deleted_in_ecx(needle_id); + let ecj_file = self + .ecj_file + .as_mut() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "ecj file not open"))?; + + let mut buf = [0u8; NEEDLE_ID_SIZE]; + needle_id.to_bytes(&mut buf); + ecj_file.write_all(&buf)?; + ecj_file.sync_all()?; + Ok(()) + } + + /// Append a deleted needle ID to the .ecj journal, validating the cookie first. + /// Matches Go's DeleteEcShardNeedle which validates cookie before journaling. + /// A cookie of 0 means skip cookie check (e.g., orphan cleanup). + pub fn journal_delete_with_cookie( + &mut self, + needle_id: NeedleId, + cookie: crate::storage::types::Cookie, + ) -> io::Result<()> { + // cookie == 0 indicates SkipCookieCheck was requested + if cookie.0 != 0 { + // Try to read the needle's cookie from the EC shards to validate + // Look up the needle in ecx index to find its offset, then read header from shard + if let Ok(Some((offset, size))) = self.find_needle_from_ecx(needle_id) { + if !size.is_deleted() && !offset.is_zero() { + let actual_offset = offset.to_actual_offset() as u64; + // Determine which shard contains this offset and read the cookie + let shard_size = self + .shards + .iter() + .filter_map(|s| s.as_ref()) + .map(|s| s.file_size()) + .next() + .unwrap_or(0) as u64; + if shard_size > 0 { + let shard_id = (actual_offset / shard_size) as usize; + let shard_offset = actual_offset % shard_size; + if let Some(Some(shard)) = self.shards.get(shard_id) { + let mut header_buf = [0u8; 4]; // cookie is first 4 bytes of needle + if shard.read_at(&mut header_buf, shard_offset).is_ok() { + let needle_cookie = + crate::storage::types::Cookie(u32::from_be_bytes(header_buf)); + if needle_cookie != cookie { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unexpected cookie {:x}", cookie.0), + )); + } + } + } + } + } + } + } + self.journal_delete(needle_id) + } + + /// Read all deleted needle IDs from the .ecj journal. + pub fn read_deleted_needles(&self) -> io::Result> { + let ecj_path = self.ecj_file_name(); + if !std::path::Path::new(&ecj_path).exists() { + return Ok(Vec::new()); + } + + let data = fs::read(&ecj_path)?; + let count = data.len() / NEEDLE_ID_SIZE; + let mut needles = Vec::with_capacity(count); + for i in 0..count { + let start = i * NEEDLE_ID_SIZE; + let id = NeedleId::from_bytes(&data[start..start + NEEDLE_ID_SIZE]); + needles.push(id); + } + Ok(needles) + } + + // ---- Lifecycle ---- + + pub fn close(&mut self) { + for shard in &mut self.shards { + if let Some(s) = shard { + s.close(); + } + *shard = None; + } + // Sync .ecx before closing to flush in-place deletion marks (matches Go's ev.ecxFile.Sync()) + if let Some(ref ecx_file) = self.ecx_file { + let _ = ecx_file.sync_all(); + } + self.ecx_file = None; + self.ecj_file = None; + } + + pub fn destroy(&mut self) { + for shard in &mut self.shards { + if let Some(s) = shard { + s.destroy(); + } + *shard = None; + } + // Remove .ecx/.ecj/.vif from ecx_actual_dir (where they were found) + // Go's Destroy() removes .ecx, .ecj, and .vif files. + let actual_base = crate::storage::volume::volume_file_name( + &self.ecx_actual_dir, + &self.collection, + self.volume_id, + ); + let _ = fs::remove_file(format!("{}.ecx", actual_base)); + let _ = fs::remove_file(format!("{}.ecj", actual_base)); + let _ = fs::remove_file(format!("{}.vif", actual_base)); + // Also try the configured idx dir and data dir in case files exist in either + if self.ecx_actual_dir != self.dir_idx { + let _ = fs::remove_file(self.ecx_file_name()); + let _ = fs::remove_file(self.ecj_file_name()); + let idx_base = crate::storage::volume::volume_file_name( + &self.dir_idx, + &self.collection, + self.volume_id, + ); + let _ = fs::remove_file(format!("{}.vif", idx_base)); + } + if self.ecx_actual_dir != self.dir && self.dir_idx != self.dir { + let data_base = crate::storage::volume::volume_file_name( + &self.dir, + &self.collection, + self.volume_id, + ); + let _ = fs::remove_file(format!("{}.ecx", data_base)); + let _ = fs::remove_file(format!("{}.ecj", data_base)); + let _ = fs::remove_file(format!("{}.vif", data_base)); + } + self.ecx_file = None; + self.ecj_file = None; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + fn write_ecx_file( + dir: &str, + collection: &str, + vid: VolumeId, + entries: &[(NeedleId, Offset, Size)], + ) { + let base = crate::storage::volume::volume_file_name(dir, collection, vid); + let ecx_path = format!("{}.ecx", base); + let mut file = File::create(&ecx_path).unwrap(); + + // Write sorted entries + for &(key, offset, size) in entries { + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_entry_to_bytes(&mut buf, key, offset, size); + file.write_all(&buf).unwrap(); + } + } + + #[test] + fn test_ec_volume_find_needle() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Write sorted ecx entries + let entries = vec![ + (NeedleId(1), Offset::from_actual_offset(8), Size(100)), + (NeedleId(5), Offset::from_actual_offset(200), Size(200)), + (NeedleId(10), Offset::from_actual_offset(500), Size(300)), + ]; + write_ecx_file(dir, "", VolumeId(1), &entries); + + let vol = EcVolume::new(dir, dir, "", VolumeId(1)).unwrap(); + + // Found + let result = vol.find_needle_from_ecx(NeedleId(5)).unwrap(); + assert!(result.is_some()); + let (offset, size) = result.unwrap(); + assert_eq!(offset.to_actual_offset(), 200); + assert_eq!(size, Size(200)); + + // Not found + let result = vol.find_needle_from_ecx(NeedleId(7)).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_ec_volume_journal() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Need ecx file for EcVolume::new to succeed + write_ecx_file(dir, "", VolumeId(1), &[]); + + let mut vol = EcVolume::new(dir, dir, "", VolumeId(1)).unwrap(); + + vol.journal_delete(NeedleId(10)).unwrap(); + vol.journal_delete(NeedleId(20)).unwrap(); + + let deleted = vol.read_deleted_needles().unwrap(); + assert_eq!(deleted, vec![NeedleId(10), NeedleId(20)]); + } + + #[test] + fn test_ec_volume_shard_bits() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + write_ecx_file(dir, "", VolumeId(1), &[]); + + let mut vol = EcVolume::new(dir, dir, "", VolumeId(1)).unwrap(); + assert_eq!(vol.shard_count(), 0); + + // Create a shard file so we can add it + let mut shard = EcVolumeShard::new(dir, "", VolumeId(1), 3); + shard.create().unwrap(); + shard.write_all(&[0u8; 100]).unwrap(); + shard.close(); + + vol.add_shard(EcVolumeShard::new(dir, "", VolumeId(1), 3)) + .unwrap(); + assert_eq!(vol.shard_count(), 1); + assert!(vol.shard_bits().has_shard_id(3)); + } + + #[test] + fn test_ec_volume_uses_collection_prefixed_vif_config() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + write_ecx_file(dir, "pics", VolumeId(1), &[]); + + let vif = crate::storage::volume::VifVolumeInfo { + ec_shard_config: Some(crate::storage::volume::VifEcShardConfig { + data_shards: 6, + parity_shards: 3, + }), + ..Default::default() + }; + let base = crate::storage::volume::volume_file_name(dir, "pics", VolumeId(1)); + std::fs::write( + format!("{}.vif", base), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + + let vol = EcVolume::new(dir, dir, "pics", VolumeId(1)).unwrap(); + assert_eq!(vol.data_shards, 6); + assert_eq!(vol.parity_shards, 3); + } + + #[test] + fn test_ec_volume_invalid_vif_config_falls_back_to_defaults() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + write_ecx_file(dir, "pics", VolumeId(1), &[]); + + let vif = crate::storage::volume::VifVolumeInfo { + ec_shard_config: Some(crate::storage::volume::VifEcShardConfig { + data_shards: 10, + parity_shards: 10, + }), + ..Default::default() + }; + let base = crate::storage::volume::volume_file_name(dir, "pics", VolumeId(1)); + std::fs::write( + format!("{}.vif", base), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + + let vol = EcVolume::new(dir, dir, "pics", VolumeId(1)).unwrap(); + assert_eq!(vol.data_shards, DATA_SHARDS_COUNT as u32); + assert_eq!(vol.parity_shards, PARITY_SHARDS_COUNT as u32); + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/mod.rs b/seaweed-volume/src/storage/erasure_coding/mod.rs new file mode 100644 index 000000000..b6c07b450 --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/mod.rs @@ -0,0 +1,16 @@ +//! Erasure coding module for volume data protection. +//! +//! Encodes a volume's .dat file into 10 data + 4 parity shards using +//! Reed-Solomon erasure coding. Can reconstruct from any 10 of 14 shards. + +pub mod ec_decoder; +pub mod ec_encoder; +pub mod ec_locate; +pub mod ec_shard; +pub mod ec_volume; + +pub use ec_shard::{ + EcVolumeShard, ShardId, DATA_SHARDS_COUNT, MAX_SHARD_COUNT, MIN_TOTAL_DISKS, + PARITY_SHARDS_COUNT, TOTAL_SHARDS_COUNT, +}; +pub use ec_volume::EcVolume; diff --git a/seaweed-volume/src/storage/idx/mod.rs b/seaweed-volume/src/storage/idx/mod.rs new file mode 100644 index 000000000..f8d556739 --- /dev/null +++ b/seaweed-volume/src/storage/idx/mod.rs @@ -0,0 +1,116 @@ +//! Index file (.idx) format: sequential 17-byte entries. +//! +//! Each entry: NeedleId(8) + Offset(5) + Size(4) = 17 bytes. + +use crate::storage::types::*; +use std::io::{self, Read, Seek, SeekFrom}; + +const ROWS_TO_READ: usize = 1024; + +/// Walk all entries in an .idx file, calling `f` for each. +/// Mirrors Go's `WalkIndexFile()`. +pub fn walk_index_file(reader: &mut R, start_from: u64, mut f: F) -> io::Result<()> +where + R: Read + Seek, + F: FnMut(NeedleId, Offset, Size) -> io::Result<()>, +{ + let reader_offset = start_from * NEEDLE_MAP_ENTRY_SIZE as u64; + reader.seek(SeekFrom::Start(reader_offset))?; + + let mut buf = vec![0u8; NEEDLE_MAP_ENTRY_SIZE * ROWS_TO_READ]; + + loop { + let count = match reader.read(&mut buf) { + Ok(0) => return Ok(()), + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(()), + Err(e) => return Err(e), + }; + + let mut i = 0; + while i + NEEDLE_MAP_ENTRY_SIZE <= count { + let (key, offset, size) = idx_entry_from_bytes(&buf[i..i + NEEDLE_MAP_ENTRY_SIZE]); + f(key, offset, size)?; + i += NEEDLE_MAP_ENTRY_SIZE; + } + } +} + +/// Write a single index entry to a writer. +pub fn write_index_entry( + writer: &mut W, + key: NeedleId, + offset: Offset, + size: Size, +) -> io::Result<()> { + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_entry_to_bytes(&mut buf, key, offset, size); + writer.write_all(&buf) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + #[test] + fn test_walk_index_file() { + // Create a small index with 3 entries + let mut data = Vec::new(); + let entries = vec![ + (NeedleId(1), Offset::from_actual_offset(0), Size(100)), + (NeedleId(2), Offset::from_actual_offset(128), Size(200)), + (NeedleId(3), Offset::from_actual_offset(384), Size(300)), + ]; + for (key, offset, size) in &entries { + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_entry_to_bytes(&mut buf, *key, *offset, *size); + data.extend_from_slice(&buf); + } + + let mut cursor = Cursor::new(data); + let mut collected = Vec::new(); + walk_index_file(&mut cursor, 0, |key, offset, size| { + collected.push((key, offset.to_actual_offset(), size)); + Ok(()) + }) + .unwrap(); + + assert_eq!(collected.len(), 3); + assert_eq!(collected[0].0, NeedleId(1)); + assert_eq!(collected[0].1, 0); + assert_eq!(collected[0].2, Size(100)); + assert_eq!(collected[1].0, NeedleId(2)); + assert_eq!(collected[2].0, NeedleId(3)); + } + + #[test] + fn test_walk_empty() { + let mut cursor = Cursor::new(Vec::new()); + let mut count = 0; + walk_index_file(&mut cursor, 0, |_, _, _| { + count += 1; + Ok(()) + }) + .unwrap(); + assert_eq!(count, 0); + } + + #[test] + fn test_write_index_entry() { + let mut buf = Vec::new(); + write_index_entry( + &mut buf, + NeedleId(42), + Offset::from_actual_offset(8 * 10), + Size(512), + ) + .unwrap(); + assert_eq!(buf.len(), NEEDLE_MAP_ENTRY_SIZE); + + let (key, offset, size) = idx_entry_from_bytes(&buf); + assert_eq!(key, NeedleId(42)); + assert_eq!(offset.to_actual_offset(), 80); + assert_eq!(size, Size(512)); + } +} diff --git a/seaweed-volume/src/storage/mod.rs b/seaweed-volume/src/storage/mod.rs new file mode 100644 index 000000000..2507c7511 --- /dev/null +++ b/seaweed-volume/src/storage/mod.rs @@ -0,0 +1,9 @@ +pub mod disk_location; +pub mod erasure_coding; +pub mod idx; +pub mod needle; +pub mod needle_map; +pub mod store; +pub mod super_block; +pub mod types; +pub mod volume; diff --git a/seaweed-volume/src/storage/needle/crc.rs b/seaweed-volume/src/storage/needle/crc.rs new file mode 100644 index 000000000..6225c8495 --- /dev/null +++ b/seaweed-volume/src/storage/needle/crc.rs @@ -0,0 +1,73 @@ +//! CRC32-Castagnoli checksum for needle data integrity. +//! +//! Matches Go's `crc32.MakeTable(crc32.Castagnoli)` exactly. +//! The CRC is stored as raw u32 (not the `.Value()` legacy transform). + +/// CRC32-Castagnoli checksum wrapper. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub struct CRC(pub u32); + +impl CRC { + /// Compute CRC from a byte slice (starting from 0). + pub fn new(data: &[u8]) -> Self { + CRC(0).update(data) + } + + /// Update the CRC with additional bytes. + pub fn update(self, data: &[u8]) -> Self { + CRC(crc32c::crc32c_append(self.0, data)) + } + + /// Legacy `.Value()` function — deprecated in Go but needed for backward compat check. + /// Formula: (crc >> 15 | crc << 17) + 0xa282ead8 + pub fn legacy_value(&self) -> u32 { + (self.0 >> 15 | self.0 << 17).wrapping_add(0xa282ead8) + } +} + +impl From for CRC { + fn from(v: u32) -> Self { + CRC(v) + } +} + +impl From for u32 { + fn from(c: CRC) -> Self { + c.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_crc_empty() { + let crc = CRC::new(&[]); + assert_eq!(crc.0, 0); + } + + #[test] + fn test_crc_known_value() { + // CRC32-C of "hello" — verify it produces a non-zero deterministic value + let crc = CRC::new(b"hello"); + assert_ne!(crc.0, 0); + // Same input produces same output + assert_eq!(crc, CRC::new(b"hello")); + } + + #[test] + fn test_crc_incremental() { + let crc1 = CRC::new(b"hello world"); + let crc2 = CRC::new(b"hello").update(b" world"); + assert_eq!(crc1, crc2); + } + + #[test] + fn test_crc_legacy_value() { + let crc = CRC(0x12345678); + let v = crc.legacy_value(); + let expected = (0x12345678u32 >> 15 | 0x12345678u32 << 17).wrapping_add(0xa282ead8); + assert_eq!(v, expected); + } +} diff --git a/seaweed-volume/src/storage/needle/mod.rs b/seaweed-volume/src/storage/needle/mod.rs new file mode 100644 index 000000000..364c6122a --- /dev/null +++ b/seaweed-volume/src/storage/needle/mod.rs @@ -0,0 +1,7 @@ +pub mod crc; +pub mod needle; +pub mod ttl; + +pub use crc::CRC; +pub use needle::Needle; +pub use ttl::TTL; diff --git a/seaweed-volume/src/storage/needle/needle.rs b/seaweed-volume/src/storage/needle/needle.rs new file mode 100644 index 000000000..bbc55c9d0 --- /dev/null +++ b/seaweed-volume/src/storage/needle/needle.rs @@ -0,0 +1,944 @@ +//! Needle: the individual file object stored in a volume. +//! +//! Binary format (Version 2/3): +//! Header (16 bytes): Cookie(4) + NeedleId(8) + Size(4) +//! Body (Size bytes): +//! DataSize(4) + Data(DataSize) + Flags(1) +//! [if HasName]: NameSize(1) + Name(NameSize) +//! [if HasMime]: MimeSize(1) + Mime(MimeSize) +//! [if HasLastMod]: LastModified(5) +//! [if HasTtl]: TTL(2) +//! [if HasPairs]: PairsSize(2) + Pairs(PairsSize) +//! Tail: +//! Checksum(4) + [if V3: AppendAtNs(8)] + Padding(0-7) + +use super::crc::CRC; +use super::ttl::TTL; +use crate::storage::types::*; + +// Flag bits (matching Go constants) +pub const FLAG_IS_COMPRESSED: u8 = 0x01; +pub const FLAG_HAS_NAME: u8 = 0x02; +pub const FLAG_HAS_MIME: u8 = 0x04; +pub const FLAG_HAS_LAST_MODIFIED_DATE: u8 = 0x08; +pub const FLAG_HAS_TTL: u8 = 0x10; +pub const FLAG_HAS_PAIRS: u8 = 0x20; +pub const FLAG_IS_CHUNK_MANIFEST: u8 = 0x80; + +pub const LAST_MODIFIED_BYTES_LENGTH: usize = 5; +pub const TTL_BYTES_LENGTH: usize = 2; + +#[derive(Debug, Clone, Default)] +pub struct Needle { + pub cookie: Cookie, + pub id: NeedleId, + pub size: Size, // sum of body content fields + + // Version 2+ fields + pub data_size: u32, + pub data: Vec, + pub flags: u8, + pub name_size: u8, + pub name: Vec, // max 255 bytes + pub mime_size: u8, + pub mime: Vec, // max 255 bytes + pub pairs_size: u16, + pub pairs: Vec, // max 64KB, JSON + pub last_modified: u64, // stored as 5 bytes on disk + pub ttl: Option, + + // Tail fields + pub checksum: CRC, + pub append_at_ns: u64, // Version 3 only + pub padding: Vec, +} + +impl Needle { + // ---- Flag accessors (matching Go) ---- + + pub fn is_compressed(&self) -> bool { + self.flags & FLAG_IS_COMPRESSED != 0 + } + pub fn set_is_compressed(&mut self) { + self.flags |= FLAG_IS_COMPRESSED; + } + + pub fn has_name(&self) -> bool { + self.flags & FLAG_HAS_NAME != 0 + } + pub fn set_has_name(&mut self) { + self.flags |= FLAG_HAS_NAME; + } + + pub fn has_mime(&self) -> bool { + self.flags & FLAG_HAS_MIME != 0 + } + pub fn set_has_mime(&mut self) { + self.flags |= FLAG_HAS_MIME; + } + + pub fn has_last_modified_date(&self) -> bool { + self.flags & FLAG_HAS_LAST_MODIFIED_DATE != 0 + } + pub fn set_has_last_modified_date(&mut self) { + self.flags |= FLAG_HAS_LAST_MODIFIED_DATE; + } + + pub fn has_ttl(&self) -> bool { + self.flags & FLAG_HAS_TTL != 0 + } + pub fn set_has_ttl(&mut self) { + self.flags |= FLAG_HAS_TTL; + } + + pub fn has_pairs(&self) -> bool { + self.flags & FLAG_HAS_PAIRS != 0 + } + pub fn set_has_pairs(&mut self) { + self.flags |= FLAG_HAS_PAIRS; + } + + pub fn is_chunk_manifest(&self) -> bool { + self.flags & FLAG_IS_CHUNK_MANIFEST != 0 + } + pub fn set_is_chunk_manifest(&mut self) { + self.flags |= FLAG_IS_CHUNK_MANIFEST; + } + + // ---- Header parsing ---- + + /// Parse the 16-byte needle header. + pub fn parse_header(bytes: &[u8]) -> (Cookie, NeedleId, Size) { + assert!(bytes.len() >= NEEDLE_HEADER_SIZE); + let cookie = Cookie::from_bytes(&bytes[0..COOKIE_SIZE]); + let id = NeedleId::from_bytes(&bytes[COOKIE_SIZE..COOKIE_SIZE + NEEDLE_ID_SIZE]); + let size = Size::from_bytes(&bytes[COOKIE_SIZE + NEEDLE_ID_SIZE..NEEDLE_HEADER_SIZE]); + (cookie, id, size) + } + + /// Parse needle header into self. + pub fn read_header(&mut self, bytes: &[u8]) { + let (cookie, id, size) = Self::parse_header(bytes); + self.cookie = cookie; + self.id = id; + self.size = size; + } + + // ---- Body reading (Version 2/3) ---- + + /// Read version 2/3 body metadata only — skips copying the data payload. + /// Sets `data_size` and all metadata fields but leaves `data` empty. + pub fn read_body_v2_meta_only(&mut self, bytes: &[u8]) -> Result<(), NeedleError> { + let len_bytes = bytes.len(); + let mut index = 0; + + // DataSize (4 bytes) + if index + 4 > len_bytes { + return Err(NeedleError::IndexOutOfRange(1)); + } + self.data_size = u32::from_be_bytes([ + bytes[index], + bytes[index + 1], + bytes[index + 2], + bytes[index + 3], + ]); + index += 4; + + // Skip data bytes (do NOT copy them) + if index + self.data_size as usize > len_bytes { + return Err(NeedleError::IndexOutOfRange(1)); + } + index += self.data_size as usize; + + // Read non-data metadata + self.read_body_v2_non_data(&bytes[index..])?; + Ok(()) + } + + /// Read full needle from bytes but skip copying the data payload. + /// Sets all metadata fields, checksum, etc. but leaves `data` empty. + pub fn read_bytes_meta_only( + &mut self, + bytes: &[u8], + offset: i64, + expected_size: Size, + version: Version, + ) -> Result<(), NeedleError> { + self.read_header(bytes); + + if self.size != expected_size { + return Err(NeedleError::SizeMismatch { + offset, + id: self.id, + found: self.size, + expected: expected_size, + }); + } + + let body_start = NEEDLE_HEADER_SIZE; + let body_end = body_start + self.size.0 as usize; + + if version == VERSION_1 { + // V1 has no metadata — data is the entire body + self.data_size = self.size.0 as u32; + } else if self.size.0 == 0 { + // Tombstones have no DataSize/body section; metadata starts at the tail. + self.data_size = 0; + } else { + self.read_body_v2_meta_only(&bytes[body_start..body_end])?; + } + + // Read tail but skip CRC validation (no data to check against) + self.read_tail_meta_only(&bytes[body_end..], version)?; + Ok(()) + } + + /// Paged meta-only parse: accepts the 20-byte header+DataSize prefix and the + /// meta tail bytes (everything after the data payload). This avoids reading + /// the data payload from disk at all, matching Go's `ReadNeedleMeta`. + pub fn read_paged_meta( + &mut self, + header_bytes: &[u8], // first 20 bytes: NEEDLE_HEADER_SIZE + DATA_SIZE_SIZE + meta_bytes: &[u8], // tail: non-data body metadata + checksum + timestamp + padding + offset: i64, + expected_size: Size, + version: Version, + ) -> Result<(), NeedleError> { + // Parse the 16-byte header + self.read_header(header_bytes); + + if self.size != expected_size { + return Err(NeedleError::SizeMismatch { + offset, + id: self.id, + found: self.size, + expected: expected_size, + }); + } + + if version == VERSION_1 { + self.data_size = self.size.0 as u32; + } else if self.size.0 == 0 { + // Tombstone + self.data_size = 0; + } else { + // Extract DataSize from bytes 16..20 + self.data_size = u32::from_be_bytes([ + header_bytes[NEEDLE_HEADER_SIZE], + header_bytes[NEEDLE_HEADER_SIZE + 1], + header_bytes[NEEDLE_HEADER_SIZE + 2], + header_bytes[NEEDLE_HEADER_SIZE + 3], + ]); + + // meta_bytes starts with the non-data body metadata (flags, name, mime, etc.) + // followed by the tail (checksum + timestamp + padding). + // readNeedleDataVersion2NonData returns the index where it stopped. + let index = self.read_body_v2_non_data(meta_bytes)?; + self.read_tail_meta_only(&meta_bytes[index..], version)?; + return Ok(()); + } + + // For VERSION_1 or tombstones, meta_bytes IS the tail + self.read_tail_meta_only(meta_bytes, version)?; + Ok(()) + } + + /// Read tail without CRC validation (used when data was not read). + fn read_tail_meta_only( + &mut self, + tail_bytes: &[u8], + version: Version, + ) -> Result<(), NeedleError> { + if tail_bytes.len() < NEEDLE_CHECKSUM_SIZE { + return Err(NeedleError::TailTooShort); + } + + self.checksum = CRC(u32::from_be_bytes([ + tail_bytes[0], + tail_bytes[1], + tail_bytes[2], + tail_bytes[3], + ])); + + if version == VERSION_3 { + let ts_offset = NEEDLE_CHECKSUM_SIZE; + if tail_bytes.len() < ts_offset + TIMESTAMP_SIZE { + return Err(NeedleError::TailTooShort); + } + self.append_at_ns = u64::from_be_bytes([ + tail_bytes[ts_offset], + tail_bytes[ts_offset + 1], + tail_bytes[ts_offset + 2], + tail_bytes[ts_offset + 3], + tail_bytes[ts_offset + 4], + tail_bytes[ts_offset + 5], + tail_bytes[ts_offset + 6], + tail_bytes[ts_offset + 7], + ]); + } + + Ok(()) + } + + /// Read the version 2/3 body data from bytes (size bytes starting after header). + /// Returns IndexOutOfRange errors for truncated data (matching Go's readNeedleDataVersion2). + pub fn read_body_v2(&mut self, bytes: &[u8]) -> Result<(), NeedleError> { + let len_bytes = bytes.len(); + let mut index = 0; + + // DataSize (4 bytes) + if index + 4 > len_bytes { + return Ok(()); // tolerate EOF + } + self.data_size = u32::from_be_bytes([ + bytes[index], + bytes[index + 1], + bytes[index + 2], + bytes[index + 3], + ]); + index += 4; + + // Data + if index + self.data_size as usize > len_bytes { + return Err(NeedleError::IndexOutOfRange(1)); + } + self.data = bytes[index..index + self.data_size as usize].to_vec(); + index += self.data_size as usize; + + // Read non-data metadata + self.read_body_v2_non_data(&bytes[index..])?; + Ok(()) + } + + /// Read version 2/3 metadata fields (everything after Data). + /// Returns IndexOutOfRange errors for truncated data (matching Go's readNeedleDataVersion2). + fn read_body_v2_non_data(&mut self, bytes: &[u8]) -> Result { + let len_bytes = bytes.len(); + let mut index = 0; + + // Flags (1 byte) + if index < len_bytes { + self.flags = bytes[index]; + index += 1; + } else { + return Ok(index); + } + + // Name + if index < len_bytes && self.has_name() { + self.name_size = bytes[index]; + index += 1; + if index + self.name_size as usize > len_bytes { + return Err(NeedleError::IndexOutOfRange(2)); + } + self.name = bytes[index..index + self.name_size as usize].to_vec(); + index += self.name_size as usize; + } + + // Mime + if index < len_bytes && self.has_mime() { + self.mime_size = bytes[index]; + index += 1; + if index + self.mime_size as usize > len_bytes { + return Err(NeedleError::IndexOutOfRange(3)); + } + self.mime = bytes[index..index + self.mime_size as usize].to_vec(); + index += self.mime_size as usize; + } + + // LastModified (5 bytes) + if index < len_bytes && self.has_last_modified_date() { + if index + LAST_MODIFIED_BYTES_LENGTH > len_bytes { + return Err(NeedleError::IndexOutOfRange(4)); + } + self.last_modified = bytes_to_u64_5(&bytes[index..index + LAST_MODIFIED_BYTES_LENGTH]); + index += LAST_MODIFIED_BYTES_LENGTH; + } + + // TTL (2 bytes) + if index < len_bytes && self.has_ttl() { + if index + TTL_BYTES_LENGTH > len_bytes { + return Err(NeedleError::IndexOutOfRange(5)); + } + self.ttl = Some(TTL::from_bytes(&bytes[index..index + TTL_BYTES_LENGTH])); + index += TTL_BYTES_LENGTH; + } + + // Pairs + if index < len_bytes && self.has_pairs() { + if index + 2 > len_bytes { + return Err(NeedleError::IndexOutOfRange(6)); + } + self.pairs_size = u16::from_be_bytes([bytes[index], bytes[index + 1]]); + index += 2; + if index + self.pairs_size as usize > len_bytes { + return Err(NeedleError::IndexOutOfRange(7)); + } + self.pairs = bytes[index..index + self.pairs_size as usize].to_vec(); + index += self.pairs_size as usize; + } + + Ok(index) + } + + // ---- Tail reading ---- + + /// Read the needle tail (checksum + optional timestamp + padding). + pub fn read_tail(&mut self, tail_bytes: &[u8], version: Version) -> Result<(), NeedleError> { + if tail_bytes.len() < NEEDLE_CHECKSUM_SIZE { + return Err(NeedleError::TailTooShort); + } + + let expected_checksum = CRC(u32::from_be_bytes([ + tail_bytes[0], + tail_bytes[1], + tail_bytes[2], + tail_bytes[3], + ])); + + if !self.data.is_empty() { + let data_checksum = CRC::new(&self.data); + // Go double-checks: n.Checksum != crc && uint32(n.Checksum) != crc.Value() + // The crc.Value() path is a deprecated legacy transform for backward compat + // with seaweed versions prior to commit 056c480eb. + if expected_checksum != data_checksum + && expected_checksum.0 != data_checksum.legacy_value() + { + return Err(NeedleError::CrcMismatch { + needle_id: self.id, + got: data_checksum.0, + want: expected_checksum.0, + }); + } + self.checksum = data_checksum; + } else { + self.checksum = expected_checksum; + } + + if version == VERSION_3 { + let ts_offset = NEEDLE_CHECKSUM_SIZE; + if tail_bytes.len() < ts_offset + TIMESTAMP_SIZE { + return Err(NeedleError::TailTooShort); + } + self.append_at_ns = u64::from_be_bytes([ + tail_bytes[ts_offset], + tail_bytes[ts_offset + 1], + tail_bytes[ts_offset + 2], + tail_bytes[ts_offset + 3], + tail_bytes[ts_offset + 4], + tail_bytes[ts_offset + 5], + tail_bytes[ts_offset + 6], + tail_bytes[ts_offset + 7], + ]); + } + + Ok(()) + } + + // ---- Full read from bytes ---- + + /// Read a complete needle from its raw bytes (header + body + tail). + pub fn read_bytes( + &mut self, + bytes: &[u8], + offset: i64, + expected_size: Size, + version: Version, + ) -> Result<(), NeedleError> { + self.read_header(bytes); + + if self.size != expected_size { + return Err(NeedleError::SizeMismatch { + offset, + id: self.id, + found: self.size, + expected: expected_size, + }); + } + + let body_start = NEEDLE_HEADER_SIZE; + let body_end = body_start + self.size.0 as usize; + + if version == VERSION_1 { + self.data = bytes[body_start..body_end].to_vec(); + } else { + self.read_body_v2(&bytes[body_start..body_end])?; + } + + self.read_tail(&bytes[body_end..], version)?; + Ok(()) + } + + // ---- Write (serialize) ---- + + /// Serialize the needle to bytes for writing to a .dat file (Version 2/3). + pub fn write_bytes(&mut self, version: Version) -> Vec { + let mut buf = Vec::with_capacity(256); + + // Compute sizes (matching Go writeNeedleCommon) + if self.name.len() >= 255 { + self.name_size = 255; + } else { + self.name_size = self.name.len() as u8; + } + self.data_size = self.data.len() as u32; + self.mime_size = self.mime.len() as u8; + + // Compute n.Size (body size, excluding header) + if self.data_size > 0 { + let mut s: i32 = 4 + self.data_size as i32 + 1; // DataSize + Data + Flags + if self.has_name() { + s += 1 + self.name_size as i32; + } + if self.has_mime() { + s += 1 + self.mime_size as i32; + } + if self.has_last_modified_date() { + s += LAST_MODIFIED_BYTES_LENGTH as i32; + } + if self.has_ttl() { + s += TTL_BYTES_LENGTH as i32; + } + if self.has_pairs() { + s += 2 + self.pairs_size as i32; + } + self.size = Size(s); + } else { + self.size = Size(0); + } + + // Header: Cookie(4) + NeedleId(8) + Size(4) = 16 bytes + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + self.cookie.to_bytes(&mut header[0..COOKIE_SIZE]); + self.id + .to_bytes(&mut header[COOKIE_SIZE..COOKIE_SIZE + NEEDLE_ID_SIZE]); + self.size + .to_bytes(&mut header[COOKIE_SIZE + NEEDLE_ID_SIZE..NEEDLE_HEADER_SIZE]); + buf.extend_from_slice(&header); + + // Body + if self.data_size > 0 { + buf.extend_from_slice(&self.data_size.to_be_bytes()); + buf.extend_from_slice(&self.data); + buf.push(self.flags); + if self.has_name() { + buf.push(self.name_size); + buf.extend_from_slice(&self.name[..self.name_size as usize]); + } + if self.has_mime() { + buf.push(self.mime_size); + buf.extend_from_slice(&self.mime); + } + if self.has_last_modified_date() { + // Write 5 bytes of last_modified (lower 5 bytes of u64 big-endian) + let lm_bytes = self.last_modified.to_be_bytes(); + buf.extend_from_slice(&lm_bytes[8 - LAST_MODIFIED_BYTES_LENGTH..8]); + } + if self.has_ttl() { + if let Some(ref ttl) = self.ttl { + let mut ttl_buf = [0u8; 2]; + ttl.to_bytes(&mut ttl_buf); + buf.extend_from_slice(&ttl_buf); + } else { + buf.extend_from_slice(&[0u8; 2]); + } + } + if self.has_pairs() { + buf.extend_from_slice(&self.pairs_size.to_be_bytes()); + buf.extend_from_slice(&self.pairs); + } + } + + // Compute checksum + self.checksum = CRC::new(&self.data); + + // Tail: Checksum + [V3: AppendAtNs] + Padding + buf.extend_from_slice(&self.checksum.0.to_be_bytes()); + if version == VERSION_3 { + buf.extend_from_slice(&self.append_at_ns.to_be_bytes()); + } + + // Padding to 8-byte alignment + let padding = padding_length(self.size, version).0 as usize; + buf.extend(std::iter::repeat(0u8).take(padding)); + + buf + } + + /// Total disk size of this needle including header, body, checksum, timestamp, and padding. + pub fn disk_size(&self, version: Version) -> i64 { + get_actual_size(self.size, version) + } + + /// Compute ETag string from checksum (matching Go). + pub fn etag(&self) -> String { + etag_from_checksum(self.checksum.0) + } +} + +// ============================================================================ +// Helper functions (matching Go) +// ============================================================================ + +/// Compute padding to align needle to NEEDLE_PADDING_SIZE (8 bytes). +pub fn padding_length(needle_size: Size, version: Version) -> Size { + if version == VERSION_3 { + Size( + NEEDLE_PADDING_SIZE as i32 + - ((NEEDLE_HEADER_SIZE as i32 + + needle_size.0 + + NEEDLE_CHECKSUM_SIZE as i32 + + TIMESTAMP_SIZE as i32) + % NEEDLE_PADDING_SIZE as i32), + ) + } else { + Size( + NEEDLE_PADDING_SIZE as i32 + - ((NEEDLE_HEADER_SIZE as i32 + needle_size.0 + NEEDLE_CHECKSUM_SIZE as i32) + % NEEDLE_PADDING_SIZE as i32), + ) + } +} + +/// Body length = Size + Checksum + [Timestamp] + Padding. +pub fn needle_body_length(needle_size: Size, version: Version) -> i64 { + if version == VERSION_3 { + needle_size.0 as i64 + + NEEDLE_CHECKSUM_SIZE as i64 + + TIMESTAMP_SIZE as i64 + + padding_length(needle_size, version).0 as i64 + } else { + needle_size.0 as i64 + + NEEDLE_CHECKSUM_SIZE as i64 + + padding_length(needle_size, version).0 as i64 + } +} + +/// Total actual size on disk: Header + Body. +pub fn get_actual_size(size: Size, version: Version) -> i64 { + NEEDLE_HEADER_SIZE as i64 + needle_body_length(size, version) +} + +/// Read 5 bytes as a u64 (big-endian, zero-padded high bytes). +fn bytes_to_u64_5(bytes: &[u8]) -> u64 { + assert!(bytes.len() >= 5); + // The 5 bytes are the LOWER 5 bytes of a big-endian u64. + // In Go: util.BytesToUint64(bytes[index : index+5]) reads into a uint64 + // Go's BytesToUint64 copies into the LAST 5 bytes of an 8-byte array (big-endian). + let mut buf = [0u8; 8]; + buf[3..8].copy_from_slice(&bytes[..5]); + u64::from_be_bytes(buf) +} + +/// ETag formatted as Go: hex of big-endian u32 bytes. +pub fn etag_from_checksum(checksum: u32) -> String { + let bits = checksum.to_be_bytes(); + format!( + "{:02x}{:02x}{:02x}{:02x}", + bits[0], bits[1], bits[2], bits[3] + ) +} + +// ============================================================================ +// FileId +// ============================================================================ + +/// FileId = VolumeId + NeedleId + Cookie. +/// String format: "," +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FileId { + pub volume_id: VolumeId, + pub key: NeedleId, + pub cookie: Cookie, +} + +impl FileId { + pub fn new(volume_id: VolumeId, key: NeedleId, cookie: Cookie) -> Self { + FileId { + volume_id, + key, + cookie, + } + } + + /// Parse "volume_id,needle_id_cookie" or "volume_id/needle_id_cookie". + pub fn parse(s: &str) -> Result { + let (vid_str, rest) = if let Some(pos) = s.find(',') { + (&s[..pos], &s[pos + 1..]) + } else if let Some(pos) = s.find('/') { + (&s[..pos], &s[pos + 1..]) + } else { + return Err(format!("invalid file id: {}", s)); + }; + + let volume_id = + VolumeId::parse(vid_str).map_err(|e| format!("invalid volume id: {}", e))?; + let (key, cookie) = parse_needle_id_cookie(rest)?; + Ok(FileId { + volume_id, + key, + cookie, + }) + } + + /// Format the needle_id + cookie part as a hex string (stripping leading zeros). + pub fn needle_id_cookie_string(&self) -> String { + format_needle_id_cookie(self.key, self.cookie) + } +} + +impl std::fmt::Display for FileId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{},{}", self.volume_id, self.needle_id_cookie_string()) + } +} + +/// Format NeedleId + Cookie as hex, stripping leading zero bytes from NeedleId only. +/// Matches Go: strips leading zero bytes up to NeedleIdSize (8), so cookie is always present. +fn format_needle_id_cookie(key: NeedleId, cookie: Cookie) -> String { + // Encode 12 bytes: 8 for NeedleId + 4 for Cookie + let mut bytes = [0u8; 12]; + key.to_bytes(&mut bytes[0..8]); + cookie.to_bytes(&mut bytes[8..12]); + + // Strip leading zero bytes, but only within NeedleId portion (first 8 bytes) + let mut nonzero_index = 0; + while nonzero_index < NEEDLE_ID_SIZE && bytes[nonzero_index] == 0 { + nonzero_index += 1; + } + hex::encode(&bytes[nonzero_index..]) +} + +/// Parse "needle_id_cookie_hex" or "needle_id_cookie_hex_delta" into (NeedleId, Cookie). +/// Matches Go's ParsePath + ParseNeedleIdCookie: supports an optional `_delta` suffix +/// where delta is a decimal number added to the NeedleId (used for sub-file addressing). +/// Rejects strings that are too short or too long. +pub fn parse_needle_id_cookie(s: &str) -> Result<(NeedleId, Cookie), String> { + // Go ParsePath: check for "_" suffix containing a decimal delta + let (hex_part, delta) = if let Some(underscore_pos) = s.rfind('_') { + if underscore_pos > 0 { + let delta_str = &s[underscore_pos + 1..]; + let d: u64 = delta_str + .parse() + .map_err(|e| format!("Parse delta error: {}", e))?; + (&s[..underscore_pos], Some(d)) + } else { + (s, None) + } + } else { + (s, None) + }; + + // Go: len(key_hash_string) <= CookieSize*2 => error (must be > 8 hex chars) + if hex_part.len() <= COOKIE_SIZE * 2 { + return Err("KeyHash is too short.".to_string()); + } + // Go: len(key_hash_string) > (NeedleIdSize+CookieSize)*2 => error (must be <= 24 hex chars) + if hex_part.len() > (NEEDLE_ID_SIZE + COOKIE_SIZE) * 2 { + return Err("KeyHash is too long.".to_string()); + } + + // Split: last CookieSize*2 hex chars are cookie, rest is needle id + let split = hex_part.len() - COOKIE_SIZE * 2; + let needle_id_hex = &hex_part[..split]; + let cookie_hex = &hex_part[split..]; + + let needle_id_bytes = hex::decode(needle_id_hex).map_err(|e| format!("Parse needleId error: {}", e))?; + let cookie_bytes = hex::decode(cookie_hex).map_err(|e| format!("Parse cookie error: {}", e))?; + + // Pad needle id to 8 bytes + let mut nid_buf = [0u8; 8]; + if needle_id_bytes.len() > 8 { + return Err(format!("KeyHash is too long.")); + } + let start = 8 - needle_id_bytes.len(); + nid_buf[start..].copy_from_slice(&needle_id_bytes); + + let mut key = NeedleId::from_bytes(&nid_buf[0..8]); + let cookie = Cookie::from_bytes(&cookie_bytes[0..4]); + + // Apply delta if present (Go: n.Id += Uint64ToNeedleId(d)) + if let Some(d) = delta { + key = NeedleId(key.0.wrapping_add(d)); + } + + Ok((key, cookie)) +} + +// ============================================================================ +// Errors +// ============================================================================ + +#[derive(Debug, thiserror::Error)] +pub enum NeedleError { + #[error("size mismatch at offset {offset}: found id={id} size={found:?}, expected size={expected:?}")] + SizeMismatch { + offset: i64, + id: NeedleId, + found: Size, + expected: Size, + }, + + #[error("CRC mismatch for needle {needle_id}: got {got:08x}, want {want:08x}")] + CrcMismatch { + needle_id: NeedleId, + got: u32, + want: u32, + }, + + #[error("index out of range ({0})")] + IndexOutOfRange(u32), + + #[error("needle tail too short")] + TailTooShort, + + #[error("unsupported version: {0}")] + UnsupportedVersion(u8), + + #[error("IO error: {0}")] + Io(#[from] std::io::Error), +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_header() { + let mut buf = [0u8; NEEDLE_HEADER_SIZE]; + let cookie = Cookie(0xdeadbeef); + let id = NeedleId(0x123456789abcdef0); + let size = Size(1024); + cookie.to_bytes(&mut buf[0..4]); + id.to_bytes(&mut buf[4..12]); + size.to_bytes(&mut buf[12..16]); + + let (c, i, s) = Needle::parse_header(&buf); + assert_eq!(c, cookie); + assert_eq!(i, id); + assert_eq!(s, size); + } + + #[test] + fn test_needle_write_read_round_trip_v3() { + let mut n = Needle::default(); + n.cookie = Cookie(42); + n.id = NeedleId(100); + n.data = b"hello world".to_vec(); + n.flags = 0; + n.set_has_name(); + n.name = b"test.txt".to_vec(); + n.name_size = 8; + n.set_has_mime(); + n.mime = b"text/plain".to_vec(); + n.mime_size = 10; + n.set_has_last_modified_date(); + n.last_modified = 1234567890; + n.set_has_ttl(); + n.ttl = Some(TTL { + count: 5, + unit: super::super::ttl::TTL_UNIT_DAY, + }); + n.append_at_ns = 999999999; + + let bytes = n.write_bytes(VERSION_3); + + // Verify total size matches expected + let expected_size = get_actual_size(n.size, VERSION_3); + assert_eq!(bytes.len() as i64, expected_size); + + // Read it back + let mut n2 = Needle::default(); + n2.read_bytes(&bytes, 0, n.size, VERSION_3).unwrap(); + + assert_eq!(n2.cookie, n.cookie); + assert_eq!(n2.id, n.id); + assert_eq!(n2.data, n.data); + assert_eq!(n2.name, n.name); + assert_eq!(n2.mime, n.mime); + assert_eq!(n2.last_modified, n.last_modified); + assert_eq!(n2.ttl, n.ttl); + assert_eq!(n2.checksum, n.checksum); + assert_eq!(n2.append_at_ns, n.append_at_ns); + } + + #[test] + fn test_needle_write_read_round_trip_v2() { + let mut n = Needle::default(); + n.cookie = Cookie(77); + n.id = NeedleId(200); + n.data = b"data v2".to_vec(); + n.flags = 0; + + let bytes = n.write_bytes(VERSION_2); + let expected_size = get_actual_size(n.size, VERSION_2); + assert_eq!(bytes.len() as i64, expected_size); + + let mut n2 = Needle::default(); + n2.read_bytes(&bytes, 0, n.size, VERSION_2).unwrap(); + + assert_eq!(n2.data, n.data); + assert_eq!(n2.checksum, n.checksum); + } + + #[test] + fn test_read_bytes_meta_only_handles_tombstone_v3() { + let mut tombstone = Needle::default(); + tombstone.cookie = Cookie(0x1234abcd); + tombstone.id = NeedleId(300); + tombstone.append_at_ns = 999_999; + + let bytes = tombstone.write_bytes(VERSION_3); + + let mut meta = Needle::default(); + meta.read_bytes_meta_only(&bytes, 0, Size(0), VERSION_3) + .unwrap(); + + assert_eq!(meta.cookie, tombstone.cookie); + assert_eq!(meta.id, tombstone.id); + assert_eq!(meta.size, Size(0)); + assert_eq!(meta.data_size, 0); + assert_eq!(meta.append_at_ns, tombstone.append_at_ns); + assert_eq!(meta.checksum, tombstone.checksum); + } + + #[test] + fn test_padding_alignment() { + // All actual sizes should be multiples of 8 + for size_val in 0..50 { + let s = Size(size_val); + let actual_v2 = get_actual_size(s, VERSION_2); + let actual_v3 = get_actual_size(s, VERSION_3); + assert_eq!(actual_v2 % 8, 0, "V2 size {} not aligned", size_val); + assert_eq!(actual_v3 % 8, 0, "V3 size {} not aligned", size_val); + } + } + + #[test] + fn test_file_id_parse() { + let fid = FileId::parse("3,01637037d6").unwrap(); + assert_eq!(fid.volume_id, VolumeId(3)); + // The hex "01637037d6" is 5 bytes = 0x0163..., padded to 12 bytes + assert!(!fid.key.is_empty() || !fid.cookie.0 == 0); + } + + #[test] + fn test_file_id_round_trip() { + let fid = FileId::new(VolumeId(5), NeedleId(0x123456), Cookie(0xabcd)); + let s = fid.to_string(); + let fid2 = FileId::parse(&s).unwrap(); + assert_eq!(fid, fid2); + } + + #[test] + fn test_needle_id_cookie_format() { + let s = format_needle_id_cookie(NeedleId(1), Cookie(0x12345678)); + let (key, cookie) = parse_needle_id_cookie(&s).unwrap(); + assert_eq!(key, NeedleId(1)); + assert_eq!(cookie, Cookie(0x12345678)); + } +} diff --git a/seaweed-volume/src/storage/needle/ttl.rs b/seaweed-volume/src/storage/needle/ttl.rs new file mode 100644 index 000000000..f55cb082f --- /dev/null +++ b/seaweed-volume/src/storage/needle/ttl.rs @@ -0,0 +1,302 @@ +//! Time-to-live encoding for needles. +//! +//! TTL is stored as 2 bytes: Count(1) + Unit(1). +//! Supported units: minute(m), hour(h), day(d), week(w), month(M), year(y). + +use std::fmt; + +/// TTL unit constants (matching Go). +pub const TTL_UNIT_EMPTY: u8 = 0; +pub const TTL_UNIT_MINUTE: u8 = 1; +pub const TTL_UNIT_HOUR: u8 = 2; +pub const TTL_UNIT_DAY: u8 = 3; +pub const TTL_UNIT_WEEK: u8 = 4; +pub const TTL_UNIT_MONTH: u8 = 5; +pub const TTL_UNIT_YEAR: u8 = 6; + +pub const TTL_BYTES_LENGTH: usize = 2; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub struct TTL { + pub count: u8, + pub unit: u8, +} + +impl TTL { + pub const EMPTY: TTL = TTL { count: 0, unit: 0 }; + + pub fn is_empty(&self) -> bool { + self.count == 0 && self.unit == 0 + } + + /// Load from 2 bytes. + pub fn from_bytes(input: &[u8]) -> Self { + if input.len() < 2 { + return TTL::EMPTY; + } + TTL { + count: input[0], + unit: input[1], + } + } + + /// Serialize to 2 bytes. + pub fn to_bytes(&self, output: &mut [u8]) { + assert!(output.len() >= 2); + output[0] = self.count; + output[1] = self.unit; + } + + /// Encode as u32: (count << 8) | unit. + pub fn to_u32(&self) -> u32 { + if self.count == 0 { + return 0; + } + ((self.count as u32) << 8) + (self.unit as u32) + } + + /// Decode from u32. + pub fn from_u32(v: u32) -> Self { + if v == 0 { + return TTL::EMPTY; + } + TTL { + count: (v >> 8) as u8, + unit: (v & 0xFF) as u8, + } + } + + /// Convert to total seconds. + pub fn to_seconds(&self) -> u64 { + unit_to_seconds(self.count as u64, self.unit) + } + + /// Parse from string like "3m", "4h", "5d", "6w", "7M", "8y". + /// If the string is all digits (no unit suffix), defaults to minutes. + /// Matches Go's ReadTTL which calls fitTtlCount to normalize: + /// e.g. "120m" -> 2h, "7d" -> 1w, "24h" -> 1d. + pub fn read(s: &str) -> Result { + let s = s.trim(); + if s.is_empty() { + return Ok(TTL::EMPTY); + } + let last_byte = s.as_bytes()[s.len() - 1]; + let (num_str, unit_byte) = if last_byte >= b'0' && last_byte <= b'9' { + // All digits — default to minutes (matching Go) + (s, b'm') + } else { + (&s[..s.len() - 1], last_byte) + }; + let count: u32 = num_str + .parse() + .map_err(|e| format!("invalid TTL count: {}", e))?; + let unit = match unit_byte { + b'm' => TTL_UNIT_MINUTE, + b'h' => TTL_UNIT_HOUR, + b'd' => TTL_UNIT_DAY, + b'w' => TTL_UNIT_WEEK, + b'M' => TTL_UNIT_MONTH, + b'y' => TTL_UNIT_YEAR, + _ => return Err(format!("unknown TTL unit: {}", unit_byte as char)), + }; + // Match Go's ReadTTL: normalize via fitTtlCount + Ok(fit_ttl_count(count, unit)) + } + + /// Minutes representation. + pub fn minutes(&self) -> u32 { + (self.to_seconds() / 60) as u32 + } +} + +fn unit_to_seconds(count: u64, unit: u8) -> u64 { + match unit { + TTL_UNIT_EMPTY => 0, + TTL_UNIT_MINUTE => count * 60, + TTL_UNIT_HOUR => count * 60 * 60, + TTL_UNIT_DAY => count * 60 * 60 * 24, + TTL_UNIT_WEEK => count * 60 * 60 * 24 * 7, + TTL_UNIT_MONTH => count * 60 * 60 * 24 * 30, + TTL_UNIT_YEAR => count * 60 * 60 * 24 * 365, + _ => 0, + } +} + +/// Fit a count+unit into a TTL that fits in a single byte count. +/// Converts to seconds first, then finds the coarsest unit that fits. +/// Matches Go's fitTtlCount called from ReadTTL. +fn fit_ttl_count(count: u32, unit: u8) -> TTL { + if count == 0 || unit == TTL_UNIT_EMPTY { + return TTL::EMPTY; + } + + // Always convert to seconds and normalize (matches Go). + let seconds = unit_to_seconds(count as u64, unit); + if seconds == 0 { + return TTL::EMPTY; + } + + const YEAR_SECS: u64 = 3600 * 24 * 365; + const MONTH_SECS: u64 = 3600 * 24 * 30; + const WEEK_SECS: u64 = 3600 * 24 * 7; + const DAY_SECS: u64 = 3600 * 24; + const HOUR_SECS: u64 = 3600; + const MINUTE_SECS: u64 = 60; + + // First pass: try exact fits from largest to smallest + if seconds % YEAR_SECS == 0 && seconds / YEAR_SECS < 256 { + return TTL { count: (seconds / YEAR_SECS) as u8, unit: TTL_UNIT_YEAR }; + } + if seconds % MONTH_SECS == 0 && seconds / MONTH_SECS < 256 { + return TTL { count: (seconds / MONTH_SECS) as u8, unit: TTL_UNIT_MONTH }; + } + if seconds % WEEK_SECS == 0 && seconds / WEEK_SECS < 256 { + return TTL { count: (seconds / WEEK_SECS) as u8, unit: TTL_UNIT_WEEK }; + } + if seconds % DAY_SECS == 0 && seconds / DAY_SECS < 256 { + return TTL { count: (seconds / DAY_SECS) as u8, unit: TTL_UNIT_DAY }; + } + if seconds % HOUR_SECS == 0 && seconds / HOUR_SECS < 256 { + return TTL { count: (seconds / HOUR_SECS) as u8, unit: TTL_UNIT_HOUR }; + } + // Minutes: truncating division + if seconds / MINUTE_SECS < 256 { + return TTL { count: (seconds / MINUTE_SECS) as u8, unit: TTL_UNIT_MINUTE }; + } + // Second pass: truncating division from smallest to largest + if seconds / HOUR_SECS < 256 { + return TTL { count: (seconds / HOUR_SECS) as u8, unit: TTL_UNIT_HOUR }; + } + if seconds / DAY_SECS < 256 { + return TTL { count: (seconds / DAY_SECS) as u8, unit: TTL_UNIT_DAY }; + } + if seconds / WEEK_SECS < 256 { + return TTL { count: (seconds / WEEK_SECS) as u8, unit: TTL_UNIT_WEEK }; + } + if seconds / MONTH_SECS < 256 { + return TTL { count: (seconds / MONTH_SECS) as u8, unit: TTL_UNIT_MONTH }; + } + if seconds / YEAR_SECS < 256 { + return TTL { count: (seconds / YEAR_SECS) as u8, unit: TTL_UNIT_YEAR }; + } + TTL::EMPTY +} + +fn unit_to_char(unit: u8) -> char { + match unit { + TTL_UNIT_MINUTE => 'm', + TTL_UNIT_HOUR => 'h', + TTL_UNIT_DAY => 'd', + TTL_UNIT_WEEK => 'w', + TTL_UNIT_MONTH => 'M', + TTL_UNIT_YEAR => 'y', + _ => ' ', + } +} + +impl fmt::Display for TTL { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.count == 0 || self.unit == TTL_UNIT_EMPTY { + return write!(f, ""); + } + write!(f, "{}{}", self.count, unit_to_char(self.unit)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_ttl_parse() { + let ttl = TTL::read("3m").unwrap(); + assert_eq!( + ttl, + TTL { + count: 3, + unit: TTL_UNIT_MINUTE + } + ); + assert_eq!(ttl.to_seconds(), 180); + } + + #[test] + fn test_ttl_parse_hours() { + // 24h normalizes to 1d via fitTtlCount + let ttl = TTL::read("24h").unwrap(); + assert_eq!(ttl.to_seconds(), 86400); + assert_eq!(ttl, TTL { count: 1, unit: TTL_UNIT_DAY }); + } + + #[test] + fn test_ttl_display() { + let ttl = TTL { + count: 5, + unit: TTL_UNIT_DAY, + }; + assert_eq!(ttl.to_string(), "5d"); + } + + #[test] + fn test_ttl_bytes_round_trip() { + let ttl = TTL { + count: 10, + unit: TTL_UNIT_WEEK, + }; + let mut buf = [0u8; 2]; + ttl.to_bytes(&mut buf); + let ttl2 = TTL::from_bytes(&buf); + assert_eq!(ttl, ttl2); + } + + #[test] + fn test_ttl_u32_round_trip() { + let ttl = TTL { + count: 42, + unit: TTL_UNIT_HOUR, + }; + let v = ttl.to_u32(); + let ttl2 = TTL::from_u32(v); + assert_eq!(ttl, ttl2); + } + + #[test] + fn test_ttl_empty() { + assert!(TTL::EMPTY.is_empty()); + assert_eq!(TTL::EMPTY.to_seconds(), 0); + assert_eq!(TTL::EMPTY.to_u32(), 0); + } + + #[test] + fn test_ttl_overflow_normalizes() { + // Go's ReadTTL calls fitTtlCount: 300m = 18000s = 5h (exact fit) + let ttl = TTL::read("300m").unwrap(); + assert_eq!(ttl, TTL { count: 5, unit: TTL_UNIT_HOUR }); + + // 256h = 921600s. Doesn't fit in hours (256 >= 256), doesn't fit exact in days. + // Second pass: 921600/86400 = 10 (truncated) < 256 -> 10d + let ttl = TTL::read("256h").unwrap(); + assert_eq!(ttl, TTL { count: 10, unit: TTL_UNIT_DAY }); + } + + #[test] + fn test_ttl_normalizes_unit() { + // Go's ReadTTL calls fitTtlCount which normalizes to coarsest unit. + // 120m -> 2h, 7d -> 1w, 24h -> 1d. + let ttl = TTL::read("120m").unwrap(); + assert_eq!(ttl, TTL { count: 2, unit: TTL_UNIT_HOUR }); + + let ttl = TTL::read("7d").unwrap(); + assert_eq!(ttl, TTL { count: 1, unit: TTL_UNIT_WEEK }); + + let ttl = TTL::read("24h").unwrap(); + assert_eq!(ttl, TTL { count: 1, unit: TTL_UNIT_DAY }); + + // Values that don't simplify stay as-is + let ttl = TTL::read("5d").unwrap(); + assert_eq!(ttl, TTL { count: 5, unit: TTL_UNIT_DAY }); + + let ttl = TTL::read("3m").unwrap(); + assert_eq!(ttl, TTL { count: 3, unit: TTL_UNIT_MINUTE }); + } +} diff --git a/seaweed-volume/src/storage/needle_map.rs b/seaweed-volume/src/storage/needle_map.rs new file mode 100644 index 000000000..248604e1d --- /dev/null +++ b/seaweed-volume/src/storage/needle_map.rs @@ -0,0 +1,1438 @@ +//! NeedleMapper: index mapping NeedleId -> (Offset, Size). +//! +//! Two implementations: +//! - `CompactNeedleMap`: in-memory segmented sorted arrays (~10 bytes/entry) +//! - `RedbNeedleMap`: disk-backed via redb (low RAM, slightly slower) +//! +//! The `NeedleMap` enum wraps both and provides a uniform interface. +//! Loaded from .idx file on volume mount. Supports Get, Put, Delete with +//! metrics tracking (file count, byte count, deleted count, deleted bytes). + +use std::collections::HashMap; +use std::io::{self, Read, Seek, Write}; +use std::path::Path; +use std::sync::atomic::{AtomicI64, AtomicU64, Ordering}; + +mod compact_map; +use compact_map::CompactMap; + +use redb::{Database, Durability, ReadableDatabase, ReadableTable, TableDefinition}; + +use crate::storage::idx; +use crate::storage::types::*; + +// ============================================================================ +// NeedleValue +// ============================================================================ + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct NeedleValue { + pub offset: Offset, + pub size: Size, +} + +/// Packed size of a NeedleValue in redb storage: OFFSET_SIZE + SIZE_SIZE. +const PACKED_NEEDLE_VALUE_SIZE: usize = OFFSET_SIZE + SIZE_SIZE; + +/// Pack an (Offset, Size) pair into bytes for redb storage. +/// Layout: [offset OFFSET_SIZE bytes] [size 4 bytes big-endian] +fn pack_needle_value(nv: &NeedleValue) -> [u8; PACKED_NEEDLE_VALUE_SIZE] { + let mut buf = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + nv.offset.to_bytes(&mut buf[..OFFSET_SIZE]); + nv.size.to_bytes(&mut buf[OFFSET_SIZE..]); + buf +} + +/// Unpack bytes from redb storage into (Offset, Size). +fn unpack_needle_value(bytes: &[u8; PACKED_NEEDLE_VALUE_SIZE]) -> NeedleValue { + NeedleValue { + offset: Offset::from_bytes(&bytes[..OFFSET_SIZE]), + size: Size::from_bytes(&bytes[OFFSET_SIZE..]), + } +} + +// ============================================================================ +// NeedleMapMetric +// ============================================================================ + +/// Metrics tracking for needle map operations. +#[derive(Debug, Default)] +pub struct NeedleMapMetric { + pub file_count: AtomicI64, + pub file_byte_count: AtomicU64, + pub deletion_count: AtomicI64, + pub deletion_byte_count: AtomicU64, + pub max_file_key: AtomicU64, +} + +impl NeedleMapMetric { + /// Update metrics based on a Put operation (additive-only, matching Go's logPut). + fn on_put(&self, key: NeedleId, old: Option<&NeedleValue>, new_size: Size) { + self.maybe_set_max_file_key(key); + // Go: always LogFileCounter(newSize) which does FileCounter++ and FileByteCounter += newSize + self.file_count.fetch_add(1, Ordering::Relaxed); + self.file_byte_count + .fetch_add(new_size.0 as u64, Ordering::Relaxed); + // Go: if oldSize > 0 && oldSize.IsValid() { LogDeletionCounter(oldSize) } + if let Some(old_val) = old { + if old_val.size.0 > 0 && old_val.size.is_valid() { + self.deletion_count.fetch_add(1, Ordering::Relaxed); + self.deletion_byte_count + .fetch_add(old_val.size.0 as u64, Ordering::Relaxed); + } + } + } + + /// Update metrics based on a Delete operation (additive-only, matching Go's logDelete). + fn on_delete(&self, old: &NeedleValue) { + if old.size.0 > 0 { + self.deletion_count.fetch_add(1, Ordering::Relaxed); + self.deletion_byte_count + .fetch_add(old.size.0 as u64, Ordering::Relaxed); + } + } + + fn maybe_set_max_file_key(&self, key: NeedleId) { + let key_val: u64 = key.into(); + loop { + let current = self.max_file_key.load(Ordering::Relaxed); + if key_val <= current { + break; + } + if self + .max_file_key + .compare_exchange(current, key_val, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + break; + } + } + } +} + +// ============================================================================ +// NeedleMapKind +// ============================================================================ + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum NeedleMapKind { + InMemory, + LevelDb, + LevelDbMedium, + LevelDbLarge, +} + +// ============================================================================ +// IdxFileWriter trait +// ============================================================================ + +/// Trait for appending to an index file. +pub trait IdxFileWriter: Write + Send + Sync { + fn sync_all(&self) -> io::Result<()>; +} + +impl IdxFileWriter for std::fs::File { + fn sync_all(&self) -> io::Result<()> { + std::fs::File::sync_all(self) + } +} + +// ============================================================================ +// CompactNeedleMap (in-memory) +// ============================================================================ + +/// In-memory needle map backed by a CompactMap (segmented sorted arrays). +/// Uses ~10 bytes per entry instead of ~40-48 bytes with HashMap. +/// The .idx file is kept open for append-only writes. +pub struct CompactNeedleMap { + map: CompactMap, + metric: NeedleMapMetric, + idx_file: Option>, + idx_file_offset: u64, +} + +impl CompactNeedleMap { + /// Create a new empty in-memory map. + pub fn new() -> Self { + CompactNeedleMap { + map: CompactMap::new(), + metric: NeedleMapMetric::default(), + idx_file: None, + idx_file_offset: 0, + } + } + + /// Load from an .idx file, building the in-memory map. + pub fn load_from_idx(reader: &mut R) -> io::Result { + let mut nm = CompactNeedleMap::new(); + idx::walk_index_file(reader, 0, |key, offset, size| { + if offset.is_zero() || size.is_deleted() { + nm.delete_from_map(key); + } else { + nm.set_internal(key, NeedleValue { offset, size }); + } + Ok(()) + })?; + Ok(nm) + } + + /// Set the index file for append-only writes. + pub fn set_idx_file(&mut self, file: Box, offset: u64) { + self.idx_file = Some(file); + self.idx_file_offset = offset; + } + + // ---- Map operations ---- + + /// Insert or update an entry. Appends to .idx file if present. + pub fn put(&mut self, key: NeedleId, offset: Offset, size: Size) -> io::Result<()> { + // Persist to idx file BEFORE mutating in-memory state for crash consistency + if let Some(ref mut idx_file) = self.idx_file { + idx::write_index_entry(idx_file, key, offset, size)?; + self.idx_file_offset += NEEDLE_MAP_ENTRY_SIZE as u64; + } + + let old = self.map.get(key); + self.metric.on_put(key, old.as_ref(), size); + self.map.set(key, offset, size); + Ok(()) + } + + /// Look up a needle. + pub fn get(&self, key: NeedleId) -> Option { + self.map.get(key) + } + + /// Mark a needle as deleted. Appends tombstone to .idx file. + /// Matches Go's NeedleMap.Delete: ALWAYS writes tombstone to idx and + /// increments deletion counter, even if needle doesn't exist or is + /// already deleted (important for replication). + pub fn delete(&mut self, key: NeedleId, offset: Offset) -> io::Result> { + // Go unconditionally calls nm.m.Delete(), nm.logDelete(), nm.appendToIndexFile() + let deleted_bytes = self.map.delete(key).unwrap_or(Size(0)); + + // Match Go's logDelete -> LogDeletionCounter: only increment when oldSize > 0. + // Go does NOT decrement FileCounter/FileByteCounter in Delete; + // live counts are computed as FileCounter - DeletionCounter. + if deleted_bytes.0 > 0 { + self.metric.deletion_count.fetch_add(1, Ordering::Relaxed); + self.metric + .deletion_byte_count + .fetch_add(deleted_bytes.0 as u64, Ordering::Relaxed); + } + + // Always write tombstone to idx file (matching Go) + if let Some(ref mut idx_file) = self.idx_file { + idx::write_index_entry(idx_file, key, offset, TOMBSTONE_FILE_SIZE)?; + self.idx_file_offset += NEEDLE_MAP_ENTRY_SIZE as u64; + } + + if deleted_bytes.0 > 0 { + Ok(Some(deleted_bytes)) + } else { + Ok(None) + } + } + + // ---- Internal helpers ---- + + /// Insert into map during loading (no idx file write). + fn set_internal(&mut self, key: NeedleId, nv: NeedleValue) { + let old = self.map.get(key); + self.metric.on_put(key, old.as_ref(), nv.size); + self.map.set(key, nv.offset, nv.size); + } + + /// Remove from map during loading (handle deletions in idx walk). + /// Matches Go's doLoading else branch: always increments DeletionCounter, + /// and adds old size bytes to DeletionByteCounter. + fn delete_from_map(&mut self, key: NeedleId) { + self.metric.maybe_set_max_file_key(key); + // Go's CompactMap.Delete returns old size (0 if not found or already deleted). + // Go's doLoading always does DeletionCounter++ and DeletionByteCounter += uint64(oldSize). + let old_size = self.map.get(key).map(|nv| nv.size).unwrap_or(Size(0)); + // Go unconditionally increments DeletionCounter + self.metric.deletion_count.fetch_add(1, Ordering::Relaxed); + // Go adds uint64(oldSize) which for valid sizes adds the value, for 0/negative adds 0 + if old_size.0 > 0 { + self.metric + .deletion_byte_count + .fetch_add(old_size.0 as u64, Ordering::Relaxed); + } + self.map.remove(key); + } + + // ---- Metrics accessors ---- + + pub fn content_size(&self) -> u64 { + self.metric.file_byte_count.load(Ordering::Relaxed) + } + + pub fn deleted_size(&self) -> u64 { + self.metric.deletion_byte_count.load(Ordering::Relaxed) + } + + pub fn file_count(&self) -> i64 { + self.metric.file_count.load(Ordering::Relaxed) + } + + pub fn deleted_count(&self) -> i64 { + self.metric.deletion_count.load(Ordering::Relaxed) + } + + pub fn max_file_key(&self) -> NeedleId { + NeedleId(self.metric.max_file_key.load(Ordering::Relaxed)) + } + + pub fn index_file_size(&self) -> u64 { + self.idx_file_offset + } + + /// Sync index file to disk. + pub fn sync(&self) -> io::Result<()> { + if let Some(ref idx_file) = self.idx_file { + idx_file.sync_all()?; + } + Ok(()) + } + + /// Close index file. + pub fn close(&mut self) { + let _ = self.sync(); + self.idx_file = None; + } + + /// Save the in-memory map to an index file, sorted by needle ID ascending. + pub fn save_to_idx(&self, path: &str) -> io::Result<()> { + let mut file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(path)?; + + self.map.ascending_visit(|id, nv| { + if nv.size.is_valid() { + idx::write_index_entry(&mut file, id, nv.offset, nv.size) + } else { + Ok(()) + } + })?; + file.sync_all()?; + Ok(()) + } + + /// Visit all entries in ascending order by needle ID. + pub fn ascending_visit(&self, f: F) -> Result<(), String> + where + F: FnMut(NeedleId, &NeedleValue) -> Result<(), String>, + { + self.map.ascending_visit(f) + } +} + +// ============================================================================ +// RedbNeedleMap (disk-backed via redb) +// ============================================================================ + +/// redb table: NeedleId (u64) -> packed [offset(4) + size(4)] +const NEEDLE_TABLE: TableDefinition = TableDefinition::new("needles"); + +/// Metadata table: stores the .idx file size that was used to build this redb. +/// Key "idx_size" -> u64 byte offset. Used to detect whether the .rdb can be +/// reused on restart or needs a full/incremental rebuild. +const META_TABLE: TableDefinition<&str, u64> = TableDefinition::new("meta"); +const META_IDX_SIZE: &str = "idx_size"; + +/// Disk-backed needle map using redb. +/// Low memory usage — data lives on disk with redb's page cache. +pub struct RedbNeedleMap { + db: Database, + metric: NeedleMapMetric, + idx_file: Option>, + idx_file_offset: u64, +} + +impl RedbNeedleMap { + /// Begin a write transaction with `Durability::None` (no fsync). + /// The .idx file is the source of truth for crash recovery, so redb + /// is always rebuilt from .idx on startup — fsync is unnecessary. + fn begin_write_no_fsync(db: &Database) -> io::Result { + let mut txn = db.begin_write().map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb begin_write: {}", e)) + })?; + let _ = txn.set_durability(Durability::None); + Ok(txn) + } + + /// Create a new redb-backed needle map at the given path. + /// The database file will be created if it does not exist. + pub fn new(db_path: &str) -> io::Result { + let db = Database::create(db_path).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb create error: {}", e)) + })?; + + // Ensure tables exist + let txn = Self::begin_write_no_fsync(&db)?; + { + let _table = txn.open_table(NEEDLE_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)) + })?; + let _meta = txn.open_table(META_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table meta: {}", e)) + })?; + } + txn.commit() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb commit: {}", e)))?; + + Ok(RedbNeedleMap { + db, + metric: NeedleMapMetric::default(), + idx_file: None, + idx_file_offset: 0, + }) + } + + /// Save the .idx file size into redb metadata so we can detect whether + /// the .rdb is up-to-date on the next startup. + fn save_idx_size_meta(&self, idx_size: u64) -> io::Result<()> { + let txn = Self::begin_write_no_fsync(&self.db)?; + { + let mut meta = txn.open_table(META_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open meta: {}", e)) + })?; + meta.insert(META_IDX_SIZE, idx_size).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb insert meta: {}", e)) + })?; + } + txn.commit().map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb commit meta: {}", e)) + })?; + Ok(()) + } + + /// Read the stored .idx file size from redb metadata. + fn read_idx_size_meta(&self) -> io::Result> { + let txn = self + .db + .begin_read() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb begin_read: {}", e)))?; + let meta = txn + .open_table(META_TABLE) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb open meta: {}", e)))?; + match meta.get(META_IDX_SIZE) { + Ok(Some(guard)) => Ok(Some(guard.value())), + Ok(None) => Ok(None), + Err(e) => Err(io::Error::new( + io::ErrorKind::Other, + format!("redb get meta: {}", e), + )), + } + } + + /// Rebuild metrics by scanning all entries in the redb table. + /// Called when reusing an existing .rdb without a full rebuild. + fn rebuild_metrics_from_db(&self) -> io::Result<()> { + let txn = self + .db + .begin_read() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb begin_read: {}", e)))?; + let table = txn + .open_table(NEEDLE_TABLE) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)))?; + let iter = table + .iter() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb iter: {}", e)))?; + for entry in iter { + let (key_guard, val_guard) = entry.map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb iter next: {}", e)) + })?; + let key = NeedleId(key_guard.value()); + let bytes: &[u8] = val_guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + let nv = unpack_needle_value(&arr); + self.metric.maybe_set_max_file_key(key); + if nv.size.is_valid() { + self.metric.file_count.fetch_add(1, Ordering::Relaxed); + self.metric + .file_byte_count + .fetch_add(nv.size.0 as u64, Ordering::Relaxed); + } else { + // Deleted entry (negative size) + self.metric.deletion_count.fetch_add(1, Ordering::Relaxed); + self.metric + .deletion_byte_count + .fetch_add((-nv.size.0) as u64, Ordering::Relaxed); + } + } + } + Ok(()) + } + + /// Load from an .idx file, reusing an existing .rdb if it is consistent. + /// + /// Strategy: + /// 1. Try to open existing .rdb and read its stored .idx size + /// 2. If .idx size matches → reuse .rdb, rebuild metrics from scan + /// 3. If .idx is larger → replay new entries incrementally + /// 4. Otherwise (missing, corrupted, .idx smaller) → full rebuild + pub fn load_from_idx(db_path: &str, reader: &mut R) -> io::Result { + let idx_size = reader.seek(io::SeekFrom::End(0))?; + reader.seek(io::SeekFrom::Start(0))?; + + // Try to reuse existing .rdb + if Path::new(db_path).exists() { + if let Ok(nm) = Self::try_reuse_rdb(db_path, reader, idx_size) { + return Ok(nm); + } + // Reuse failed — fall through to full rebuild + reader.seek(io::SeekFrom::Start(0))?; + } + + Self::full_rebuild(db_path, reader, idx_size) + } + + /// Try to reuse an existing .rdb file. Returns Ok if successful, + /// Err if a full rebuild is needed. + fn try_reuse_rdb( + db_path: &str, + reader: &mut R, + idx_size: u64, + ) -> io::Result { + let db = Database::open(db_path) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb open: {}", e)))?; + + let nm = RedbNeedleMap { + db, + metric: NeedleMapMetric::default(), + idx_file: None, + idx_file_offset: 0, + }; + + let stored_idx_size = nm + .read_idx_size_meta()? + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "no idx_size in redb meta"))?; + + if stored_idx_size > idx_size { + // .idx shrank — corrupted or truncated, need full rebuild + return Err(io::Error::new( + io::ErrorKind::Other, + "idx file smaller than stored size", + )); + } + + // Rebuild metrics from existing data + nm.rebuild_metrics_from_db()?; + + if stored_idx_size < idx_size { + // .idx grew — replay new entries incrementally + let start_entry = stored_idx_size / NEEDLE_MAP_ENTRY_SIZE as u64; + let txn = Self::begin_write_no_fsync(&nm.db)?; + { + let mut table = txn.open_table(NEEDLE_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)) + })?; + idx::walk_index_file(reader, start_entry, |key, offset, size| { + let key_u64: u64 = key.into(); + if offset.is_zero() || size.is_deleted() { + // Delete: look up old value for metric update, then + // store tombstone (negative size with original offset) + if let Ok(Some(old)) = nm.get_via_table(&table, key_u64) { + if old.size.is_valid() { + nm.metric.on_delete(&old); + let deleted_nv = NeedleValue { + offset: old.offset, + size: Size(-(old.size.0)), + }; + let packed = pack_needle_value(&deleted_nv); + table.insert(key_u64, packed.as_slice()).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("redb insert: {}", e), + ) + })?; + } + } + } else { + // Put: look up old value for metric update + let old = nm.get_via_table(&table, key_u64).ok().flatten(); + let nv = NeedleValue { offset, size }; + let packed = pack_needle_value(&nv); + table.insert(key_u64, packed.as_slice()).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb insert: {}", e)) + })?; + nm.metric.on_put(key, old.as_ref(), size); + } + Ok(()) + })?; + } + txn.commit() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb commit: {}", e)))?; + + nm.save_idx_size_meta(idx_size)?; + } + + Ok(nm) + } + + /// Look up a needle value using an already-open table reference. + /// Used during incremental replay to avoid opening separate read transactions. + fn get_via_table( + &self, + table: &redb::Table, + key_u64: u64, + ) -> io::Result> { + match table.get(key_u64) { + Ok(Some(guard)) => { + let bytes: &[u8] = guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + Ok(Some(unpack_needle_value(&arr))) + } else { + Ok(None) + } + } + Ok(None) => Ok(None), + Err(e) => Err(io::Error::new( + io::ErrorKind::Other, + format!("redb get: {}", e), + )), + } + } + + /// Full rebuild: delete existing .rdb and rebuild from entire .idx file. + fn full_rebuild( + db_path: &str, + reader: &mut R, + idx_size: u64, + ) -> io::Result { + let _ = std::fs::remove_file(db_path); + let nm = RedbNeedleMap::new(db_path)?; + + // Collect entries from idx file, resolving duplicates/deletions + let mut entries: HashMap> = HashMap::new(); + idx::walk_index_file(reader, 0, |key, offset, size| { + if offset.is_zero() || size.is_deleted() { + entries.insert(key, None); + } else { + entries.insert(key, Some(NeedleValue { offset, size })); + } + Ok(()) + })?; + + // Write all live entries to redb in a single transaction + let txn = Self::begin_write_no_fsync(&nm.db)?; + { + let mut table = txn.open_table(NEEDLE_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)) + })?; + + for (key, maybe_nv) in &entries { + let key_u64: u64 = (*key).into(); + if let Some(nv) = maybe_nv { + let packed = pack_needle_value(nv); + table.insert(key_u64, packed.as_slice()).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb insert: {}", e)) + })?; + nm.metric.on_put(*key, None, nv.size); + } else { + // Entry was deleted — remove from redb if present + table.remove(key_u64).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb remove: {}", e)) + })?; + } + } + } + txn.commit() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb commit: {}", e)))?; + + nm.save_idx_size_meta(idx_size)?; + + Ok(nm) + } + + /// Set the index file for append-only writes. + pub fn set_idx_file(&mut self, file: Box, offset: u64) { + self.idx_file = Some(file); + self.idx_file_offset = offset; + } + + // ---- Map operations ---- + + /// Insert or update an entry. Writes to idx file first, then redb. + pub fn put(&mut self, key: NeedleId, offset: Offset, size: Size) -> io::Result<()> { + // Persist to idx file BEFORE mutating redb state for crash consistency + if let Some(ref mut idx_file) = self.idx_file { + idx::write_index_entry(idx_file, key, offset, size)?; + self.idx_file_offset += NEEDLE_MAP_ENTRY_SIZE as u64; + } + + let key_u64: u64 = key.into(); + let nv = NeedleValue { offset, size }; + let packed = pack_needle_value(&nv); + + // Read old value for metric update + let old = self.get_internal(key_u64)?; + + let txn = Self::begin_write_no_fsync(&self.db)?; + { + let mut table = txn.open_table(NEEDLE_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)) + })?; + table + .insert(key_u64, packed.as_slice()) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb insert: {}", e)))?; + } + txn.commit() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb commit: {}", e)))?; + + self.metric.on_put(key, old.as_ref(), size); + Ok(()) + } + + /// Look up a needle. + pub fn get(&self, key: NeedleId) -> Option { + let key_u64: u64 = key.into(); + self.get_internal(key_u64).ok().flatten() + } + + /// Internal get that returns io::Result for error propagation. + fn get_internal(&self, key_u64: u64) -> io::Result> { + let txn = self + .db + .begin_read() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb begin_read: {}", e)))?; + let table = txn + .open_table(NEEDLE_TABLE) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)))?; + match table.get(key_u64) { + Ok(Some(guard)) => { + let bytes: &[u8] = guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + Ok(Some(unpack_needle_value(&arr))) + } else { + Ok(None) + } + } + Ok(None) => Ok(None), + Err(e) => Err(io::Error::new( + io::ErrorKind::Other, + format!("redb get: {}", e), + )), + } + } + + /// Mark a needle as deleted. Appends tombstone to .idx file, negates size in redb. + pub fn delete(&mut self, key: NeedleId, offset: Offset) -> io::Result> { + let key_u64: u64 = key.into(); + + if let Some(old) = self.get_internal(key_u64)? { + if old.size.is_valid() { + // Persist tombstone to idx file BEFORE mutating redb + if let Some(ref mut idx_file) = self.idx_file { + idx::write_index_entry(idx_file, key, offset, TOMBSTONE_FILE_SIZE)?; + self.idx_file_offset += NEEDLE_MAP_ENTRY_SIZE as u64; + } + + self.metric.on_delete(&old); + let deleted_size = Size(-(old.size.0)); + // Keep original offset so readDeleted can find original data (matching Go behavior) + let deleted_nv = NeedleValue { + offset: old.offset, + size: deleted_size, + }; + let packed = pack_needle_value(&deleted_nv); + + let txn = Self::begin_write_no_fsync(&self.db)?; + { + let mut table = txn.open_table(NEEDLE_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)) + })?; + table.insert(key_u64, packed.as_slice()).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb insert: {}", e)) + })?; + } + txn.commit().map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb commit: {}", e)) + })?; + + return Ok(Some(old.size)); + } + } + Ok(None) + } + + // ---- Metrics accessors ---- + + pub fn content_size(&self) -> u64 { + self.metric.file_byte_count.load(Ordering::Relaxed) + } + + pub fn deleted_size(&self) -> u64 { + self.metric.deletion_byte_count.load(Ordering::Relaxed) + } + + pub fn file_count(&self) -> i64 { + self.metric.file_count.load(Ordering::Relaxed) + } + + pub fn deleted_count(&self) -> i64 { + self.metric.deletion_count.load(Ordering::Relaxed) + } + + pub fn max_file_key(&self) -> NeedleId { + NeedleId(self.metric.max_file_key.load(Ordering::Relaxed)) + } + + pub fn index_file_size(&self) -> u64 { + self.idx_file_offset + } + + /// Sync index file to disk. + pub fn sync(&self) -> io::Result<()> { + if let Some(ref idx_file) = self.idx_file { + idx_file.sync_all()?; + } + Ok(()) + } + + /// Close index file. + pub fn close(&mut self) { + let _ = self.sync(); + self.idx_file = None; + } + + /// Save the redb contents to an index file, sorted by needle ID ascending. + pub fn save_to_idx(&self, path: &str) -> io::Result<()> { + let txn = self + .db + .begin_read() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb begin_read: {}", e)))?; + let table = txn + .open_table(NEEDLE_TABLE) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)))?; + + let mut file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(path)?; + + // redb iterates in key order (u64 ascending) + let iter = table + .iter() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb iter: {}", e)))?; + + for entry in iter { + let (key_guard, val_guard) = entry.map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb iter next: {}", e)) + })?; + let key_u64: u64 = key_guard.value(); + let bytes: &[u8] = val_guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + let nv = unpack_needle_value(&arr); + if nv.size.is_valid() { + idx::write_index_entry(&mut file, NeedleId(key_u64), nv.offset, nv.size)?; + } + } + } + file.sync_all()?; + Ok(()) + } + + /// Visit all entries in ascending order by needle ID. + pub fn ascending_visit(&self, mut f: F) -> Result<(), String> + where + F: FnMut(NeedleId, &NeedleValue) -> Result<(), String>, + { + let txn = self + .db + .begin_read() + .map_err(|e| format!("redb begin_read: {}", e))?; + let table = txn + .open_table(NEEDLE_TABLE) + .map_err(|e| format!("redb open_table: {}", e))?; + let iter = table.iter().map_err(|e| format!("redb iter: {}", e))?; + + for entry in iter { + let (key_guard, val_guard) = entry.map_err(|e| format!("redb iter next: {}", e))?; + let key_u64: u64 = key_guard.value(); + let bytes: &[u8] = val_guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + let nv = unpack_needle_value(&arr); + f(NeedleId(key_u64), &nv)?; + } + } + Ok(()) + } + + /// Collect all entries as a Vec for iteration (used by volume.rs iter patterns). + pub fn collect_entries(&self) -> Vec<(NeedleId, NeedleValue)> { + let mut result = Vec::new(); + let txn: redb::ReadTransaction = match self.db.begin_read() { + Ok(t) => t, + Err(_) => return result, + }; + let table = match txn.open_table(NEEDLE_TABLE) { + Ok(t) => t, + Err(_) => return result, + }; + let iter = match table.iter() { + Ok(i) => i, + Err(_) => return result, + }; + for entry in iter { + if let Ok((key_guard, val_guard)) = entry { + let key_u64: u64 = key_guard.value(); + let bytes: &[u8] = val_guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + let nv = unpack_needle_value(&arr); + result.push((NeedleId(key_u64), nv)); + } + } + } + result + } +} + +// ============================================================================ +// NeedleMap enum — unified interface over both implementations +// ============================================================================ + +/// Unified needle map wrapping either in-memory or redb-backed storage. +pub enum NeedleMap { + InMemory(CompactNeedleMap), + Redb(RedbNeedleMap), +} + +impl NeedleMap { + /// Insert or update an entry. + pub fn put(&mut self, key: NeedleId, offset: Offset, size: Size) -> io::Result<()> { + match self { + NeedleMap::InMemory(nm) => nm.put(key, offset, size), + NeedleMap::Redb(nm) => nm.put(key, offset, size), + } + } + + /// Look up a needle. + pub fn get(&self, key: NeedleId) -> Option { + match self { + NeedleMap::InMemory(nm) => nm.get(key), + NeedleMap::Redb(nm) => nm.get(key), + } + } + + /// Mark a needle as deleted. + pub fn delete(&mut self, key: NeedleId, offset: Offset) -> io::Result> { + match self { + NeedleMap::InMemory(nm) => nm.delete(key, offset), + NeedleMap::Redb(nm) => nm.delete(key, offset), + } + } + + /// Set the index file for append-only writes. + pub fn set_idx_file(&mut self, file: Box, offset: u64) { + match self { + NeedleMap::InMemory(nm) => nm.set_idx_file(file, offset), + NeedleMap::Redb(nm) => nm.set_idx_file(file, offset), + } + } + + /// Content byte count. + pub fn content_size(&self) -> u64 { + match self { + NeedleMap::InMemory(nm) => nm.content_size(), + NeedleMap::Redb(nm) => nm.content_size(), + } + } + + /// Deleted byte count. + pub fn deleted_size(&self) -> u64 { + match self { + NeedleMap::InMemory(nm) => nm.deleted_size(), + NeedleMap::Redb(nm) => nm.deleted_size(), + } + } + + /// Live file count. + pub fn file_count(&self) -> i64 { + match self { + NeedleMap::InMemory(nm) => nm.file_count(), + NeedleMap::Redb(nm) => nm.file_count(), + } + } + + /// Deleted file count. + pub fn deleted_count(&self) -> i64 { + match self { + NeedleMap::InMemory(nm) => nm.deleted_count(), + NeedleMap::Redb(nm) => nm.deleted_count(), + } + } + + /// Maximum needle ID seen. + pub fn max_file_key(&self) -> NeedleId { + match self { + NeedleMap::InMemory(nm) => nm.max_file_key(), + NeedleMap::Redb(nm) => nm.max_file_key(), + } + } + + /// Index file size in bytes. + pub fn index_file_size(&self) -> u64 { + match self { + NeedleMap::InMemory(nm) => nm.index_file_size(), + NeedleMap::Redb(nm) => nm.index_file_size(), + } + } + + /// Sync index file to disk. + pub fn sync(&self) -> io::Result<()> { + match self { + NeedleMap::InMemory(nm) => nm.sync(), + NeedleMap::Redb(nm) => nm.sync(), + } + } + + /// Close index file. + pub fn close(&mut self) { + match self { + NeedleMap::InMemory(nm) => nm.close(), + NeedleMap::Redb(nm) => nm.close(), + } + } + + /// Save to an index file. + pub fn save_to_idx(&self, path: &str) -> io::Result<()> { + match self { + NeedleMap::InMemory(nm) => nm.save_to_idx(path), + NeedleMap::Redb(nm) => nm.save_to_idx(path), + } + } + + /// Visit all entries in ascending order by needle ID. + pub fn ascending_visit(&self, f: F) -> Result<(), String> + where + F: FnMut(NeedleId, &NeedleValue) -> Result<(), String>, + { + match self { + NeedleMap::InMemory(nm) => nm.ascending_visit(f), + NeedleMap::Redb(nm) => nm.ascending_visit(f), + } + } + + /// Iterate all entries. Returns a Vec of (NeedleId, NeedleValue) pairs. + /// For InMemory this collects via ascending visit; for Redb it reads from disk. + pub fn iter_entries(&self) -> Vec<(NeedleId, NeedleValue)> { + match self { + NeedleMap::InMemory(nm) => { + let mut entries = Vec::new(); + let _ = nm.ascending_visit(|id, nv| { + entries.push((id, *nv)); + Ok(()) + }); + entries + } + NeedleMap::Redb(nm) => nm.collect_entries(), + } + } +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + #[test] + fn test_needle_map_put_get() { + let mut nm = CompactNeedleMap::new(); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + + let v1 = nm.get(NeedleId(1)).unwrap(); + assert_eq!(v1.size, Size(100)); + + let v2 = nm.get(NeedleId(2)).unwrap(); + assert_eq!(v2.size, Size(200)); + + assert!(nm.get(NeedleId(99)).is_none()); + } + + #[test] + fn test_needle_map_delete() { + let mut nm = CompactNeedleMap::new(); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + + assert_eq!(nm.file_count(), 1); + assert_eq!(nm.content_size(), 100); + + let deleted = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(deleted, Some(Size(100))); + + // Additive-only: file_count stays at 1 after delete + assert_eq!(nm.file_count(), 1); + assert_eq!(nm.deleted_count(), 1); + assert_eq!(nm.deleted_size(), 100); + } + + #[test] + fn test_needle_map_metrics() { + let mut nm = CompactNeedleMap::new(); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + nm.put(NeedleId(3), Offset::from_actual_offset(384), Size(300)) + .unwrap(); + + assert_eq!(nm.file_count(), 3); + assert_eq!(nm.content_size(), 600); + assert_eq!(nm.max_file_key(), NeedleId(3)); + + // Update existing — additive-only: file_count increments, content_size adds + nm.put(NeedleId(2), Offset::from_actual_offset(700), Size(250)) + .unwrap(); + assert_eq!(nm.file_count(), 4); // 3 + 1 (always increments) + assert_eq!(nm.content_size(), 850); // 600 + 250 (always adds) + + // Delete — additive-only: file_count unchanged + nm.delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(nm.file_count(), 4); // unchanged + assert_eq!(nm.deleted_count(), 2); // 1 from overwrite + 1 from delete + } + + #[test] + fn test_needle_map_load_from_idx() { + // Build an idx file in memory + // Note: offset 0 is reserved for the SuperBlock, so real needles start at offset >= 8 + let mut idx_data = Vec::new(); + idx::write_index_entry( + &mut idx_data, + NeedleId(1), + Offset::from_actual_offset(8), + Size(100), + ) + .unwrap(); + idx::write_index_entry( + &mut idx_data, + NeedleId(2), + Offset::from_actual_offset(128), + Size(200), + ) + .unwrap(); + idx::write_index_entry( + &mut idx_data, + NeedleId(3), + Offset::from_actual_offset(384), + Size(300), + ) + .unwrap(); + // Delete needle 2 + idx::write_index_entry( + &mut idx_data, + NeedleId(2), + Offset::default(), + TOMBSTONE_FILE_SIZE, + ) + .unwrap(); + + let mut cursor = Cursor::new(idx_data); + let nm = CompactNeedleMap::load_from_idx(&mut cursor).unwrap(); + + assert!(nm.get(NeedleId(1)).is_some()); + assert!(nm.get(NeedleId(2)).is_none()); // deleted + assert!(nm.get(NeedleId(3)).is_some()); + // Additive-only: put(1)+put(2)+put(3) = 3, delete doesn't decrement + assert_eq!(nm.file_count(), 3); + } + + #[test] + fn test_needle_map_double_delete() { + let mut nm = CompactNeedleMap::new(); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + + let r1 = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(r1, Some(Size(100))); + + // Second delete should return None (already deleted) + let r2 = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(r2, None); + assert_eq!(nm.deleted_count(), 1); // not double counted + } + + // ---- RedbNeedleMap tests ---- + + #[test] + fn test_redb_needle_map_put_get() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + + let v1 = nm.get(NeedleId(1)).unwrap(); + assert_eq!(v1.size, Size(100)); + + let v2 = nm.get(NeedleId(2)).unwrap(); + assert_eq!(v2.size, Size(200)); + + assert!(nm.get(NeedleId(99)).is_none()); + } + + #[test] + fn test_redb_needle_map_delete() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + assert_eq!(nm.file_count(), 1); + assert_eq!(nm.content_size(), 100); + + let deleted = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(deleted, Some(Size(100))); + + // Additive-only: file_count stays at 1 after delete + assert_eq!(nm.file_count(), 1); + assert_eq!(nm.deleted_count(), 1); + assert_eq!(nm.deleted_size(), 100); + + // Deleted entry should have negated size + let nv = nm.get(NeedleId(1)).unwrap(); + assert_eq!(nv.size, Size(-100)); + } + + #[test] + fn test_redb_needle_map_metrics() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + nm.put(NeedleId(3), Offset::from_actual_offset(384), Size(300)) + .unwrap(); + + assert_eq!(nm.file_count(), 3); + assert_eq!(nm.content_size(), 600); + assert_eq!(nm.max_file_key(), NeedleId(3)); + + // Update existing — additive-only: file_count increments, content_size adds + nm.put(NeedleId(2), Offset::from_actual_offset(700), Size(250)) + .unwrap(); + assert_eq!(nm.file_count(), 4); // 3 + 1 (always increments) + assert_eq!(nm.content_size(), 850); // 600 + 250 (always adds) + + // Delete — additive-only: file_count unchanged + nm.delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(nm.file_count(), 4); // unchanged + assert_eq!(nm.deleted_count(), 2); // 1 from overwrite + 1 from delete + } + + #[test] + fn test_redb_needle_map_load_from_idx() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + + let mut idx_data = Vec::new(); + idx::write_index_entry( + &mut idx_data, + NeedleId(1), + Offset::from_actual_offset(8), + Size(100), + ) + .unwrap(); + idx::write_index_entry( + &mut idx_data, + NeedleId(2), + Offset::from_actual_offset(128), + Size(200), + ) + .unwrap(); + idx::write_index_entry( + &mut idx_data, + NeedleId(3), + Offset::from_actual_offset(384), + Size(300), + ) + .unwrap(); + // Delete needle 2 + idx::write_index_entry( + &mut idx_data, + NeedleId(2), + Offset::default(), + TOMBSTONE_FILE_SIZE, + ) + .unwrap(); + + let mut cursor = Cursor::new(idx_data); + let nm = RedbNeedleMap::load_from_idx(db_path.to_str().unwrap(), &mut cursor).unwrap(); + + assert!(nm.get(NeedleId(1)).is_some()); + assert!(nm.get(NeedleId(2)).is_none()); // deleted and removed + assert!(nm.get(NeedleId(3)).is_some()); + assert_eq!(nm.file_count(), 2); + } + + #[test] + fn test_redb_needle_map_double_delete() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + + let r1 = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(r1, Some(Size(100))); + + // Second delete should return None (already deleted) + let r2 = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(r2, None); + assert_eq!(nm.deleted_count(), 1); // not double counted + } + + #[test] + fn test_redb_needle_map_ascending_visit() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + + nm.put(NeedleId(3), Offset::from_actual_offset(384), Size(300)) + .unwrap(); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + + let mut visited = Vec::new(); + nm.ascending_visit(|id, nv| { + visited.push((id, nv.size)); + Ok(()) + }) + .unwrap(); + + assert_eq!(visited.len(), 3); + assert_eq!(visited[0], (NeedleId(1), Size(100))); + assert_eq!(visited[1], (NeedleId(2), Size(200))); + assert_eq!(visited[2], (NeedleId(3), Size(300))); + } + + #[test] + fn test_redb_needle_map_save_to_idx() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let idx_path = dir.path().join("test.idx"); + + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + nm.put(NeedleId(1), Offset::from_actual_offset(8), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + nm.put(NeedleId(3), Offset::from_actual_offset(384), Size(300)) + .unwrap(); + // Delete needle 2 + nm.delete(NeedleId(2), Offset::from_actual_offset(128)) + .unwrap(); + + nm.save_to_idx(idx_path.to_str().unwrap()).unwrap(); + + // Load back with CompactNeedleMap to verify + let mut idx_file = std::fs::File::open(&idx_path).unwrap(); + let loaded = CompactNeedleMap::load_from_idx(&mut idx_file).unwrap(); + assert_eq!(loaded.file_count(), 2); // only live entries + assert!(loaded.get(NeedleId(1)).is_some()); + assert!(loaded.get(NeedleId(2)).is_none()); // deleted, not saved + assert!(loaded.get(NeedleId(3)).is_some()); + } + + #[test] + fn test_pack_unpack_needle_value() { + let nv = NeedleValue { + offset: Offset::from_actual_offset(8 * 1000), + size: Size(4096), + }; + let packed = pack_needle_value(&nv); + let unpacked = unpack_needle_value(&packed); + assert_eq!( + nv.offset.to_actual_offset(), + unpacked.offset.to_actual_offset() + ); + assert_eq!(nv.size, unpacked.size); + } + + #[test] + fn test_pack_unpack_negative_size() { + let nv = NeedleValue { + offset: Offset::from_actual_offset(8 * 500), + size: Size(-100), + }; + let packed = pack_needle_value(&nv); + let unpacked = unpack_needle_value(&packed); + assert_eq!( + nv.offset.to_actual_offset(), + unpacked.offset.to_actual_offset() + ); + assert_eq!(nv.size, unpacked.size); + } + + // ---- NeedleMap enum tests ---- + + #[test] + fn test_needle_map_enum_inmemory() { + let mut nm = NeedleMap::InMemory(CompactNeedleMap::new()); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + assert_eq!(nm.get(NeedleId(1)).unwrap().size, Size(100)); + assert_eq!(nm.file_count(), 1); + } + + #[test] + fn test_needle_map_enum_redb() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = NeedleMap::Redb(RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap()); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + assert_eq!(nm.get(NeedleId(1)).unwrap().size, Size(100)); + assert_eq!(nm.file_count(), 1); + } +} diff --git a/seaweed-volume/src/storage/needle_map/compact_map.rs b/seaweed-volume/src/storage/needle_map/compact_map.rs new file mode 100644 index 000000000..9dea94ce7 --- /dev/null +++ b/seaweed-volume/src/storage/needle_map/compact_map.rs @@ -0,0 +1,375 @@ +//! CompactMap: memory-efficient in-memory map of NeedleId -> (Offset, Size). +//! +//! Port of Go's CompactMap from weed/storage/needle_map/compact_map.go. +//! Uses segmented sorted arrays with compressed keys (u16 instead of u64) +//! to achieve ~10 bytes per entry instead of ~40-48 bytes with HashMap. +//! +//! NeedleId is split into: chunk = id / SEGMENT_CHUNK_SIZE, compact_key = id % SEGMENT_CHUNK_SIZE. +//! Each segment stores up to SEGMENT_CHUNK_SIZE entries in a sorted Vec, searched via binary search. +//! Best case (ordered inserts): O(1). Worst case: O(log n) per segment. + +use std::collections::HashMap; + +use super::NeedleValue; +use crate::storage::types::*; + +/// Maximum entries per segment. Must be <= u16::MAX (65535). +const SEGMENT_CHUNK_SIZE: u64 = 50_000; + +/// Compact key: only the low bits of NeedleId within a segment. +type CompactKey = u16; + +/// Segment chunk identifier: NeedleId / SEGMENT_CHUNK_SIZE. +type Chunk = u64; + +/// Compact entry: 10 bytes (2 + 4 + 4) vs 16 bytes for full NeedleId + NeedleValue. +#[derive(Clone, Copy)] +struct CompactEntry { + key: CompactKey, // 2 bytes + offset: [u8; OFFSET_SIZE], // 4 bytes + size: Size, // 4 bytes +} + +impl CompactEntry { + fn to_needle_value(&self) -> NeedleValue { + NeedleValue { + offset: Offset::from_bytes(&self.offset), + size: self.size, + } + } +} + +/// A sorted segment of compact entries for a given chunk. +struct Segment { + list: Vec, + chunk: Chunk, + first_key: CompactKey, + last_key: CompactKey, +} + +impl Segment { + fn new(chunk: Chunk) -> Self { + Segment { + list: Vec::new(), + chunk, + first_key: u16::MAX, + last_key: 0, + } + } + + fn compact_key(&self, id: NeedleId) -> CompactKey { + (id.0 - SEGMENT_CHUNK_SIZE * self.chunk) as CompactKey + } + + /// Binary search for a compact key. Returns (index, found). + /// If not found, index is the insertion point. + fn bsearch(&self, id: NeedleId) -> (usize, bool) { + let ck = self.compact_key(id); + + if self.list.is_empty() { + return (0, false); + } + if ck == self.first_key { + return (0, true); + } + if ck < self.first_key { + return (0, false); + } + if ck == self.last_key { + return (self.list.len() - 1, true); + } + if ck > self.last_key { + return (self.list.len(), false); + } + + let i = self.list.partition_point(|e| e.key < ck); + if i < self.list.len() && self.list[i].key == ck { + (i, true) + } else { + (i, false) + } + } + + /// Insert or update. Returns old NeedleValue if updating. + fn set(&mut self, id: NeedleId, offset: Offset, size: Size) -> Option { + let (i, found) = self.bsearch(id); + + if found { + let old = self.list[i].to_needle_value(); + let mut offset_bytes = [0u8; OFFSET_SIZE]; + offset.to_bytes(&mut offset_bytes); + self.list[i].offset = offset_bytes; + self.list[i].size = size; + return Some(old); + } + + // Insert at sorted position + let ck = self.compact_key(id); + let mut offset_bytes = [0u8; OFFSET_SIZE]; + offset.to_bytes(&mut offset_bytes); + + let entry = CompactEntry { + key: ck, + offset: offset_bytes, + size, + }; + + // Match Go panic: don't exceed segment capacity + if self.list.len() >= SEGMENT_CHUNK_SIZE as usize { + panic!( + "attempted to write more than {} entries on CompactMapSegment", + SEGMENT_CHUNK_SIZE + ); + } + + if self.list.len() == SEGMENT_CHUNK_SIZE as usize - 1 { + // Pin capacity to exact size when maxing out + let mut new_list = Vec::with_capacity(SEGMENT_CHUNK_SIZE as usize); + new_list.extend_from_slice(&self.list[..i]); + new_list.push(entry); + new_list.extend_from_slice(&self.list[i..]); + self.list = new_list; + } else { + self.list.insert(i, entry); + } + + if ck < self.first_key { + self.first_key = ck; + } + if ck > self.last_key { + self.last_key = ck; + } + + None + } + + fn get(&self, id: NeedleId) -> Option { + let (i, found) = self.bsearch(id); + if found { + Some(self.list[i].to_needle_value()) + } else { + None + } + } + + /// Mark as deleted by negating size. Returns previous size if not already deleted. + /// Matches Go behavior: checks !IsDeleted() (i.e., size >= 0). + fn delete(&mut self, id: NeedleId) -> Option { + let (i, found) = self.bsearch(id); + if found && !self.list[i].size.is_deleted() { + let old_size = self.list[i].size; + if self.list[i].size.0 == 0 { + self.list[i].size = TOMBSTONE_FILE_SIZE; + } else { + self.list[i].size = Size(-self.list[i].size.0); + } + Some(old_size) + } else { + None + } + } +} + +/// Memory-efficient map of NeedleId -> (Offset, Size). +/// Segments NeedleIds into chunks of 50,000 and stores compact 10-byte entries +/// in sorted arrays, using only 2 bytes for the key within each segment. +pub struct CompactMap { + segments: HashMap, +} + +impl CompactMap { + pub fn new() -> Self { + CompactMap { + segments: HashMap::new(), + } + } + + fn _segment_for_key(&mut self, id: NeedleId) -> &mut Segment { + let chunk = id.0 / SEGMENT_CHUNK_SIZE; + self.segments + .entry(chunk) + .or_insert_with(|| Segment::new(chunk)) + } + + /// Insert or update. Returns old NeedleValue if updating. + pub fn set(&mut self, id: NeedleId, offset: Offset, size: Size) -> Option { + let chunk = id.0 / SEGMENT_CHUNK_SIZE; + let segment = self + .segments + .entry(chunk) + .or_insert_with(|| Segment::new(chunk)); + segment.set(id, offset, size) + } + + pub fn get(&self, id: NeedleId) -> Option { + let chunk = id.0 / SEGMENT_CHUNK_SIZE; + self.segments.get(&chunk)?.get(id) + } + + /// Mark as deleted. Returns previous size if was valid. + pub fn delete(&mut self, id: NeedleId) -> Option { + let chunk = id.0 / SEGMENT_CHUNK_SIZE; + self.segments.get_mut(&chunk)?.delete(id) + } + + /// Remove entry entirely (used during idx loading). + pub fn remove(&mut self, id: NeedleId) -> Option { + let chunk = id.0 / SEGMENT_CHUNK_SIZE; + let segment = self.segments.get_mut(&chunk)?; + let (i, found) = segment.bsearch(id); + if found { + let entry = segment.list.remove(i); + // Update first/last keys + if segment.list.is_empty() { + segment.first_key = u16::MAX; + segment.last_key = 0; + } else { + segment.first_key = segment.list[0].key; + segment.last_key = segment.list[segment.list.len() - 1].key; + } + Some(entry.to_needle_value()) + } else { + None + } + } + + /// Iterate all entries in ascending NeedleId order. + pub fn ascending_visit(&self, mut f: F) -> Result<(), E> + where + F: FnMut(NeedleId, &NeedleValue) -> Result<(), E>, + { + let mut chunks: Vec = self.segments.keys().copied().collect(); + chunks.sort_unstable(); + + for chunk in chunks { + let segment = &self.segments[&chunk]; + for entry in &segment.list { + let id = NeedleId(SEGMENT_CHUNK_SIZE * segment.chunk + entry.key as u64); + let nv = entry.to_needle_value(); + f(id, &nv)?; + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn offset(v: u32) -> Offset { + Offset::from_actual_offset(v as i64 * NEEDLE_PADDING_SIZE as i64) + } + + #[test] + fn test_compact_map_basic() { + let mut m = CompactMap::new(); + + // Insert + assert!(m.set(NeedleId(1), offset(100), Size(50)).is_none()); + assert!(m.set(NeedleId(2), offset(200), Size(60)).is_none()); + + // Get + let nv = m.get(NeedleId(1)).unwrap(); + assert_eq!(nv.size, Size(50)); + + // Update returns old value + let old = m.set(NeedleId(1), offset(300), Size(70)).unwrap(); + assert_eq!(old.size, Size(50)); + + // Get updated value + let nv = m.get(NeedleId(1)).unwrap(); + assert_eq!(nv.size, Size(70)); + + // Miss + assert!(m.get(NeedleId(999)).is_none()); + } + + #[test] + fn test_compact_map_delete() { + let mut m = CompactMap::new(); + m.set(NeedleId(1), offset(100), Size(50)); + + // Delete returns old size + let old = m.delete(NeedleId(1)).unwrap(); + assert_eq!(old, Size(50)); + + // Get returns deleted (negative size) + let nv = m.get(NeedleId(1)).unwrap(); + assert!(nv.size.is_deleted()); + + // Delete again returns None (already deleted) + assert!(m.delete(NeedleId(1)).is_none()); + } + + #[test] + fn test_compact_map_zero_size_delete() { + let mut m = CompactMap::new(); + m.set(NeedleId(1), offset(100), Size(0)); + + let old = m.delete(NeedleId(1)).unwrap(); + assert_eq!(old, Size(0)); + + let nv = m.get(NeedleId(1)).unwrap(); + assert_eq!(nv.size, TOMBSTONE_FILE_SIZE); + } + + #[test] + fn test_compact_map_cross_segment() { + let mut m = CompactMap::new(); + + // Insert across multiple segments + m.set(NeedleId(1), offset(1), Size(1)); + m.set(NeedleId(50_000), offset(2), Size(2)); + m.set(NeedleId(100_000), offset(3), Size(3)); + + assert_eq!(m.get(NeedleId(1)).unwrap().size, Size(1)); + assert_eq!(m.get(NeedleId(50_000)).unwrap().size, Size(2)); + assert_eq!(m.get(NeedleId(100_000)).unwrap().size, Size(3)); + } + + #[test] + fn test_compact_map_ascending_visit() { + let mut m = CompactMap::new(); + m.set(NeedleId(100_005), offset(3), Size(3)); + m.set(NeedleId(5), offset(1), Size(1)); + m.set(NeedleId(50_005), offset(2), Size(2)); + + let mut visited = Vec::new(); + m.ascending_visit(|id, nv| { + visited.push((id, nv.size)); + Ok::<_, String>(()) + }) + .unwrap(); + + assert_eq!(visited.len(), 3); + assert_eq!(visited[0].0, NeedleId(5)); + assert_eq!(visited[1].0, NeedleId(50_005)); + assert_eq!(visited[2].0, NeedleId(100_005)); + } + + #[test] + fn test_compact_map_remove() { + let mut m = CompactMap::new(); + m.set(NeedleId(1), offset(100), Size(50)); + m.set(NeedleId(2), offset(200), Size(60)); + + let removed = m.remove(NeedleId(1)).unwrap(); + assert_eq!(removed.size, Size(50)); + + assert!(m.get(NeedleId(1)).is_none()); + assert_eq!(m.get(NeedleId(2)).unwrap().size, Size(60)); + } + + #[test] + fn test_compact_map_reverse_insert_order() { + let mut m = CompactMap::new(); + // Insert in reverse order to test sorted insert + for i in (0..100).rev() { + m.set(NeedleId(i), offset(i as u32), Size(i as i32)); + } + for i in 0..100 { + assert_eq!(m.get(NeedleId(i)).unwrap().size, Size(i as i32)); + } + } +} diff --git a/seaweed-volume/src/storage/store.rs b/seaweed-volume/src/storage/store.rs new file mode 100644 index 000000000..98ffd3d04 --- /dev/null +++ b/seaweed-volume/src/storage/store.rs @@ -0,0 +1,1297 @@ +//! Store: the top-level storage manager for a volume server. +//! +//! A Store manages multiple DiskLocations (one per configured directory). +//! It coordinates volume placement, lookup, and lifecycle operations. +//! Matches Go's storage/store.go. + +use std::io; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; + +use crate::config::MinFreeSpace; +use crate::pb::master_pb; +use crate::storage::disk_location::DiskLocation; +use crate::storage::erasure_coding::ec_shard::{EcVolumeShard, MAX_SHARD_COUNT}; +use crate::storage::erasure_coding::ec_volume::EcVolume; +use crate::storage::needle::needle::Needle; +use crate::storage::needle_map::NeedleMapKind; +use crate::storage::super_block::ReplicaPlacement; +use crate::storage::types::*; +use crate::storage::volume::{VifVolumeInfo, VolumeError}; + +/// Top-level storage manager containing all disk locations and their volumes. +pub struct Store { + pub locations: Vec, + pub needle_map_kind: NeedleMapKind, + preallocate: AtomicBool, + pub volume_size_limit: AtomicU64, + pub id: String, + pub ip: String, + pub port: u16, + pub grpc_port: u16, + pub public_url: String, + pub data_center: String, + pub rack: String, +} + +impl Store { + pub fn new(needle_map_kind: NeedleMapKind) -> Self { + Store { + locations: Vec::new(), + needle_map_kind, + preallocate: AtomicBool::new(false), + volume_size_limit: AtomicU64::new(0), + id: String::new(), + ip: String::new(), + port: 0, + grpc_port: 0, + public_url: String::new(), + data_center: String::new(), + rack: String::new(), + } + } + + /// Add a disk location and load existing volumes from it. + pub fn add_location( + &mut self, + directory: &str, + idx_directory: &str, + max_volume_count: i32, + disk_type: DiskType, + min_free_space: MinFreeSpace, + tags: Vec, + ) -> io::Result<()> { + let mut loc = DiskLocation::new( + directory, + idx_directory, + max_volume_count, + disk_type, + min_free_space, + tags, + )?; + loc.load_existing_volumes(self.needle_map_kind)?; + + // Check for duplicate volume IDs across existing locations + for vid in loc.volume_ids() { + if self.find_volume(vid).is_some() { + return Err(io::Error::new( + io::ErrorKind::AlreadyExists, + format!( + "volume {} already exists in another location, conflicting dir: {}", + vid, directory + ), + )); + } + } + + self.locations.push(loc); + Ok(()) + } + + /// Scan disk locations for new volume files and load them. + /// Mirrors Go's `Store.LoadNewVolumes()`. + pub fn load_new_volumes(&mut self) { + for loc in &mut self.locations { + if let Err(e) = loc.load_existing_volumes(self.needle_map_kind) { + tracing::error!("load_new_volumes error in {}: {}", loc.directory, e); + } + } + } + + // ---- Volume lookup ---- + + /// Find which location contains a volume. + pub fn find_volume(&self, vid: VolumeId) -> Option<(usize, &crate::storage::volume::Volume)> { + for (i, loc) in self.locations.iter().enumerate() { + if let Some(v) = loc.find_volume(vid) { + return Some((i, v)); + } + } + None + } + + /// Find which location contains a volume (mutable). + pub fn find_volume_mut( + &mut self, + vid: VolumeId, + ) -> Option<(usize, &mut crate::storage::volume::Volume)> { + for (i, loc) in self.locations.iter_mut().enumerate() { + if let Some(v) = loc.find_volume_mut(vid) { + return Some((i, v)); + } + } + None + } + + /// Check if a volume exists. + pub fn has_volume(&self, vid: VolumeId) -> bool { + self.find_volume(vid).is_some() + } + + // ---- Volume lifecycle ---- + + /// Find the location with fewest volumes (load-balance) of the given disk type. + /// Matches Go's FindFreeLocation: accounts for EC shards when computing free slots. + fn find_free_location(&self, disk_type: &DiskType) -> Option { + use crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT; + + let mut best: Option<(usize, i64)> = None; // (index, effective_free) + for (i, loc) in self.locations.iter().enumerate() { + if &loc.disk_type != disk_type { + continue; + } + // Go treats MaxVolumeCount == 0 as unlimited (hasFreeDiskLocation) + let max = loc.max_volume_count.load(Ordering::Relaxed) as i64; + let effective_free = if max == 0 { + i64::MAX // unlimited + } else { + // Go formula: currentFreeCount = (MaxVolumeCount - VolumesLen()) * DataShardsCount - EcShardCount() + // currentFreeCount /= DataShardsCount + let free_count = (max - loc.volumes_len() as i64) * DATA_SHARDS_COUNT as i64 + - loc.ec_shard_count() as i64; + free_count / DATA_SHARDS_COUNT as i64 + }; + if effective_free <= 0 { + continue; + } + if loc.is_disk_space_low.load(Ordering::Relaxed) { + continue; + } + if best.is_none() || effective_free > best.unwrap().1 { + best = Some((i, effective_free)); + } + } + best.map(|(i, _)| i) + } + + /// Find a free location matching a predicate. + /// Matches Go's Store.FindFreeLocation: picks the matching location with the + /// most remaining volume capacity, while skipping low-disk locations. + pub fn find_free_location_predicate(&self, pred: F) -> Option + where + F: Fn(&DiskLocation) -> bool, + { + use crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT; + + let mut best: Option<(usize, i64)> = None; + for (i, loc) in self.locations.iter().enumerate() { + if !pred(loc) || loc.is_disk_space_low.load(Ordering::Relaxed) { + continue; + } + + let max = loc.max_volume_count.load(Ordering::Relaxed) as i64; + let effective_free = if max == 0 { + i64::MAX + } else { + let free_count = (max - loc.volumes_len() as i64) * DATA_SHARDS_COUNT as i64 + - loc.ec_shard_count() as i64; + free_count / DATA_SHARDS_COUNT as i64 + }; + if effective_free <= 0 { + continue; + } + + if best.is_none() || effective_free > best.unwrap().1 { + best = Some((i, effective_free)); + } + } + best.map(|(i, _)| i) + } + + /// Create a new volume, placing it on the location with the most free space. + pub fn add_volume( + &mut self, + vid: VolumeId, + collection: &str, + replica_placement: Option, + ttl: Option, + preallocate: u64, + disk_type: DiskType, + version: Version, + ) -> Result<(), VolumeError> { + if self.find_volume(vid).is_some() { + return Err(VolumeError::AlreadyExists); + } + let loc_idx = self.find_free_location(&disk_type).ok_or_else(|| { + VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!("no free location for disk type {:?}", disk_type), + )) + })?; + + self.locations[loc_idx].create_volume( + vid, + collection, + self.needle_map_kind, + replica_placement, + ttl, + preallocate, + version, + ) + } + + /// Delete a volume from any location. + pub fn delete_volume(&mut self, vid: VolumeId, only_empty: bool) -> Result<(), VolumeError> { + for loc in &mut self.locations { + if loc.find_volume(vid).is_some() { + return loc.delete_volume(vid, only_empty); + } + } + Err(VolumeError::NotFound) + } + + /// Unload (unmount) a volume without deleting its files. + pub fn unmount_volume(&mut self, vid: VolumeId) -> bool { + for loc in &mut self.locations { + if loc.unload_volume(vid).is_some() { + return true; + } + } + false + } + + /// Mount a volume from an existing .dat file. + pub fn mount_volume( + &mut self, + vid: VolumeId, + collection: &str, + disk_type: DiskType, + ) -> Result<(), VolumeError> { + if self.find_volume(vid).is_some() { + return Err(VolumeError::AlreadyExists); + } + // Find the location where the .dat file exists + for loc in &mut self.locations { + if &loc.disk_type != &disk_type { + continue; + } + let base = crate::storage::volume::volume_file_name(&loc.directory, collection, vid); + let dat_path = format!("{}.dat", base); + let vif_path = format!("{}.vif", base); + if std::path::Path::new(&dat_path).exists() || std::path::Path::new(&vif_path).exists() + { + return loc.create_volume( + vid, + collection, + self.needle_map_kind, + None, + None, + 0, + Version::current(), + ); + } + } + Err(VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("volume {} not found on disk", vid), + ))) + } + + /// Mount a volume by id only (Go's MountVolume behavior). + /// Scans all locations for a matching .dat file and loads with its collection prefix. + pub fn mount_volume_by_id(&mut self, vid: VolumeId) -> Result<(), VolumeError> { + if self.find_volume(vid).is_some() { + return Err(VolumeError::AlreadyExists); + } + if let Some((loc_idx, _base_path, collection)) = self.find_volume_file_base(vid) { + let loc = &mut self.locations[loc_idx]; + return loc.create_volume( + vid, + &collection, + self.needle_map_kind, + None, + None, + 0, + Version::current(), + ); + } + Err(VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("volume {} not found on disk", vid), + ))) + } + + fn find_volume_file_base(&self, vid: VolumeId) -> Option<(usize, String, String)> { + for (loc_idx, loc) in self.locations.iter().enumerate() { + if let Ok(entries) = std::fs::read_dir(&loc.directory) { + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if let Some((collection, file_vid)) = parse_volume_filename(&name) { + if file_vid == vid { + let base = strip_volume_suffix(&name)?; + let base_path = format!("{}/{}", loc.directory, base); + return Some((loc_idx, base_path, collection)); + } + } + } + } + } + None + } + + /// Configure a volume's replica placement on disk. + /// The volume must already be unmounted. This opens the .dat file directly, + /// modifies the replica_placement byte (offset 1), and writes it back. + pub fn configure_volume(&self, vid: VolumeId, rp: ReplicaPlacement) -> Result<(), VolumeError> { + let (_, base_path, _) = self.find_volume_file_base(vid).ok_or_else(|| { + VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("volume {} not found on disk", vid), + )) + })?; + let vif_path = format!("{}.vif", base_path); + let mut vif = load_vif_volume_info(&vif_path)?; + vif.replication = rp.to_string(); + save_vif_volume_info(&vif_path, &vif)?; + Ok(()) + } + + // ---- Read / Write / Delete ---- + + /// Read a needle from a volume. + pub fn read_volume_needle(&self, vid: VolumeId, n: &mut Needle) -> Result { + let (_, vol) = self.find_volume(vid).ok_or(VolumeError::NotFound)?; + vol.read_needle(n) + } + + /// Read a needle from a volume, optionally reading deleted needles. + pub fn read_volume_needle_opt( + &self, + vid: VolumeId, + n: &mut Needle, + read_deleted: bool, + ) -> Result { + let (_, vol) = self.find_volume(vid).ok_or(VolumeError::NotFound)?; + vol.read_needle_opt(n, read_deleted) + } + + /// Read needle metadata and return streaming info for large file reads. + pub fn read_volume_needle_stream_info( + &self, + vid: VolumeId, + n: &mut Needle, + read_deleted: bool, + ) -> Result { + let (_, vol) = self.find_volume(vid).ok_or(VolumeError::NotFound)?; + vol.read_needle_stream_info(n, read_deleted) + } + + /// Re-lookup a needle's data-file offset after compaction may have moved it. + /// Returns `(new_data_file_offset, current_compaction_revision)`. + pub fn re_lookup_needle_data_offset( + &self, + vid: VolumeId, + needle_id: NeedleId, + ) -> Result<(u64, u16), VolumeError> { + let (_, vol) = self.find_volume(vid).ok_or(VolumeError::NotFound)?; + vol.re_lookup_needle_data_offset(needle_id) + } + + /// Write a needle to a volume. + pub fn write_volume_needle( + &mut self, + vid: VolumeId, + n: &mut Needle, + ) -> Result<(u64, Size, bool), VolumeError> { + // Check disk space on the location containing this volume. + // We do this before the mutable borrow to avoid borrow conflicts. + let loc_idx = self + .find_volume(vid) + .map(|(i, _)| i) + .ok_or(VolumeError::NotFound)?; + if self.locations[loc_idx] + .is_disk_space_low + .load(Ordering::Relaxed) + { + return Err(VolumeError::ReadOnly); + } + + let (_, vol) = self.find_volume_mut(vid).ok_or(VolumeError::NotFound)?; + vol.write_needle(n, true) + } + + /// Delete a needle from a volume. + pub fn delete_volume_needle( + &mut self, + vid: VolumeId, + n: &mut Needle, + ) -> Result { + // Match Go's DeleteVolumeNeedle: check noWriteOrDelete before proceeding. + let (_, vol) = self.find_volume(vid).ok_or(VolumeError::NotFound)?; + if vol.is_no_write_or_delete() { + return Err(VolumeError::ReadOnly); + } + + let (_, vol) = self.find_volume_mut(vid).ok_or(VolumeError::NotFound)?; + vol.delete_needle(n) + } + + // ---- Collection operations ---- + + /// Delete all volumes in a collection. + pub fn delete_collection(&mut self, collection: &str) -> Result<(), String> { + for loc in &mut self.locations { + loc.delete_collection(collection) + .map_err(|e| format!("delete collection {}: {}", collection, e))?; + } + crate::metrics::delete_collection_metrics(collection); + Ok(()) + } + + // ---- Metrics ---- + + /// Total volume count across all locations. + pub fn total_volume_count(&self) -> usize { + self.locations.iter().map(|loc| loc.volumes_len()).sum() + } + + pub fn set_preallocate(&self, preallocate: bool) { + self.preallocate.store(preallocate, Ordering::Relaxed); + } + + pub fn get_preallocate(&self) -> bool { + self.preallocate.load(Ordering::Relaxed) + } + + /// Total max volumes across all locations. + pub fn max_volume_count(&self) -> i32 { + self.locations + .iter() + .map(|loc| loc.max_volume_count.load(Ordering::Relaxed)) + .sum() + } + + /// Total EC shard count across all locations. + pub fn ec_shard_count(&self) -> usize { + self.locations.iter().map(|loc| loc.ec_shard_count()).sum() + } + + /// Recalculate max volume counts for locations with original_max_volume_count == 0. + /// Returns true if any max changed (caller should re-send heartbeat). + pub fn maybe_adjust_volume_max(&self) -> bool { + let volume_size_limit = self.volume_size_limit.load(Ordering::Relaxed); + if volume_size_limit == 0 { + return false; + } + + let mut has_changes = false; + let mut new_max_total: i32 = 0; + + for loc in &self.locations { + if loc.original_max_volume_count == 0 { + let current = loc.max_volume_count.load(Ordering::Relaxed); + let (_, free) = super::disk_location::get_disk_stats(&loc.directory); + + let unused_space = if self.get_preallocate() { + 0 + } else { + loc.unused_space(volume_size_limit) + }; + let unclaimed = (free as i64) - (unused_space as i64); + + let vol_count = loc.volumes_len() as i32; + let loc_ec_shards = loc.ec_shard_count(); + let ec_equivalent = ((loc_ec_shards + + crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT) + / crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT) + as i32; + let mut max_count = vol_count + ec_equivalent; + + if unclaimed > volume_size_limit as i64 { + max_count += (unclaimed as u64 / volume_size_limit) as i32 - 1; + } + + loc.max_volume_count.store(max_count, Ordering::Relaxed); + new_max_total += max_count; + has_changes = has_changes || current != max_count; + } else { + new_max_total += loc.original_max_volume_count; + } + } + + crate::metrics::MAX_VOLUMES.set(new_max_total as i64); + has_changes + } + + /// Free volume slots across all locations. + pub fn free_volume_count(&self) -> i32 { + self.locations + .iter() + .map(|loc| loc.free_volume_count()) + .sum() + } + + /// All volume IDs across all locations. + pub fn all_volume_ids(&self) -> Vec { + let mut ids: Vec = self + .locations + .iter() + .flat_map(|loc| loc.volume_ids()) + .collect(); + ids.sort(); + ids.dedup(); + ids + } + + // ---- EC volume operations ---- + + /// Mount EC shards for a volume (batch). + pub fn mount_ec_shards( + &mut self, + vid: VolumeId, + collection: &str, + shard_ids: &[u32], + ) -> Result<(), VolumeError> { + // Find the location where the EC files live + let loc_idx = self.find_ec_location(vid, collection).ok_or_else(|| { + VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("ec volume {} shards not found on disk", vid), + )) + })?; + + self.locations[loc_idx].mount_ec_shards(vid, collection, shard_ids) + } + + /// Mount a single EC shard, searching all locations for the shard file. + /// Matches Go's Store.MountEcShards which mounts one shard at a time. + pub fn mount_ec_shard( + &mut self, + vid: VolumeId, + collection: &str, + shard_id: u32, + ) -> Result<(), VolumeError> { + for loc in &mut self.locations { + // Check if the shard file exists on this location + let shard = EcVolumeShard::new(&loc.directory, collection, vid, shard_id as u8); + if std::path::Path::new(&shard.file_name()).exists() { + loc.mount_ec_shards(vid, collection, &[shard_id])?; + return Ok(()); + } + } + Err(VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("MountEcShards {}.{} not found on disk", vid, shard_id), + ))) + } + + /// Unmount EC shards for a volume (batch). + pub fn unmount_ec_shards(&mut self, vid: VolumeId, shard_ids: &[u32]) { + for loc in &mut self.locations { + if loc.has_ec_volume(vid) { + loc.unmount_ec_shards(vid, shard_ids); + return; + } + } + } + + /// Unmount a single EC shard, searching all locations. + /// Matches Go's Store.UnmountEcShards which unmounts one shard at a time. + pub fn unmount_ec_shard(&mut self, vid: VolumeId, shard_id: u32) -> Result<(), VolumeError> { + for loc in &mut self.locations { + if loc.has_ec_volume(vid) { + loc.unmount_ec_shards(vid, &[shard_id]); + return Ok(()); + } + } + // Go returns nil if shard not found (no error) + Ok(()) + } + + /// Find an EC volume across all locations. + pub fn find_ec_volume(&self, vid: VolumeId) -> Option<&EcVolume> { + for loc in &self.locations { + if let Some(ecv) = loc.find_ec_volume(vid) { + return Some(ecv); + } + } + None + } + + /// Find an EC volume across all locations (mutable). + pub fn find_ec_volume_mut(&mut self, vid: VolumeId) -> Option<&mut EcVolume> { + for loc in &mut self.locations { + if let Some(ecv) = loc.find_ec_volume_mut(vid) { + return Some(ecv); + } + } + None + } + + /// Check if any location has an EC volume. + pub fn has_ec_volume(&self, vid: VolumeId) -> bool { + self.locations.iter().any(|loc| loc.has_ec_volume(vid)) + } + + pub fn delete_expired_ec_volumes( + &mut self, + ) -> ( + Vec, + Vec, + ) { + let mut ec_shards = Vec::new(); + let mut deleted = Vec::new(); + + for (disk_id, loc) in self.locations.iter_mut().enumerate() { + let mut expired_vids = Vec::new(); + for (vid, ec_vol) in loc.ec_volumes() { + if ec_vol.is_time_to_destroy() { + expired_vids.push(*vid); + } else { + ec_shards + .extend(ec_vol.to_volume_ec_shard_information_messages(disk_id as u32)); + } + } + + for vid in expired_vids { + let messages = loc + .find_ec_volume(vid) + .map(|ec_vol| ec_vol.to_volume_ec_shard_information_messages(disk_id as u32)) + .unwrap_or_default(); + if let Some(mut ec_vol) = loc.remove_ec_volume(vid) { + for _ in 0..ec_vol.shard_count() { + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&ec_vol.collection, "ec_shards"]) + .dec(); + } + ec_vol.destroy(); + deleted.extend(messages); + } else { + ec_shards.extend(messages); + } + } + } + + (ec_shards, deleted) + } + + /// Remove an EC volume from whichever location has it. + pub fn remove_ec_volume(&mut self, vid: VolumeId) -> Option { + for loc in &mut self.locations { + if let Some(ecv) = loc.remove_ec_volume(vid) { + return Some(ecv); + } + } + None + } + + /// Find the location index containing EC files for a volume. + pub fn find_ec_location(&self, vid: VolumeId, collection: &str) -> Option { + for (i, loc) in self.locations.iter().enumerate() { + let base = crate::storage::volume::volume_file_name(&loc.directory, collection, vid); + let ecx_path = format!("{}.ecx", base); + if std::path::Path::new(&ecx_path).exists() { + return Some(i); + } + } + None + } + + /// Delete EC shard files from disk. + pub fn delete_ec_shards(&mut self, vid: VolumeId, collection: &str, shard_ids: &[u32]) { + // Delete shard files from disk + for loc in &self.locations { + for &shard_id in shard_ids { + let shard = EcVolumeShard::new(&loc.directory, collection, vid, shard_id as u8); + let path = shard.file_name(); + let _ = std::fs::remove_file(&path); + } + } + + // Also unmount if mounted + self.unmount_ec_shards(vid, shard_ids); + + // If all shards are gone, remove .ecx and .ecj files from both idx and data dirs + let all_gone = self.check_all_ec_shards_deleted(vid, collection); + if all_gone { + for loc in &self.locations { + let idx_base = + crate::storage::volume::volume_file_name(&loc.idx_directory, collection, vid); + let _ = std::fs::remove_file(format!("{}.ecx", idx_base)); + let _ = std::fs::remove_file(format!("{}.ecj", idx_base)); + // Also try data directory in case .ecx/.ecj were created before -dir.idx + if loc.idx_directory != loc.directory { + let data_base = + crate::storage::volume::volume_file_name(&loc.directory, collection, vid); + let _ = std::fs::remove_file(format!("{}.ecx", data_base)); + let _ = std::fs::remove_file(format!("{}.ecj", data_base)); + } + } + } + } + + /// Check if all EC shard files have been deleted for a volume. + /// Uses MAX_SHARD_COUNT to support non-standard EC configurations. + fn check_all_ec_shards_deleted(&self, vid: VolumeId, collection: &str) -> bool { + for loc in &self.locations { + for shard_id in 0..MAX_SHARD_COUNT as u8 { + let shard = EcVolumeShard::new(&loc.directory, collection, vid, shard_id); + if std::path::Path::new(&shard.file_name()).exists() { + return false; + } + } + } + true + } + + /// Find the directory containing EC files for a volume. + pub fn find_ec_dir(&self, vid: VolumeId, collection: &str) -> Option { + for loc in &self.locations { + // Check idx directory first + let idx_base = + crate::storage::volume::volume_file_name(&loc.idx_directory, collection, vid); + if std::path::Path::new(&format!("{}.ecx", idx_base)).exists() { + return Some(loc.directory.clone()); + } + // Fall back to data directory if .ecx was created before -dir.idx was configured + if loc.idx_directory != loc.directory { + let data_base = + crate::storage::volume::volume_file_name(&loc.directory, collection, vid); + if std::path::Path::new(&format!("{}.ecx", data_base)).exists() { + return Some(loc.directory.clone()); + } + } + } + None + } + + /// Find the directory containing a specific EC shard file. + pub fn find_ec_shard_dir( + &self, + vid: VolumeId, + collection: &str, + shard_id: u8, + ) -> Option { + for loc in &self.locations { + let shard = EcVolumeShard::new(&loc.directory, collection, vid, shard_id); + if std::path::Path::new(&shard.file_name()).exists() { + return Some(loc.directory.clone()); + } + } + None + } + + // ---- Vacuum / Compaction ---- + + /// Check the garbage level of a volume. + pub fn check_compact_volume(&self, vid: VolumeId) -> Result { + if let Some((_, v)) = self.find_volume(vid) { + Ok(v.garbage_level()) + } else { + Err(format!( + "volume id {} is not found during check compact", + vid.0 + )) + } + } + + /// Compact a volume by rewriting only live needles. + pub fn compact_volume( + &mut self, + vid: VolumeId, + preallocate: u64, + max_bytes_per_second: i64, + progress_fn: F, + ) -> Result<(), String> + where + F: Fn(i64) -> bool, + { + let loc_idx = self + .find_volume(vid) + .map(|(i, _)| i) + .ok_or_else(|| format!("volume id {} is not found during compact", vid.0))?; + + let dir = self.locations[loc_idx].directory.clone(); + let (_, free) = crate::storage::disk_location::get_disk_stats(&dir); + + // Compute required space: use the larger of preallocate or estimated volume size + // matching Go's CompactVolume space check + let space_needed = { + let (_, v) = self.find_volume(vid).unwrap(); + let estimated = v.dat_file_size().unwrap_or(0) + v.idx_file_size(); + std::cmp::max(preallocate, estimated) + }; + + if free < space_needed { + return Err(format!( + "not enough free space to compact volume {}. Required: {}, Free: {}", + vid.0, space_needed, free + )); + } + + if let Some((_, v)) = self.find_volume_mut(vid) { + v.compact_by_index(preallocate, max_bytes_per_second, progress_fn) + .map_err(|e| format!("compact volume {}: {}", vid.0, e)) + } else { + Err(format!("volume id {} is not found during compact", vid.0)) + } + } + + /// Commit a completed compaction: swap files and reload. + pub fn commit_compact_volume(&mut self, vid: VolumeId) -> Result<(bool, u64), String> { + if let Some((_, v)) = self.find_volume_mut(vid) { + let is_read_only = v.is_read_only(); + v.commit_compact() + .map_err(|e| format!("commit compact volume {}: {}", vid.0, e))?; + let volume_size = v.dat_file_size().unwrap_or(0); + Ok((is_read_only, volume_size)) + } else { + Err(format!( + "volume id {} is not found during commit compact", + vid.0 + )) + } + } + + /// Clean up leftover compaction files. + pub fn cleanup_compact_volume(&mut self, vid: VolumeId) -> Result<(), String> { + if let Some((_, v)) = self.find_volume_mut(vid) { + v.cleanup_compact() + .map_err(|e| format!("cleanup volume {}: {}", vid.0, e)) + } else { + Err(format!( + "volume id {} is not found during cleaning up", + vid.0 + )) + } + } + + /// Close all locations and their volumes. + pub fn close(&mut self) { + for loc in &mut self.locations { + loc.close(); + } + } +} + +/// Parse a volume filename like "collection_42.dat" or "42.dat" into (collection, VolumeId). +fn parse_volume_filename(filename: &str) -> Option<(String, VolumeId)> { + let stem = strip_volume_suffix(filename)?; + if let Some(pos) = stem.rfind('_') { + let collection = &stem[..pos]; + let id_str = &stem[pos + 1..]; + let id: u32 = id_str.parse().ok()?; + Some((collection.to_string(), VolumeId(id))) + } else { + let id: u32 = stem.parse().ok()?; + Some((String::new(), VolumeId(id))) + } +} + +fn strip_volume_suffix(filename: &str) -> Option<&str> { + filename + .strip_suffix(".dat") + .or_else(|| filename.strip_suffix(".vif")) + .or_else(|| filename.strip_suffix(".idx")) +} + +fn load_vif_volume_info(path: &str) -> Result { + let content = match std::fs::read_to_string(path) { + Ok(c) => c, + Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(VifVolumeInfo::default()), + Err(e) => return Err(VolumeError::Io(e)), + }; + if content.trim().is_empty() { + return Ok(VifVolumeInfo::default()); + } + if let Ok(vif) = serde_json::from_str::(&content) { + return Ok(vif); + } + #[derive(serde::Deserialize)] + struct LegacyVolumeInfo { + read_only: bool, + } + if let Ok(legacy) = serde_json::from_str::(&content) { + let mut vif = VifVolumeInfo::default(); + vif.read_only = legacy.read_only; + return Ok(vif); + } + Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!("invalid volume info file {}", path), + ))) +} + +fn save_vif_volume_info(path: &str, info: &VifVolumeInfo) -> Result<(), VolumeError> { + let content = serde_json::to_string_pretty(info) + .map_err(|e| VolumeError::Io(io::Error::new(io::ErrorKind::Other, e.to_string())))?; + std::fs::write(path, content)?; + Ok(()) +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::needle::needle::Needle; + use tempfile::TempDir; + + fn make_test_store(dirs: &[&str]) -> Store { + let mut store = Store::new(NeedleMapKind::InMemory); + for dir in dirs { + store + .add_location( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + } + store + } + + #[test] + fn test_store_add_location() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + assert_eq!(store.locations.len(), 1); + assert_eq!(store.max_volume_count(), 10); + } + + #[test] + fn test_store_add_volume() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut store = make_test_store(&[dir]); + + store + .add_volume( + VolumeId(1), + "", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + assert!(store.has_volume(VolumeId(1))); + assert!(!store.has_volume(VolumeId(2))); + assert_eq!(store.total_volume_count(), 1); + } + + #[test] + fn test_store_read_write_delete() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut store = make_test_store(&[dir]); + store + .add_volume( + VolumeId(1), + "", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + + // Write + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0xaa), + data: b"hello store".to_vec(), + data_size: 11, + ..Needle::default() + }; + let (offset, _size, unchanged) = store.write_volume_needle(VolumeId(1), &mut n).unwrap(); + assert!(!unchanged); + assert!(offset > 0); + + // Read + let mut read_n = Needle { + id: NeedleId(1), + ..Needle::default() + }; + let count = store.read_volume_needle(VolumeId(1), &mut read_n).unwrap(); + assert_eq!(count, 11); + assert_eq!(read_n.data, b"hello store"); + + // Delete + let mut del_n = Needle { + id: NeedleId(1), + cookie: Cookie(0xaa), + ..Needle::default() + }; + let deleted = store.delete_volume_needle(VolumeId(1), &mut del_n).unwrap(); + assert!(deleted.0 > 0); + } + + #[test] + fn test_store_multi_location() { + let tmp1 = TempDir::new().unwrap(); + let tmp2 = TempDir::new().unwrap(); + let dir1 = tmp1.path().to_str().unwrap(); + let dir2 = tmp2.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir1, + dir1, + 5, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store + .add_location( + dir2, + dir2, + 5, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + assert_eq!(store.max_volume_count(), 10); + + // Add volumes — should go to location with fewest volumes + store + .add_volume( + VolumeId(1), + "", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + store + .add_volume( + VolumeId(2), + "", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + + assert_eq!(store.total_volume_count(), 2); + // Both locations should have 1 volume each (load-balanced) + assert_eq!(store.locations[0].volumes_len(), 1); + assert_eq!(store.locations[1].volumes_len(), 1); + } + + #[test] + fn test_store_delete_collection() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut store = make_test_store(&[dir]); + + store + .add_volume( + VolumeId(1), + "pics", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + store + .add_volume( + VolumeId(2), + "pics", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + store + .add_volume( + VolumeId(3), + "docs", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + assert_eq!(store.total_volume_count(), 3); + + store.delete_collection("pics").unwrap(); + assert_eq!(store.total_volume_count(), 1); + assert!(store.has_volume(VolumeId(3))); + } + + #[test] + fn test_maybe_adjust_volume_max_honors_preallocate_flag() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 2, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store.volume_size_limit.store(1024, Ordering::Relaxed); + store + .add_volume( + VolumeId(61), + "preallocate_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + store + .add_volume( + VolumeId(62), + "preallocate_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + for vid in [VolumeId(61), VolumeId(62)] { + let dat_path = store.find_volume(vid).unwrap().1.dat_path(); + std::fs::OpenOptions::new() + .write(true) + .open(dat_path) + .unwrap() + .set_len((crate::storage::super_block::SUPER_BLOCK_SIZE + 1) as u64) + .unwrap(); + } + store.locations[0].original_max_volume_count = 0; + store.locations[0] + .max_volume_count + .store(0, Ordering::Relaxed); + + store.set_preallocate(false); + assert!(store.maybe_adjust_volume_max()); + let without_preallocate = store.locations[0].max_volume_count.load(Ordering::Relaxed); + + store.set_preallocate(true); + assert!(store.maybe_adjust_volume_max()); + let with_preallocate = store.locations[0].max_volume_count.load(Ordering::Relaxed); + + assert!(with_preallocate > without_preallocate); + } + + #[test] + fn test_find_free_location_predicate_prefers_more_capacity_and_skips_low_disk() { + let tmp1 = TempDir::new().unwrap(); + let dir1 = tmp1.path().to_str().unwrap(); + let tmp2 = TempDir::new().unwrap(); + let dir2 = tmp2.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir1, + dir1, + 3, + DiskType::HardDrive, + MinFreeSpace::Percent(0.0), + Vec::new(), + ) + .unwrap(); + store + .add_location( + dir2, + dir2, + 5, + DiskType::HardDrive, + MinFreeSpace::Percent(0.0), + Vec::new(), + ) + .unwrap(); + + store + .add_volume( + VolumeId(71), + "find_free_location_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + + let selected = + store.find_free_location_predicate(|loc| loc.disk_type == DiskType::HardDrive); + assert_eq!(selected, Some(1)); + + store.locations[1] + .is_disk_space_low + .store(true, Ordering::Relaxed); + + let selected = + store.find_free_location_predicate(|loc| loc.disk_type == DiskType::HardDrive); + assert_eq!(selected, Some(0)); + } + + #[test] + fn test_delete_expired_ec_volumes_removes_expired_entries() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut store = make_test_store(&[dir]); + + std::fs::write(format!("{}/expired_ec_case_9.ec00", dir), b"expired").unwrap(); + store.locations[0] + .mount_ec_shards(VolumeId(9), "expired_ec_case", &[0]) + .unwrap(); + store.find_ec_volume_mut(VolumeId(9)).unwrap().expire_at_sec = 1; + + let (ec_shards, deleted) = store.delete_expired_ec_volumes(); + + assert!(ec_shards.is_empty()); + assert_eq!(deleted.len(), 1); + assert_eq!(deleted[0].id, 9); + assert!(!store.has_ec_volume(VolumeId(9))); + assert!(!std::path::Path::new(&format!("{}/expired_ec_case_9.ec00", dir)).exists()); + } + + #[test] + fn test_store_volume_not_found() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let store = make_test_store(&[dir]); + + let mut n = Needle { + id: NeedleId(1), + ..Needle::default() + }; + let err = store.read_volume_needle(VolumeId(99), &mut n); + assert!(matches!(err, Err(VolumeError::NotFound))); + } +} diff --git a/seaweed-volume/src/storage/super_block.rs b/seaweed-volume/src/storage/super_block.rs new file mode 100644 index 000000000..033d1a929 --- /dev/null +++ b/seaweed-volume/src/storage/super_block.rs @@ -0,0 +1,289 @@ +//! SuperBlock: the 8-byte (+ optional extra) header at the start of every .dat file. +//! +//! Byte layout: +//! [0] Version +//! [1] ReplicaPlacement byte +//! [2..4] TTL (2 bytes) +//! [4..6] CompactionRevision (u16 big-endian) +//! [6..8] ExtraSize (u16 big-endian) +//! [8..] Extra data (protobuf, ExtraSize bytes) — only for Version 2/3 + +use crate::storage::needle::ttl::TTL; +use crate::storage::types::Version; + +pub const SUPER_BLOCK_SIZE: usize = 8; + +/// SuperBlock metadata at the start of a volume .dat file. +#[derive(Debug, Clone)] +pub struct SuperBlock { + pub version: Version, + pub replica_placement: ReplicaPlacement, + pub ttl: TTL, + pub compaction_revision: u16, + pub extra_size: u16, + pub extra_data: Vec, // raw protobuf bytes (SuperBlockExtra) +} + +impl SuperBlock { + /// Total block size on disk (base 8 + extra). + pub fn block_size(&self) -> usize { + match self.version.0 { + 2 | 3 => SUPER_BLOCK_SIZE + self.extra_size as usize, + _ => SUPER_BLOCK_SIZE, + } + } + + /// Serialize to bytes. + pub fn to_bytes(&self) -> Vec { + let mut header = vec![0u8; SUPER_BLOCK_SIZE]; + header[0] = self.version.0; + header[1] = self.replica_placement.to_byte(); + self.ttl.to_bytes(&mut header[2..4]); + header[4..6].copy_from_slice(&self.compaction_revision.to_be_bytes()); + + if !self.extra_data.is_empty() { + // Go checks extraSize > 256*256-2 and calls glog.Fatalf; guard against u16 overflow. + assert!( + self.extra_data.len() <= 65534, + "super block extra data too large: {} > 65534", + self.extra_data.len() + ); + let extra_size = self.extra_data.len() as u16; + header[6..8].copy_from_slice(&extra_size.to_be_bytes()); + header.extend_from_slice(&self.extra_data); + } + + header + } + + /// Parse from bytes (must be at least SUPER_BLOCK_SIZE bytes). + pub fn from_bytes(bytes: &[u8]) -> Result { + if bytes.len() < SUPER_BLOCK_SIZE { + return Err(SuperBlockError::TooShort(bytes.len())); + } + + let version = Version(bytes[0]); + let replica_placement = ReplicaPlacement::from_byte(bytes[1])?; + let ttl = TTL::from_bytes(&bytes[2..4]); + let compaction_revision = u16::from_be_bytes([bytes[4], bytes[5]]); + let extra_size = u16::from_be_bytes([bytes[6], bytes[7]]); + + let extra_data = if extra_size > 0 && bytes.len() >= SUPER_BLOCK_SIZE + extra_size as usize + { + bytes[SUPER_BLOCK_SIZE..SUPER_BLOCK_SIZE + extra_size as usize].to_vec() + } else { + vec![] + }; + + Ok(SuperBlock { + version, + replica_placement, + ttl, + compaction_revision, + extra_size, + extra_data, + }) + } + + pub fn initialized(&self) -> bool { + true // ReplicaPlacement and TTL are always valid after construction + } +} + +impl Default for SuperBlock { + fn default() -> Self { + SuperBlock { + version: Version::current(), + replica_placement: ReplicaPlacement::default(), + ttl: TTL::EMPTY, + compaction_revision: 0, + extra_size: 0, + extra_data: vec![], + } + } +} + +// ============================================================================ +// ReplicaPlacement +// ============================================================================ + +/// Replication strategy encoded as a single byte. +/// +/// Byte value = DiffDataCenterCount * 100 + DiffRackCount * 10 + SameRackCount +/// +/// Examples: +/// "000" → no replication (1 copy total) +/// "010" → 1 copy in different rack (2 copies total) +/// "100" → 1 copy in different datacenter +/// "200" → 2 copies in different datacenters +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub struct ReplicaPlacement { + pub same_rack_count: u8, + pub diff_rack_count: u8, + pub diff_data_center_count: u8, +} + +impl ReplicaPlacement { + /// Parse from a string like "000", "010", "100". + /// Accepts 0-3 character strings, padding with leading zeros to match Go behavior. + /// E.g. "" -> "000", "1" -> "001", "01" -> "001", "010" -> "010" + pub fn from_string(s: &str) -> Result { + let s = s.trim(); + if s.is_empty() { + return Ok(ReplicaPlacement::default()); + } + // Pad with leading zeros to 3 chars, matching Go's NewReplicaPlacementFromString + let padded = match s.len() { + 1 => format!("00{}", s), + 2 => format!("0{}", s), + 3 => s.to_string(), + _ => return Err(SuperBlockError::InvalidReplicaPlacement(s.to_string())), + }; + let chars: Vec = padded.chars().collect(); + let dc = chars[0] + .to_digit(10) + .ok_or_else(|| SuperBlockError::InvalidReplicaPlacement(s.to_string()))? + as u8; + let rack = chars[1] + .to_digit(10) + .ok_or_else(|| SuperBlockError::InvalidReplicaPlacement(s.to_string()))? + as u8; + let same = chars[2] + .to_digit(10) + .ok_or_else(|| SuperBlockError::InvalidReplicaPlacement(s.to_string()))? + as u8; + // Go validates: value = dc*100 + rack*10 + same must fit in a byte + let value = dc as u16 * 100 + rack as u16 * 10 + same as u16; + if value > 255 { + return Err(SuperBlockError::InvalidReplicaPlacement(s.to_string())); + } + Ok(ReplicaPlacement { + diff_data_center_count: dc, + diff_rack_count: rack, + same_rack_count: same, + }) + } + + /// Parse from a single byte. + pub fn from_byte(b: u8) -> Result { + Ok(ReplicaPlacement { + diff_data_center_count: b / 100, + diff_rack_count: (b % 100) / 10, + same_rack_count: b % 10, + }) + } + + /// Encode as a single byte. + pub fn to_byte(&self) -> u8 { + self.diff_data_center_count * 100 + self.diff_rack_count * 10 + self.same_rack_count + } + + /// Total number of copies (including the original). + pub fn get_copy_count(&self) -> u8 { + self.diff_data_center_count + self.diff_rack_count + self.same_rack_count + 1 + } + + /// Whether this placement requires replication (more than 1 copy). + pub fn has_replication(&self) -> bool { + self.get_copy_count() > 1 + } +} + +impl std::fmt::Display for ReplicaPlacement { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}{}{}", + self.diff_data_center_count, self.diff_rack_count, self.same_rack_count + ) + } +} + +// ============================================================================ +// Errors +// ============================================================================ + +#[derive(Debug, thiserror::Error)] +pub enum SuperBlockError { + #[error("super block too short: {0} bytes")] + TooShort(usize), + + #[error("invalid replica placement: {0}")] + InvalidReplicaPlacement(String), +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::types::*; + + #[test] + fn test_super_block_round_trip() { + let sb = SuperBlock { + version: VERSION_3, + replica_placement: ReplicaPlacement::from_string("010").unwrap(), + ttl: TTL { count: 5, unit: 3 }, + compaction_revision: 42, + extra_size: 0, + extra_data: vec![], + }; + + let bytes = sb.to_bytes(); + assert_eq!(bytes.len(), SUPER_BLOCK_SIZE); + + let sb2 = SuperBlock::from_bytes(&bytes).unwrap(); + assert_eq!(sb2.version, sb.version); + assert_eq!(sb2.replica_placement, sb.replica_placement); + assert_eq!(sb2.ttl, sb.ttl); + assert_eq!(sb2.compaction_revision, sb.compaction_revision); + } + + #[test] + fn test_super_block_with_extra() { + let sb = SuperBlock { + version: VERSION_3, + replica_placement: ReplicaPlacement::default(), + ttl: TTL::EMPTY, + compaction_revision: 0, + extra_size: 3, + extra_data: vec![1, 2, 3], + }; + + let bytes = sb.to_bytes(); + assert_eq!(bytes.len(), SUPER_BLOCK_SIZE + 3); + + let sb2 = SuperBlock::from_bytes(&bytes).unwrap(); + assert_eq!(sb2.extra_data, vec![1, 2, 3]); + } + + #[test] + fn test_replica_placement_byte_round_trip() { + let rp = ReplicaPlacement::from_string("123").unwrap(); + assert_eq!(rp.diff_data_center_count, 1); + assert_eq!(rp.diff_rack_count, 2); + assert_eq!(rp.same_rack_count, 3); + assert_eq!(rp.to_byte(), 123); + assert_eq!(rp.get_copy_count(), 7); // 1+2+3+1 + + let rp2 = ReplicaPlacement::from_byte(123).unwrap(); + assert_eq!(rp, rp2); + } + + #[test] + fn test_replica_placement_no_replication() { + let rp = ReplicaPlacement::from_string("000").unwrap(); + assert!(!rp.has_replication()); + assert_eq!(rp.get_copy_count(), 1); + } + + #[test] + fn test_replica_placement_display() { + let rp = ReplicaPlacement::from_string("010").unwrap(); + assert_eq!(rp.to_string(), "010"); + assert!(rp.has_replication()); + } +} diff --git a/seaweed-volume/src/storage/types.rs b/seaweed-volume/src/storage/types.rs new file mode 100644 index 000000000..c75d35ec1 --- /dev/null +++ b/seaweed-volume/src/storage/types.rs @@ -0,0 +1,679 @@ +//! Core storage types: NeedleId, Offset, Size, Cookie, DiskType. +//! +//! These types define the binary-compatible on-disk format matching the Go implementation. +//! CRITICAL: Byte layout must match exactly for cross-compatibility. + +use std::fmt; + +// ============================================================================ +// Constants +// ============================================================================ + +pub const NEEDLE_ID_SIZE: usize = 8; +pub const NEEDLE_ID_EMPTY: u64 = 0; +pub const COOKIE_SIZE: usize = 4; +pub const SIZE_SIZE: usize = 4; +pub const NEEDLE_HEADER_SIZE: usize = COOKIE_SIZE + NEEDLE_ID_SIZE + SIZE_SIZE; // 16 +pub const DATA_SIZE_SIZE: usize = 4; +pub const TIMESTAMP_SIZE: usize = 8; +pub const NEEDLE_PADDING_SIZE: usize = 8; +pub const NEEDLE_CHECKSUM_SIZE: usize = 4; + +/// 5-byte offset mode (matching Go production builds with `-tags 5BytesOffset`). +/// Max volume size: 8TB. Index entry: 17 bytes (8 + 5 + 4). +#[cfg(feature = "5bytes")] +pub const OFFSET_SIZE: usize = 5; +#[cfg(feature = "5bytes")] +pub const MAX_POSSIBLE_VOLUME_SIZE: u64 = 4 * 1024 * 1024 * 1024 * 8 * 256; // 8TB + +/// 4-byte offset mode (matching Go default build without `5BytesOffset`). +/// Max volume size: 32GB. Index entry: 16 bytes (8 + 4 + 4). +#[cfg(not(feature = "5bytes"))] +pub const OFFSET_SIZE: usize = 4; +#[cfg(not(feature = "5bytes"))] +pub const MAX_POSSIBLE_VOLUME_SIZE: u64 = 4 * 1024 * 1024 * 1024 * 8; // 32GB + +pub const NEEDLE_MAP_ENTRY_SIZE: usize = NEEDLE_ID_SIZE + OFFSET_SIZE + SIZE_SIZE; + +// ============================================================================ +// NeedleId +// ============================================================================ + +/// 64-bit unique identifier for a needle within a volume. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] +pub struct NeedleId(pub u64); + +impl NeedleId { + pub fn to_bytes(&self, bytes: &mut [u8]) { + assert!(bytes.len() >= NEEDLE_ID_SIZE); + bytes[0..8].copy_from_slice(&self.0.to_be_bytes()); + } + + pub fn from_bytes(bytes: &[u8]) -> Self { + assert!(bytes.len() >= NEEDLE_ID_SIZE); + NeedleId(u64::from_be_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ])) + } + + pub fn is_empty(&self) -> bool { + self.0 == 0 + } + + /// Parse a hex string into a NeedleId. + pub fn parse(s: &str) -> Result { + u64::from_str_radix(s, 16).map(NeedleId) + } +} + +impl fmt::Display for NeedleId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:x}", self.0) + } +} + +impl From for NeedleId { + fn from(v: u64) -> Self { + NeedleId(v) + } +} + +impl From for u64 { + fn from(v: NeedleId) -> Self { + v.0 + } +} + +// ============================================================================ +// Cookie +// ============================================================================ + +/// Random 32-bit value to mitigate brute-force lookups. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +pub struct Cookie(pub u32); + +impl Cookie { + pub fn to_bytes(&self, bytes: &mut [u8]) { + assert!(bytes.len() >= COOKIE_SIZE); + bytes[0..4].copy_from_slice(&self.0.to_be_bytes()); + } + + pub fn from_bytes(bytes: &[u8]) -> Self { + assert!(bytes.len() >= COOKIE_SIZE); + Cookie(u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])) + } + + /// Parse a hex string into a Cookie. + pub fn parse(s: &str) -> Result { + u32::from_str_radix(s, 16).map(Cookie) + } +} + +impl fmt::Display for Cookie { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:x}", self.0) + } +} + +impl From for Cookie { + fn from(v: u32) -> Self { + Cookie(v) + } +} + +// ============================================================================ +// Size +// ============================================================================ + +/// Needle size as stored in the index. Negative = deleted. +/// +/// - Positive: valid needle with that many bytes of body content +/// - TombstoneFileSize (-1): tombstone marker +/// - Other negative: deleted, absolute value was the original size +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +pub struct Size(pub i32); + +/// Special marker for a tombstone (deletion marker) entry. +pub const TOMBSTONE_FILE_SIZE: Size = Size(-1); + +impl Size { + pub fn is_tombstone(&self) -> bool { + self.0 == TOMBSTONE_FILE_SIZE.0 + } + + pub fn is_deleted(&self) -> bool { + self.0 < 0 || self.0 == TOMBSTONE_FILE_SIZE.0 + } + + pub fn is_valid(&self) -> bool { + self.0 > 0 && !self.is_tombstone() + } + + /// Raw storage size. For tombstones returns 0; for negative returns abs value. + pub fn raw(&self) -> u32 { + if self.is_tombstone() { + return 0; + } + if self.0 < 0 { + return (self.0 * -1) as u32; + } + self.0 as u32 + } + + pub fn to_bytes(&self, bytes: &mut [u8]) { + assert!(bytes.len() >= SIZE_SIZE); + bytes[0..4].copy_from_slice(&(self.0 as u32).to_be_bytes()); + } + + pub fn from_bytes(bytes: &[u8]) -> Self { + assert!(bytes.len() >= SIZE_SIZE); + let v = u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); + Size(v as i32) + } +} + +impl From for Size { + fn from(v: i32) -> Self { + Size(v) + } +} + +impl From for i32 { + fn from(v: Size) -> Self { + v.0 + } +} + +// ============================================================================ +// Offset +// ============================================================================ + +/// Offset encoding for needle positions in .dat files. +/// +/// The offset is stored divided by NEEDLE_PADDING_SIZE (8). +/// +/// With `5bytes` feature (default, matching Go production builds): +/// 5 bytes can address up to 8TB. +/// On-disk layout: [b3][b2][b1][b0][b4] (big-endian 4 bytes + 1 high byte) +/// +/// Without `5bytes` feature (matching Go default build): +/// 4 bytes can address up to 32GB. +/// On-disk layout: [b3][b2][b1][b0] (big-endian 4 bytes) +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] +pub struct Offset { + pub b0: u8, + pub b1: u8, + pub b2: u8, + pub b3: u8, + #[cfg(feature = "5bytes")] + pub b4: u8, +} + +impl Offset { + /// Convert to the actual byte offset in the .dat file. + pub fn to_actual_offset(&self) -> i64 { + let stored = self.b0 as i64 + + (self.b1 as i64) * 256 + + (self.b2 as i64) * 65536 + + (self.b3 as i64) * 16777216; + #[cfg(feature = "5bytes")] + let stored = stored + (self.b4 as i64) * 4294967296; // 1 << 32 + stored * NEEDLE_PADDING_SIZE as i64 + } + + /// Create an Offset from an actual byte offset. + pub fn from_actual_offset(offset: i64) -> Self { + let smaller = offset / NEEDLE_PADDING_SIZE as i64; + Offset { + b0: smaller as u8, + b1: (smaller >> 8) as u8, + b2: (smaller >> 16) as u8, + b3: (smaller >> 24) as u8, + #[cfg(feature = "5bytes")] + b4: (smaller >> 32) as u8, + } + } + + /// Serialize to bytes in the .idx file format. + /// 5-byte layout: [b3][b2][b1][b0][b4] + /// 4-byte layout: [b3][b2][b1][b0] + pub fn to_bytes(&self, bytes: &mut [u8]) { + assert!(bytes.len() >= OFFSET_SIZE); + bytes[0] = self.b3; + bytes[1] = self.b2; + bytes[2] = self.b1; + bytes[3] = self.b0; + #[cfg(feature = "5bytes")] + { + bytes[4] = self.b4; + } + } + + /// Deserialize from bytes in the .idx file format. + pub fn from_bytes(bytes: &[u8]) -> Self { + assert!(bytes.len() >= OFFSET_SIZE); + Offset { + b3: bytes[0], + b2: bytes[1], + b1: bytes[2], + b0: bytes[3], + #[cfg(feature = "5bytes")] + b4: bytes[4], + } + } + + pub fn is_zero(&self) -> bool { + #[cfg(feature = "5bytes")] + { + self.b0 == 0 && self.b1 == 0 && self.b2 == 0 && self.b3 == 0 && self.b4 == 0 + } + #[cfg(not(feature = "5bytes"))] + { + self.b0 == 0 && self.b1 == 0 && self.b2 == 0 && self.b3 == 0 + } + } +} + +impl fmt::Display for Offset { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.to_actual_offset()) + } +} + +// ============================================================================ +// DiskType +// ============================================================================ + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum DiskType { + HardDrive, + Ssd, + Custom(String), +} + +impl DiskType { + pub fn from_string(s: &str) -> Self { + match s.to_lowercase().as_str() { + "" | "hdd" => DiskType::HardDrive, + "ssd" => DiskType::Ssd, + other => DiskType::Custom(other.to_string()), + } + } + + pub fn readable_string(&self) -> &str { + match self { + DiskType::HardDrive => "hdd", + DiskType::Ssd => "ssd", + DiskType::Custom(s) => s, + } + } +} + +impl fmt::Display for DiskType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DiskType::HardDrive => write!(f, ""), + DiskType::Ssd => write!(f, "ssd"), + DiskType::Custom(s) => write!(f, "{}", s), + } + } +} + +impl Default for DiskType { + fn default() -> Self { + DiskType::HardDrive + } +} + +// ============================================================================ +// VolumeId +// ============================================================================ + +/// Volume identifier, stored as u32. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] +pub struct VolumeId(pub u32); + +impl VolumeId { + pub fn parse(s: &str) -> Result { + s.parse::().map(VolumeId) + } + + pub fn next(&self) -> VolumeId { + VolumeId(self.0 + 1) + } +} + +impl fmt::Display for VolumeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +impl From for VolumeId { + fn from(v: u32) -> Self { + VolumeId(v) + } +} + +// ============================================================================ +// Version +// ============================================================================ + +/// Needle storage format version. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Version(pub u8); + +pub const VERSION_1: Version = Version(1); +pub const VERSION_2: Version = Version(2); +pub const VERSION_3: Version = Version(3); + +impl Version { + pub fn current() -> Self { + VERSION_3 + } + + pub fn is_supported(&self) -> bool { + self.0 >= 1 && self.0 <= 3 + } +} + +impl Default for Version { + fn default() -> Self { + VERSION_3 + } +} + +impl From for Version { + fn from(v: u8) -> Self { + Version(v) + } +} + +// ============================================================================ +// ReadOption +// ============================================================================ + +/// Options controlling needle read behavior, matching Go's `ReadOption` in store.go. +/// +/// Fields are split into request-side options (set by the caller) and response-side +/// flags (set during the read to communicate status back). +#[derive(Debug, Clone)] +pub struct ReadOption { + // -- request -- + /// If true, allow reading needles that have been soft-deleted. + pub read_deleted: bool, + /// If true, attempt to read only metadata for large needles (> PagedReadLimit). + pub attempt_meta_only: bool, + /// If true, the caller requires metadata only (no data payload). + pub must_meta_only: bool, + + // -- response -- + /// Set to true when the read actually returned metadata only. + pub is_meta_only: bool, + /// Compaction revision at the time of the read (for consistency during streaming). + pub volume_revision: u16, + /// Set to true when the offset exceeded MaxPossibleVolumeSize (4-byte offset wrap). + pub is_out_of_range: bool, + + // -- slow-read / streaming -- + /// When true, the read lock is acquired and released per chunk instead of held + /// for the entire read, reducing write latency at the cost of higher read P99. + pub has_slow_read: bool, + /// Buffer size for chunked streaming reads (used with `has_slow_read`). + pub read_buffer_size: i32, +} + +impl Default for ReadOption { + fn default() -> Self { + ReadOption { + read_deleted: false, + attempt_meta_only: false, + must_meta_only: false, + is_meta_only: false, + volume_revision: 0, + is_out_of_range: false, + has_slow_read: false, + read_buffer_size: 0, + } + } +} + +// ============================================================================ +// NeedleMapEntry helpers (for .idx file) +// ============================================================================ + +/// Parse a single .idx file entry (17 bytes) into (NeedleId, Offset, Size). +pub fn idx_entry_from_bytes(bytes: &[u8]) -> (NeedleId, Offset, Size) { + assert!(bytes.len() >= NEEDLE_MAP_ENTRY_SIZE); + let key = NeedleId::from_bytes(&bytes[..NEEDLE_ID_SIZE]); + let offset = Offset::from_bytes(&bytes[NEEDLE_ID_SIZE..NEEDLE_ID_SIZE + OFFSET_SIZE]); + let size = Size::from_bytes( + &bytes[NEEDLE_ID_SIZE + OFFSET_SIZE..NEEDLE_ID_SIZE + OFFSET_SIZE + SIZE_SIZE], + ); + (key, offset, size) +} + +/// Write a single .idx file entry (17 bytes). +pub fn idx_entry_to_bytes(bytes: &mut [u8], key: NeedleId, offset: Offset, size: Size) { + assert!(bytes.len() >= NEEDLE_MAP_ENTRY_SIZE); + key.to_bytes(&mut bytes[..NEEDLE_ID_SIZE]); + offset.to_bytes(&mut bytes[NEEDLE_ID_SIZE..NEEDLE_ID_SIZE + OFFSET_SIZE]); + size.to_bytes( + &mut bytes[NEEDLE_ID_SIZE + OFFSET_SIZE..NEEDLE_ID_SIZE + OFFSET_SIZE + SIZE_SIZE], + ); +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_needle_id_round_trip() { + let id = NeedleId(0x123456789abcdef0); + let mut buf = [0u8; 8]; + id.to_bytes(&mut buf); + let id2 = NeedleId::from_bytes(&buf); + assert_eq!(id, id2); + } + + #[test] + fn test_needle_id_display() { + let id = NeedleId(255); + assert_eq!(id.to_string(), "ff"); + } + + #[test] + fn test_needle_id_parse() { + let id = NeedleId::parse("ff").unwrap(); + assert_eq!(id, NeedleId(255)); + } + + #[test] + fn test_cookie_round_trip() { + let cookie = Cookie(0xdeadbeef); + let mut buf = [0u8; 4]; + cookie.to_bytes(&mut buf); + let cookie2 = Cookie::from_bytes(&buf); + assert_eq!(cookie, cookie2); + } + + #[test] + fn test_size_semantics() { + assert!(Size(100).is_valid()); + assert!(!Size(100).is_deleted()); + assert!(!Size(100).is_tombstone()); + assert_eq!(Size(100).raw(), 100); + + assert!(Size(-50).is_deleted()); + assert!(!Size(-50).is_tombstone()); + assert_eq!(Size(-50).raw(), 50); + + assert!(TOMBSTONE_FILE_SIZE.is_deleted()); + assert!(TOMBSTONE_FILE_SIZE.is_tombstone()); + assert_eq!(TOMBSTONE_FILE_SIZE.raw(), 0); + + assert!(!Size(0).is_valid()); + assert!(!Size(0).is_deleted()); + } + + #[test] + fn test_size_round_trip() { + let size = Size(12345); + let mut buf = [0u8; 4]; + size.to_bytes(&mut buf); + let size2 = Size::from_bytes(&buf); + assert_eq!(size, size2); + } + + #[test] + fn test_size_negative_round_trip() { + // Negative sizes round-trip through u32 bit pattern + let size = Size(-50); + let mut buf = [0u8; 4]; + size.to_bytes(&mut buf); + let size2 = Size::from_bytes(&buf); + assert_eq!(size, size2); + } + + #[test] + fn test_offset_round_trip() { + // Test with a known actual offset + let actual_offset: i64 = 8 * 1000000; // must be multiple of 8 + let offset = Offset::from_actual_offset(actual_offset); + assert_eq!(offset.to_actual_offset(), actual_offset); + + // Test byte serialization + let mut buf = [0u8; 5]; + offset.to_bytes(&mut buf); + let offset2 = Offset::from_bytes(&buf); + assert_eq!(offset.to_actual_offset(), offset2.to_actual_offset()); + } + + #[test] + fn test_offset_zero() { + let offset = Offset::default(); + assert!(offset.is_zero()); + assert_eq!(offset.to_actual_offset(), 0); + } + + #[test] + fn test_offset_max() { + // Max stored value depends on offset size + #[cfg(feature = "5bytes")] + let max_stored: i64 = (1i64 << 40) - 1; // 5-byte max + #[cfg(not(feature = "5bytes"))] + let max_stored: i64 = (1i64 << 32) - 1; // 4-byte max + let max_actual = max_stored * NEEDLE_PADDING_SIZE as i64; + let offset = Offset::from_actual_offset(max_actual); + assert_eq!(offset.to_actual_offset(), max_actual); + } + + #[test] + fn test_offset_size_constants() { + #[cfg(feature = "5bytes")] + { + assert_eq!(OFFSET_SIZE, 5); + assert_eq!(NEEDLE_MAP_ENTRY_SIZE, 17); // 8 + 5 + 4 + assert_eq!(MAX_POSSIBLE_VOLUME_SIZE, 4 * 1024 * 1024 * 1024 * 8 * 256); + // 8TB + } + #[cfg(not(feature = "5bytes"))] + { + assert_eq!(OFFSET_SIZE, 4); + assert_eq!(NEEDLE_MAP_ENTRY_SIZE, 16); // 8 + 4 + 4 + assert_eq!(MAX_POSSIBLE_VOLUME_SIZE, 4 * 1024 * 1024 * 1024 * 8); // 32GB + } + } + + #[test] + fn test_idx_entry_round_trip() { + let key = NeedleId(0xdeadbeef12345678); + let offset = Offset::from_actual_offset(8 * 999); + let size = Size(4096); + + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_entry_to_bytes(&mut buf, key, offset, size); + + let (key2, offset2, size2) = idx_entry_from_bytes(&buf); + assert_eq!(key, key2); + assert_eq!(offset.to_actual_offset(), offset2.to_actual_offset()); + assert_eq!(size, size2); + } + + #[test] + fn test_volume_id() { + let vid = VolumeId::parse("42").unwrap(); + assert_eq!(vid, VolumeId(42)); + assert_eq!(vid.to_string(), "42"); + assert_eq!(vid.next(), VolumeId(43)); + } + + #[test] + fn test_version() { + assert!(VERSION_1.is_supported()); + assert!(VERSION_2.is_supported()); + assert!(VERSION_3.is_supported()); + assert!(!Version(0).is_supported()); + assert!(!Version(4).is_supported()); + assert_eq!(Version::current(), VERSION_3); + } + + #[test] + fn test_disk_type() { + assert_eq!(DiskType::from_string(""), DiskType::HardDrive); + assert_eq!(DiskType::from_string("hdd"), DiskType::HardDrive); + assert_eq!(DiskType::from_string("SSD"), DiskType::Ssd); + assert_eq!( + DiskType::from_string("nvme"), + DiskType::Custom("nvme".to_string()) + ); + assert_eq!(DiskType::HardDrive.readable_string(), "hdd"); + assert_eq!(DiskType::Ssd.readable_string(), "ssd"); + } + + #[test] + fn test_read_option_default() { + let ro = ReadOption::default(); + assert!(!ro.read_deleted); + assert!(!ro.attempt_meta_only); + assert!(!ro.must_meta_only); + assert!(!ro.is_meta_only); + assert_eq!(ro.volume_revision, 0); + assert!(!ro.is_out_of_range); + assert!(!ro.has_slow_read); + assert_eq!(ro.read_buffer_size, 0); + } + + #[test] + fn test_read_option_custom() { + let ro = ReadOption { + read_deleted: true, + attempt_meta_only: true, + has_slow_read: true, + read_buffer_size: 1024 * 1024, + ..ReadOption::default() + }; + assert!(ro.read_deleted); + assert!(ro.attempt_meta_only); + assert!(!ro.must_meta_only); + assert!(!ro.is_meta_only); + assert!(ro.has_slow_read); + assert_eq!(ro.read_buffer_size, 1024 * 1024); + } + + #[test] + fn test_read_option_clone() { + let ro = ReadOption { + is_out_of_range: true, + volume_revision: 42, + ..ReadOption::default() + }; + let ro2 = ro.clone(); + assert!(ro2.is_out_of_range); + assert_eq!(ro2.volume_revision, 42); + } +} diff --git a/seaweed-volume/src/storage/volume.rs b/seaweed-volume/src/storage/volume.rs new file mode 100644 index 000000000..28dc761d1 --- /dev/null +++ b/seaweed-volume/src/storage/volume.rs @@ -0,0 +1,4246 @@ +//! Volume: the core storage unit — a .dat file + .idx index. +//! +//! Each volume contains many needles (files). It manages: +//! - Reading/writing/deleting needles from the .dat file +//! - Maintaining the in-memory NeedleMap (NeedleId → Offset+Size) +//! - SuperBlock at offset 0 of the .dat file +//! - Metrics (file count, content size, deleted count) +//! +//! Matches Go's storage/volume.go, volume_loading.go, volume_read.go, +//! volume_write.go, volume_super_block.go. + +use std::fs::{self, File, OpenOptions}; +use std::io::{self, Read, Seek, SeekFrom, Write}; +use std::path::Path; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::sync::{Condvar, Mutex}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use tracing::warn; + +use crate::storage::needle::needle::{self, get_actual_size, Needle, NeedleError}; +use crate::storage::needle_map::{CompactNeedleMap, NeedleMap, NeedleMapKind, RedbNeedleMap}; +use crate::storage::super_block::{ReplicaPlacement, SuperBlock, SUPER_BLOCK_SIZE}; +use crate::storage::types::*; + +// ============================================================================ +// Errors +// ============================================================================ + +#[derive(Debug, thiserror::Error)] +pub enum VolumeError { + #[error("not found")] + NotFound, + + #[error("already deleted")] + Deleted, + + #[error("needle size mismatch")] + SizeMismatch, + + #[error("unsupported version: {0}")] + UnsupportedVersion(u8), + + #[error("cookie mismatch: {0:#x}")] + CookieMismatch(u32), + + #[error("volume not empty")] + NotEmpty, + + #[error("volume already exists")] + AlreadyExists, + + #[error("volume is read-only")] + ReadOnly, + + #[error("volume size limit exceeded: current {current}, limit {limit}")] + SizeLimitExceeded { current: u64, limit: u64 }, + + #[error("volume not initialized")] + NotInitialized, + + #[error("needle error: {0}")] + Needle(#[from] NeedleError), + + #[error("super block error: {0}")] + SuperBlock(#[from] crate::storage::super_block::SuperBlockError), + + #[error("IO error: {0}")] + Io(#[from] io::Error), + + #[error("streaming from remote-backed volume requires buffered fallback")] + StreamingUnsupported, +} + +// ============================================================================ +// VolumeInfo (.vif persistence) +// ============================================================================ + +/// Legacy simple VolumeInfo for backward compat with old .vif files. +#[derive(serde::Serialize, serde::Deserialize)] +struct VolumeInfo { + read_only: bool, +} + +pub use crate::pb::volume_server_pb::RemoteFile as PbRemoteFile; +/// Protobuf VolumeInfo type alias. +pub use crate::pb::volume_server_pb::VolumeInfo as PbVolumeInfo; + +/// Helper module for deserializing protojson uint64 fields that may be strings or numbers. +mod string_or_u64 { + use serde::{self, Deserialize, Deserializer, Serializer}; + + pub fn serialize(value: &u64, serializer: S) -> Result + where + S: Serializer, + { + // Emit as string to match Go's protojson format for uint64 + serializer.serialize_str(&value.to_string()) + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + #[serde(untagged)] + enum StringOrNum { + Str(String), + Num(u64), + } + match StringOrNum::deserialize(deserializer)? { + StringOrNum::Str(s) => s.parse::().map_err(serde::de::Error::custom), + StringOrNum::Num(n) => Ok(n), + } + } +} + +mod string_or_i64 { + use serde::{self, Deserialize, Deserializer, Serializer}; + + pub fn serialize(value: &i64, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&value.to_string()) + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + #[serde(untagged)] + enum StringOrNum { + Str(String), + Num(i64), + } + match StringOrNum::deserialize(deserializer)? { + StringOrNum::Str(s) => s.parse::().map_err(serde::de::Error::custom), + StringOrNum::Num(n) => Ok(n), + } + } +} + +/// Serde-compatible representation of RemoteFile for .vif JSON serialization. +/// Field names use snake_case to match Go's protobuf JSON output (jsonpb). +#[derive(serde::Serialize, serde::Deserialize, Default, Clone)] +pub struct VifRemoteFile { + #[serde(default, rename = "backendType")] + pub backend_type: String, + #[serde(default, rename = "backendId")] + pub backend_id: String, + #[serde(default)] + pub key: String, + #[serde(default, with = "string_or_u64")] + pub offset: u64, + #[serde(default, rename = "fileSize", with = "string_or_u64")] + pub file_size: u64, + #[serde(default, rename = "modifiedTime", with = "string_or_u64")] + pub modified_time: u64, + #[serde(default)] + pub extension: String, +} + +#[derive(serde::Serialize, serde::Deserialize, Default, Clone)] +pub struct VifEcShardConfig { + #[serde(default, rename = "dataShards")] + pub data_shards: u32, + #[serde(default, rename = "parityShards")] + pub parity_shards: u32, +} + +/// Serde-compatible representation of OldVersionVolumeInfo for legacy .vif JSON deserialization. +/// Matches Go's protobuf OldVersionVolumeInfo where `DestroyTime` maps to `expire_at_sec`. +#[derive(serde::Deserialize, Default)] +struct OldVersionVifVolumeInfo { + #[serde(default)] + pub files: Vec, + #[serde(default)] + pub version: u32, + #[serde(default)] + pub replication: String, + #[serde(default, alias = "bytesOffset", alias = "BytesOffset")] + pub bytes_offset: u32, + #[serde(default, alias = "datFileSize", alias = "dat_file_size", with = "string_or_i64")] + pub dat_file_size: i64, + #[serde(default, alias = "destroyTime", alias = "DestroyTime", with = "string_or_u64")] + pub destroy_time: u64, + #[serde(default, alias = "readOnly", alias = "read_only")] + pub read_only: bool, +} + +impl OldVersionVifVolumeInfo { + /// Convert to the standard VifVolumeInfo, mapping destroy_time -> expire_at_sec. + fn to_vif(self) -> VifVolumeInfo { + VifVolumeInfo { + files: self.files, + version: self.version, + replication: self.replication, + bytes_offset: self.bytes_offset, + dat_file_size: self.dat_file_size, + expire_at_sec: self.destroy_time, + read_only: self.read_only, + ec_shard_config: None, + } + } +} + +/// Serde-compatible representation of VolumeInfo for .vif JSON serialization. +/// Matches Go's protobuf JSON format (jsonpb with EmitUnpopulated=true). +#[derive(serde::Serialize, serde::Deserialize, Default, Clone)] +pub struct VifVolumeInfo { + #[serde(default)] + pub files: Vec, + #[serde(default)] + pub version: u32, + #[serde(default)] + pub replication: String, + #[serde(default, rename = "bytesOffset")] + pub bytes_offset: u32, + #[serde(default, rename = "datFileSize", with = "string_or_i64")] + pub dat_file_size: i64, + #[serde(default, rename = "expireAtSec", with = "string_or_u64")] + pub expire_at_sec: u64, + #[serde(default, rename = "readOnly")] + pub read_only: bool, + #[serde( + default, + rename = "ecShardConfig", + skip_serializing_if = "Option::is_none" + )] + pub ec_shard_config: Option, +} + +impl VifVolumeInfo { + /// Convert from protobuf VolumeInfo to the serde-compatible struct. + pub fn from_pb(pb: &PbVolumeInfo) -> Self { + Self { + files: pb + .files + .iter() + .map(|f| VifRemoteFile { + backend_type: f.backend_type.clone(), + backend_id: f.backend_id.clone(), + key: f.key.clone(), + offset: f.offset, + file_size: f.file_size, + modified_time: f.modified_time, + extension: f.extension.clone(), + }) + .collect(), + version: pb.version, + replication: pb.replication.clone(), + bytes_offset: pb.bytes_offset, + dat_file_size: pb.dat_file_size, + expire_at_sec: pb.expire_at_sec, + read_only: pb.read_only, + ec_shard_config: pb.ec_shard_config.as_ref().map(|c| VifEcShardConfig { + data_shards: c.data_shards, + parity_shards: c.parity_shards, + }), + } + } + + /// Convert to protobuf VolumeInfo. + pub fn to_pb(&self) -> PbVolumeInfo { + PbVolumeInfo { + files: self + .files + .iter() + .map(|f| PbRemoteFile { + backend_type: f.backend_type.clone(), + backend_id: f.backend_id.clone(), + key: f.key.clone(), + offset: f.offset, + file_size: f.file_size, + modified_time: f.modified_time, + extension: f.extension.clone(), + }) + .collect(), + version: self.version, + replication: self.replication.clone(), + bytes_offset: self.bytes_offset, + dat_file_size: self.dat_file_size, + expire_at_sec: self.expire_at_sec, + read_only: self.read_only, + ec_shard_config: self.ec_shard_config.as_ref().map(|c| { + crate::pb::volume_server_pb::EcShardConfig { + data_shards: c.data_shards, + parity_shards: c.parity_shards, + } + }), + } + } +} + +// ============================================================================ +// Streaming read support +// ============================================================================ + +#[derive(Default)] +struct DataFileAccessState { + readers: usize, + writer_active: bool, +} + +#[derive(Default)] +pub struct DataFileAccessControl { + state: Mutex, + condvar: Condvar, +} + +pub struct DataFileReadLease { + control: Arc, +} + +pub struct DataFileWriteLease { + control: Arc, +} + +impl DataFileAccessControl { + pub fn read_lock(self: &Arc) -> DataFileReadLease { + let mut state = self.state.lock().unwrap(); + while state.writer_active { + state = self.condvar.wait(state).unwrap(); + } + state.readers += 1; + drop(state); + DataFileReadLease { + control: self.clone(), + } + } + + pub fn write_lock(self: &Arc) -> DataFileWriteLease { + let mut state = self.state.lock().unwrap(); + while state.writer_active || state.readers > 0 { + state = self.condvar.wait(state).unwrap(); + } + state.writer_active = true; + drop(state); + DataFileWriteLease { + control: self.clone(), + } + } +} + +impl Drop for DataFileReadLease { + fn drop(&mut self) { + let mut state = self.control.state.lock().unwrap(); + state.readers -= 1; + if state.readers == 0 { + self.control.condvar.notify_all(); + } + } +} + +impl Drop for DataFileWriteLease { + fn drop(&mut self) { + let mut state = self.control.state.lock().unwrap(); + state.writer_active = false; + self.control.condvar.notify_all(); + } +} + +/// Information needed to stream needle data directly from the dat file +/// without loading the entire payload into memory. +pub(crate) enum NeedleStreamSource { + Local(File), + Remote(RemoteDatFile), +} + +impl NeedleStreamSource { + pub(crate) fn clone_for_read(&self) -> io::Result { + match self { + NeedleStreamSource::Local(file) => Ok(NeedleStreamSource::Local(file.try_clone()?)), + NeedleStreamSource::Remote(remote) => Ok(NeedleStreamSource::Remote(remote.clone())), + } + } + + pub(crate) fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> io::Result<()> { + match self { + NeedleStreamSource::Local(file) => { + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + file.read_exact_at(buf, offset)?; + } + #[cfg(windows)] + { + read_exact_at(file, buf, offset)?; + } + #[cfg(not(any(unix, windows)))] + { + compile_error!("Platform not supported: only unix and windows are supported"); + } + Ok(()) + } + NeedleStreamSource::Remote(remote) => remote.read_exact_at(buf, offset), + } + } +} + +pub struct NeedleStreamInfo { + /// Stream source for the dat file, local or remote. + pub(crate) source: NeedleStreamSource, + /// Absolute byte offset within the dat file where needle data starts. + pub data_file_offset: u64, + /// Size of the data payload in bytes. + pub data_size: u32, + /// Per-volume file access lock used to match Go's slow-read behavior. + pub data_file_access_control: Arc, + /// Volume ID — used to re-lookup needle offset if compaction occurs during streaming. + pub volume_id: VolumeId, + /// Needle ID — used to re-lookup needle offset if compaction occurs during streaming. + pub needle_id: NeedleId, + /// Compaction revision at the time of the initial read. If this changes during + /// streaming, the needle's disk offset must be re-read from the needle map because + /// compaction may have moved the needle to a different location. + pub compaction_revision: u16, +} + +#[derive(Clone)] +pub(crate) struct RemoteDatFile { + backend: Arc, + key: String, + file_size: u64, + modified_time: u64, +} + +impl RemoteDatFile { + pub(crate) fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> io::Result<()> { + let data = self + .backend + .read_range_blocking(&self.key, offset, buf.len()) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + if data.len() != buf.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!( + "remote read short read at offset {}: got {}, expected {}", + offset, + data.len(), + buf.len() + ), + )); + } + buf.copy_from_slice(&data); + Ok(()) + } +} + +// ============================================================================ +// Volume +// ============================================================================ + +pub struct Volume { + pub id: VolumeId, + dir: String, + dir_idx: String, + pub collection: String, + + dat_file: Option, + remote_dat_file: Option, + nm: Option, + needle_map_kind: NeedleMapKind, + data_file_access_control: Arc, + + pub super_block: SuperBlock, + + no_write_or_delete: bool, + no_write_can_delete: bool, + + /// Shared flag from the parent DiskLocation indicating low disk space. + /// Matches Go's `v.location.isDiskSpaceLow` checked in `IsReadOnly()`. + pub location_disk_space_low: Arc, + + last_modified_ts_seconds: u64, + last_append_at_ns: u64, + + last_compact_index_offset: u64, + last_compact_revision: u16, + + is_compacting: bool, + + /// Compaction speed limit in bytes per second (0 = unlimited). + pub compaction_byte_per_second: i64, + + /// Tracks the last I/O error (EIO) for volume health monitoring. + /// Uses Mutex for interior mutability so reads (&self) can clear/set it. + last_io_error: Mutex>, + + /// Protobuf VolumeInfo for tiered storage (.vif file). + pub volume_info: PbVolumeInfo, + + /// Whether this volume has a remote file reference. + pub has_remote_file: bool, +} + +/// Windows helper: loop seek_read until buffer is fully filled. +#[cfg(windows)] +fn read_exact_at(file: &File, buf: &mut [u8], mut offset: u64) -> io::Result<()> { + use std::os::windows::fs::FileExt; + let mut filled = 0; + while filled < buf.len() { + let n = file.seek_read(&mut buf[filled..], offset)?; + if n == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF in seek_read", + )); + } + filled += n; + offset += n as u64; + } + Ok(()) +} + +impl Volume { + /// Create and load a volume from disk. + pub fn new( + dirname: &str, + dir_idx: &str, + collection: &str, + id: VolumeId, + needle_map_kind: NeedleMapKind, + replica_placement: Option, + ttl: Option, + preallocate: u64, + version: Version, + ) -> Result { + let mut v = Volume { + id, + dir: dirname.to_string(), + dir_idx: dir_idx.to_string(), + collection: collection.to_string(), + dat_file: None, + remote_dat_file: None, + nm: None, + needle_map_kind, + data_file_access_control: Arc::new(DataFileAccessControl::default()), + super_block: SuperBlock { + replica_placement: replica_placement.unwrap_or_default(), + ttl: ttl.unwrap_or(crate::storage::needle::ttl::TTL::EMPTY), + ..SuperBlock::default() + }, + no_write_or_delete: false, + no_write_can_delete: false, + location_disk_space_low: Arc::new(AtomicBool::new(false)), + last_modified_ts_seconds: 0, + last_append_at_ns: 0, + last_compact_index_offset: 0, + last_compact_revision: 0, + is_compacting: false, + compaction_byte_per_second: 0, + last_io_error: Mutex::new(None), + volume_info: PbVolumeInfo::default(), + has_remote_file: false, + }; + + v.load(true, true, preallocate, version)?; + Ok(v) + } + + /// Returns true if the volume is currently being compacted. + pub fn is_compacting(&self) -> bool { + self.is_compacting + } + + // ---- File naming (matching Go) ---- + + /// Base filename: dir/collection_id or dir/id + pub fn data_file_name(&self) -> String { + volume_file_name(&self.dir, &self.collection, self.id) + } + + pub fn index_file_name(&self) -> String { + volume_file_name(&self.dir_idx, &self.collection, self.id) + } + + pub fn file_name(&self, ext: &str) -> String { + match ext { + ".idx" | ".cpx" | ".ldb" | ".cpldb" | ".rdb" => { + format!("{}{}", self.index_file_name(), ext) + } + _ => { + format!("{}{}", self.data_file_name(), ext) + } + } + } + + pub fn version(&self) -> Version { + if self.volume_info.version != 0 { + Version(self.volume_info.version as u8) + } else { + self.super_block.version + } + } + + // ---- Loading ---- + + fn load( + &mut self, + also_load_index: bool, + create_dat_if_missing: bool, + preallocate: u64, + version: Version, + ) -> Result<(), VolumeError> { + let dat_path = self.file_name(".dat"); + let mut already_has_super_block = false; + + let has_volume_info_file = self.load_vif()?; + + if self.volume_info.read_only && !self.has_remote_file { + self.no_write_or_delete = true; + } + + if self.has_remote_file { + self.load_remote_dat_file()?; + if let Some(remote_file) = self.volume_info.files.first() { + if remote_file.modified_time > 0 { + self.last_modified_ts_seconds = remote_file.modified_time; + } else if let Ok(metadata) = fs::metadata(self.vif_path()) { + self.last_modified_ts_seconds = metadata + .modified() + .unwrap_or(SystemTime::UNIX_EPOCH) + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + } + } + already_has_super_block = true; + } else if Path::new(&dat_path).exists() { + let metadata = fs::metadata(&dat_path)?; + + // Try to open read-write; fall back to read-only + match OpenOptions::new().read(true).write(true).open(&dat_path) { + Ok(file) => { + self.dat_file = Some(file); + } + Err(e) if e.kind() == io::ErrorKind::PermissionDenied => { + self.dat_file = Some(File::open(&dat_path)?); + self.no_write_or_delete = true; + } + Err(e) => return Err(e.into()), + } + + self.last_modified_ts_seconds = metadata + .modified() + .unwrap_or(SystemTime::UNIX_EPOCH) + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + if metadata.len() >= SUPER_BLOCK_SIZE as u64 { + already_has_super_block = true; + } + } else if create_dat_if_missing { + // Create directory if needed + if let Some(parent) = Path::new(&dat_path).parent() { + fs::create_dir_all(parent)?; + } + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&dat_path)?; + if preallocate > 0 { + preallocate_file(&file, preallocate); + } + self.dat_file = Some(file); + } else { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("volume data file {} does not exist", dat_path), + ))); + } + + if already_has_super_block { + match self.read_super_block() { + Ok(()) => { + if !self.super_block.version.is_supported() { + return Err(VolumeError::UnsupportedVersion(self.super_block.version.0)); + } + // Match Go: v.volumeInfo.Version = uint32(v.SuperBlock.Version) + self.volume_info.version = self.super_block.version.0 as u32; + } + Err(e) if self.has_remote_file => { + warn!( + volume_id = self.id.0, + error = %e, + "failed to read remote super block during load" + ); + } + Err(e) => return Err(e), + } + } else { + self.maybe_write_super_block(version)?; + } + + if also_load_index { + self.load_index()?; + + // Match Go: CheckVolumeDataIntegrity after loading index (volume_loading.go L154-159) + // Only for non-remote volumes (remote storage may not have local .dat) + if !self.has_remote_file { + if let Err(e) = self.check_volume_data_integrity() { + self.no_write_or_delete = true; + warn!( + volume_id = self.id.0, + error = %e, + "volumeDataIntegrityChecking failed" + ); + } + } + } + + // Match Go: if no .vif file existed, create one with version and bytes_offset + if !has_volume_info_file { + self.volume_info.version = self.super_block.version.0 as u32; + self.volume_info.bytes_offset = OFFSET_SIZE as u32; + if let Err(e) = self.save_volume_info() { + warn!( + volume_id = self.id.0, + error = %e, + "failed to save volume info" + ); + } + } + + Ok(()) + } + + fn load_index(&mut self) -> Result<(), VolumeError> { + let use_redb = matches!( + self.needle_map_kind, + NeedleMapKind::LevelDb | NeedleMapKind::LevelDbMedium | NeedleMapKind::LevelDbLarge + ); + + let idx_path = self.file_name(".idx"); + + // Ensure idx directory exists + if let Some(parent) = Path::new(&idx_path).parent() { + fs::create_dir_all(parent)?; + } + + if use_redb { + self.load_index_redb(&idx_path)?; + } else { + self.load_index_inmemory(&idx_path)?; + } + + Ok(()) + } + + /// Load index using in-memory CompactNeedleMap. + fn load_index_inmemory(&mut self, idx_path: &str) -> Result<(), VolumeError> { + if self.no_write_or_delete { + // Open read-only + if Path::new(&idx_path).exists() { + let mut idx_file = File::open(&idx_path)?; + let nm = CompactNeedleMap::load_from_idx(&mut idx_file)?; + self.nm = Some(NeedleMap::InMemory(nm)); + } else { + // Missing .idx with existing .dat could orphan needles + let dat_path = self.file_name(".dat"); + if Path::new(&dat_path).exists() { + let dat_size = fs::metadata(&dat_path).map(|m| m.len()).unwrap_or(0); + if dat_size > SUPER_BLOCK_SIZE as u64 { + warn!( + volume_id = self.id.0, + ".idx file missing but .dat exists with data; needles may be orphaned" + ); + } + } + self.nm = Some(NeedleMap::InMemory(CompactNeedleMap::new())); + } + } else { + // Open read-write (create if missing) + let idx_file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&idx_path)?; + + let idx_size = idx_file.metadata()?.len(); + let mut idx_reader = io::BufReader::new(&idx_file); + let mut nm = CompactNeedleMap::load_from_idx(&mut idx_reader)?; + + // Re-open for append-only writes + let write_file = OpenOptions::new() + .write(true) + .append(true) + .open(&idx_path)?; + nm.set_idx_file(Box::new(write_file), idx_size); + self.nm = Some(NeedleMap::InMemory(nm)); + } + + Ok(()) + } + + /// Load index using disk-backed RedbNeedleMap. + fn load_index_redb(&mut self, idx_path: &str) -> Result<(), VolumeError> { + // The redb database file is stored alongside the volume files + let rdb_path = self.file_name(".rdb"); + + if self.no_write_or_delete { + // Open read-only + if Path::new(&idx_path).exists() { + let mut idx_file = File::open(&idx_path)?; + let nm = RedbNeedleMap::load_from_idx(&rdb_path, &mut idx_file)?; + self.nm = Some(NeedleMap::Redb(nm)); + } else { + // Missing .idx with existing .dat could orphan needles + let dat_path = self.file_name(".dat"); + if Path::new(&dat_path).exists() { + let dat_size = fs::metadata(&dat_path).map(|m| m.len()).unwrap_or(0); + if dat_size > SUPER_BLOCK_SIZE as u64 { + warn!( + volume_id = self.id.0, + ".idx file missing but .dat exists with data; needles may be orphaned" + ); + } + } + self.nm = Some(NeedleMap::Redb(RedbNeedleMap::new(&rdb_path)?)); + } + } else { + // Open read-write (create if missing) + let idx_file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&idx_path)?; + + let idx_size = idx_file.metadata()?.len(); + let mut idx_reader = io::BufReader::new(&idx_file); + let mut nm = RedbNeedleMap::load_from_idx(&rdb_path, &mut idx_reader)?; + + // Re-open for append-only writes + let write_file = OpenOptions::new() + .write(true) + .append(true) + .open(&idx_path)?; + nm.set_idx_file(Box::new(write_file), idx_size); + self.nm = Some(NeedleMap::Redb(nm)); + } + + Ok(()) + } + + fn load_remote_dat_file(&mut self) -> Result<(), VolumeError> { + let (storage_name, storage_key) = self.remote_storage_name_key(); + let backend = crate::remote_storage::s3_tier::global_s3_tier_registry() + .read() + .unwrap() + .get(&storage_name) + .ok_or_else(|| { + VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("remote tier backend {} not found", storage_name), + )) + })?; + + let remote_file = self.volume_info.files.first().ok_or_else(|| { + VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + "remote volume has no remote file entries", + )) + })?; + + let file_size = if remote_file.file_size > 0 { + remote_file.file_size + } else if self.volume_info.dat_file_size > 0 { + self.volume_info.dat_file_size as u64 + } else { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!("remote volume {} is missing file size metadata", self.id.0), + ))); + }; + + self.dat_file = None; + self.remote_dat_file = Some(RemoteDatFile { + backend, + key: storage_key, + file_size, + modified_time: remote_file.modified_time, + }); + Ok(()) + } + + fn read_exact_at_backend(&self, buf: &mut [u8], offset: u64) -> Result<(), VolumeError> { + if let Some(dat_file) = self.dat_file.as_ref() { + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + dat_file.read_exact_at(buf, offset)?; + } + #[cfg(windows)] + { + read_exact_at(dat_file, buf, offset)?; + } + #[cfg(not(any(unix, windows)))] + { + compile_error!("Platform not supported: only unix and windows are supported"); + } + Ok(()) + } else if let Some(remote_dat_file) = self.remote_dat_file.as_ref() { + remote_dat_file.read_exact_at(buf, offset)?; + Ok(()) + } else { + Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + "dat file not open", + ))) + } + } + + /// Returns true when the volume has a data backend (local .dat file or + /// remote tiered storage). Mirrors Go's `v.DataBackend != nil` check. + pub fn has_data_backend(&self) -> bool { + self.dat_file.is_some() || self.remote_dat_file.is_some() + } + + fn current_dat_file_size(&self) -> io::Result { + if let Some(ref f) = self.dat_file { + Ok(f.metadata()?.len()) + } else if let Some(ref remote_dat_file) = self.remote_dat_file { + Ok(remote_dat_file.file_size) + } else { + Ok(0) + } + } + + /// Read a raw byte range from the current .dat backend. + /// + /// This matches Go paths that stream directly from `DataBackend`, including + /// remote-only tiered volumes whose `.dat` is no longer present locally. + pub fn read_dat_slice(&self, offset: u64, size: usize) -> Result, VolumeError> { + let _guard = self.data_file_access_control.read_lock(); + let dat_size = self.current_dat_file_size()?; + if size == 0 || offset >= dat_size { + return Ok(Vec::new()); + } + + let read_len = std::cmp::min(size as u64, dat_size - offset) as usize; + let mut buf = vec![0u8; read_len]; + self.read_exact_at_backend(&mut buf, offset)?; + Ok(buf) + } + + // ---- SuperBlock I/O ---- + + fn read_super_block(&mut self) -> Result<(), VolumeError> { + let mut header = [0u8; SUPER_BLOCK_SIZE]; + self.read_exact_at_backend(&mut header, 0)?; + + let extra_size = u16::from_be_bytes([header[6], header[7]]); + let total_size = SUPER_BLOCK_SIZE + extra_size as usize; + + let mut full_buf = vec![0u8; total_size]; + full_buf[..SUPER_BLOCK_SIZE].copy_from_slice(&header); + if extra_size > 0 { + self.read_exact_at_backend(&mut full_buf[SUPER_BLOCK_SIZE..], SUPER_BLOCK_SIZE as u64)?; + } + + self.super_block = SuperBlock::from_bytes(&full_buf)?; + + // Match Go: if volumeInfo.Replication is set, override super block's ReplicaPlacement + if !self.volume_info.replication.is_empty() { + let rp = ReplicaPlacement::from_string(&self.volume_info.replication)?; + self.super_block.replica_placement = rp; + } + + Ok(()) + } + + fn maybe_write_super_block(&mut self, version: Version) -> Result<(), VolumeError> { + let dat_file = self.dat_file.as_mut().ok_or_else(|| { + VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open")) + })?; + + let dat_size = dat_file.metadata()?.len(); + if dat_size == 0 { + if !version.is_supported() { + return Err(VolumeError::UnsupportedVersion(version.0)); + } + self.super_block.version = version; + let bytes = self.super_block.to_bytes(); + dat_file.seek(SeekFrom::Start(0))?; + dat_file.write_all(&bytes)?; + dat_file.sync_all()?; + } + Ok(()) + } + + // ---- Read ---- + + /// Read a needle by its ID from the volume. + pub fn read_needle(&self, n: &mut Needle) -> Result { + let mut read_option = ReadOption::default(); + self.read_needle_with_option(n, &mut read_option) + } + + pub fn read_needle_opt(&self, n: &mut Needle, read_deleted: bool) -> Result { + let mut read_option = ReadOption { + read_deleted, + ..ReadOption::default() + }; + self.read_needle_with_option(n, &mut read_option) + } + + pub fn read_needle_with_option( + &self, + n: &mut Needle, + read_option: &mut ReadOption, + ) -> Result { + let _guard = self.data_file_access_control.read_lock(); + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + let nv = nm.get(n.id).ok_or(VolumeError::NotFound)?; + + if nv.offset.is_zero() { + return Err(VolumeError::NotFound); + } + + let mut read_size = nv.size; + if read_size.is_deleted() { + if read_option.read_deleted && !read_size.is_tombstone() { + // Negate to get original size + read_size = Size(-read_size.0); + } else { + return Err(VolumeError::Deleted); + } + } + if read_size.0 == 0 { + return Ok(0); + } + + match self.read_needle_data_at_unlocked(n, nv.offset.to_actual_offset(), read_size, read_option) { + Ok(()) => self.check_read_write_error(None), + Err(VolumeError::Io(ref e)) => { + self.check_read_write_error(Some(e)); + return Err(VolumeError::Io(io::Error::new(e.kind(), e.to_string()))); + } + Err(e) => return Err(e), + } + + // TTL expiry check + if n.has_ttl() { + if let Some(ref ttl) = n.ttl { + let ttl_minutes = ttl.minutes(); + if ttl_minutes > 0 && n.has_last_modified_date() { + let expire_at_ns = n.append_at_ns + (ttl_minutes as u64) * 60 * 1_000_000_000; + let now_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + if now_ns >= expire_at_ns { + return Err(VolumeError::NotFound); + } + } + } + } + + Ok(n.data_size as i32) + } + + /// Read needle data from .dat file at given offset. + pub fn read_needle_data_at( + &self, + n: &mut Needle, + offset: i64, + size: Size, + ) -> Result<(), VolumeError> { + let _guard = self.data_file_access_control.read_lock(); + let mut read_option = ReadOption::default(); + self.read_needle_data_at_unlocked(n, offset, size, &mut read_option) + } + + fn read_needle_data_at_unlocked( + &self, + n: &mut Needle, + offset: i64, + size: Size, + _read_option: &mut ReadOption, + ) -> Result<(), VolumeError> { + match self.read_needle_blob_and_parse(n, offset, size) { + Ok(()) => Ok(()), + #[cfg(not(feature = "5bytes"))] + Err(VolumeError::Needle(NeedleError::SizeMismatch { offset: o, .. })) + if o < MAX_POSSIBLE_VOLUME_SIZE as i64 => + { + // Double-read: in 4-byte offset mode, the actual data may be + // beyond 32GB due to offset wrapping. Retry at offset + 32GB. + self.read_needle_blob_and_parse(n, offset + MAX_POSSIBLE_VOLUME_SIZE as i64, size) + } + Err(e) => Err(e), + } + } + + fn read_needle_blob_and_parse( + &self, + n: &mut Needle, + offset: i64, + size: Size, + ) -> Result<(), VolumeError> { + let version = self.version(); + let actual_size = get_actual_size(size, version); + + let mut buf = vec![0u8; actual_size as usize]; + self.read_exact_at_backend(&mut buf, offset as u64)?; + + n.read_bytes(&mut buf, offset, size, version)?; + Ok(()) + } + + /// Read raw needle blob at a specific offset. + pub fn read_needle_blob(&self, offset: i64, size: Size) -> Result, VolumeError> { + let _guard = self.data_file_access_control.read_lock(); + self.read_needle_blob_unlocked(offset, size) + } + + fn read_needle_blob_unlocked(&self, offset: i64, size: Size) -> Result, VolumeError> { + let version = self.version(); + let actual_size = get_actual_size(size, version); + let mut buf = vec![0u8; actual_size as usize]; + self.read_exact_at_backend(&mut buf, offset as u64)?; + + Ok(buf) + } + + /// Read needle metadata at a specific offset without loading the data payload. + /// + /// Matches Go's `readNeedleMetaAt`, including the tombstone path where a + /// deleted idx entry passes a negative size and the tombstone record itself + /// is read as size 0 metadata. + pub fn read_needle_meta_at( + &self, + n: &mut Needle, + offset: i64, + size: Size, + ) -> Result<(), VolumeError> { + let _guard = self.data_file_access_control.read_lock(); + self.read_needle_meta_at_unlocked(n, offset, size) + } + + fn read_needle_meta_at_unlocked( + &self, + n: &mut Needle, + offset: i64, + size: Size, + ) -> Result<(), VolumeError> { + let normalized_size = if size.is_deleted() { Size(0) } else { size }; + match self.read_needle_meta_blob_and_parse(n, offset, normalized_size) { + Ok(()) => Ok(()), + #[cfg(not(feature = "5bytes"))] + Err(VolumeError::Needle(NeedleError::SizeMismatch { offset: o, .. })) + if o < MAX_POSSIBLE_VOLUME_SIZE as i64 => + { + self.read_needle_meta_blob_and_parse( + n, + offset + MAX_POSSIBLE_VOLUME_SIZE as i64, + normalized_size, + ) + } + Err(e) => Err(e), + } + } + + fn read_needle_meta_blob_and_parse( + &self, + n: &mut Needle, + offset: i64, + size: Size, + ) -> Result<(), VolumeError> { + let version = self.version(); + + // Step 1: Read only the first 20 bytes (header + DataSize). + // Matches Go's ReadNeedleMeta which reads NeedleHeaderSize+DataSizeSize first. + const HEADER_PREFIX: usize = NEEDLE_HEADER_SIZE + DATA_SIZE_SIZE; // 20 + let mut header_buf = [0u8; HEADER_PREFIX]; + self.read_exact_at_backend(&mut header_buf, offset as u64)?; + + // Parse header to get the needle's Size field for validation + let (_, _, found_size) = Needle::parse_header(&header_buf); + if found_size != size { + return Err(VolumeError::Needle(NeedleError::SizeMismatch { + offset, + id: n.id, + found: found_size, + expected: size, + })); + } + + // Step 2: Calculate how much meta tail to read (skip the data payload) + let actual_size = get_actual_size(size, version); + + if size.0 == 0 || version == VERSION_1 { + // Tombstone or V1: no body data section, tail starts right after header + let meta_size = actual_size - NEEDLE_HEADER_SIZE as i64; + if meta_size < 0 || meta_size > 128 * 1024 { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "invalid needle meta size {}: DataSize=0, size={}, offset={}", + meta_size, size.0, offset + ), + ))); + } + let mut meta_buf = vec![0u8; meta_size as usize]; + self.read_exact_at_backend( + &mut meta_buf, + (offset + NEEDLE_HEADER_SIZE as i64) as u64, + )?; + n.read_paged_meta(&header_buf, &meta_buf, offset, size, version)?; + } else { + // V2/V3: extract DataSize from bytes 16..20 + let data_size = u32::from_be_bytes([ + header_buf[NEEDLE_HEADER_SIZE], + header_buf[NEEDLE_HEADER_SIZE + 1], + header_buf[NEEDLE_HEADER_SIZE + 2], + header_buf[NEEDLE_HEADER_SIZE + 3], + ]); + + // Skip past: header(16) + DataSize(4) + data(data_size) + let start_offset = + offset + NEEDLE_HEADER_SIZE as i64 + DATA_SIZE_SIZE as i64 + data_size as i64; + let stop_offset = offset + actual_size; + let meta_size = stop_offset - start_offset; + + // Sanity check: reject metadata sizes > 128KB (matching Go's ReadNeedleMeta guard) + if meta_size < 0 || meta_size > 128 * 1024 { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "invalid needle meta size {}: DataSize={}, size={}, offset={}", + meta_size, data_size, size.0, offset + ), + ))); + } + + // Step 3: Read only the meta tail (skip the data payload entirely) + let mut meta_buf = vec![0u8; meta_size as usize]; + self.read_exact_at_backend(&mut meta_buf, start_offset as u64)?; + n.read_paged_meta(&header_buf, &meta_buf, offset, size, version)?; + } + + Ok(()) + } + + /// Read needle metadata (header + flags/name/mime/etc) without loading the data payload, + /// and return a `NeedleStreamInfo` that can be used to stream data directly from the dat file. + /// + /// This is used for large needles to avoid loading the entire payload into memory. + pub fn read_needle_stream_info( + &self, + n: &mut Needle, + read_deleted: bool, + ) -> Result { + let _guard = self.data_file_access_control.read_lock(); + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + let nv = nm.get(n.id).ok_or(VolumeError::NotFound)?; + + if nv.offset.is_zero() { + return Err(VolumeError::NotFound); + } + + let mut read_size = nv.size; + if read_size.is_deleted() { + if read_deleted && !read_size.is_tombstone() { + read_size = Size(-read_size.0); + } else { + return Err(VolumeError::Deleted); + } + } + if read_size.0 == 0 { + return Err(VolumeError::NotFound); + } + + #[cfg_attr(feature = "5bytes", allow(unused_mut))] + let mut offset = nv.offset.to_actual_offset(); + let version = self.version(); + let actual_size = get_actual_size(read_size, version); + + // Read the full needle bytes (including data) for metadata parsing. + // We use read_bytes_meta_only which skips copying the data payload. + #[cfg_attr(feature = "5bytes", allow(unused_mut))] + let mut read_and_parse = |off: i64| -> Result<(), VolumeError> { + let mut buf = vec![0u8; actual_size as usize]; + self.read_exact_at_backend(&mut buf, off as u64)?; + n.read_bytes_meta_only(&mut buf, off, read_size, version)?; + Ok(()) + }; + + match read_and_parse(offset) { + Ok(()) => {} + #[cfg(not(feature = "5bytes"))] + Err(VolumeError::Needle(NeedleError::SizeMismatch { offset: o, .. })) + if o < MAX_POSSIBLE_VOLUME_SIZE as i64 => + { + offset += MAX_POSSIBLE_VOLUME_SIZE as i64; + read_and_parse(offset)?; + } + Err(e) => return Err(e), + } + + // TTL expiry check + if n.has_ttl() { + if let Some(ref ttl) = n.ttl { + let ttl_minutes = ttl.minutes(); + if ttl_minutes > 0 && n.has_last_modified_date() { + let expire_at_ns = n.append_at_ns + (ttl_minutes as u64) * 60 * 1_000_000_000; + let now_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + if now_ns >= expire_at_ns { + return Err(VolumeError::NotFound); + } + } + } + } + + // For V1, data starts right after the header + // For V2/V3, data starts at header + 4 (DataSize field) + let data_file_offset = if version == VERSION_1 { + offset as u64 + NEEDLE_HEADER_SIZE as u64 + } else { + offset as u64 + NEEDLE_HEADER_SIZE as u64 + 4 // skip DataSize (4 bytes) + }; + + let source = match (self.dat_file.as_ref(), self.remote_dat_file.as_ref()) { + (Some(dat_file), _) => NeedleStreamSource::Local( + dat_file.try_clone().map_err(VolumeError::Io)?, + ), + (None, Some(remote_dat_file)) => NeedleStreamSource::Remote(remote_dat_file.clone()), + (None, None) => return Err(VolumeError::StreamingUnsupported), + }; + + Ok(NeedleStreamInfo { + source, + data_file_offset, + data_size: n.data_size, + data_file_access_control: self.data_file_access_control.clone(), + volume_id: self.id, + needle_id: n.id, + compaction_revision: self.super_block.compaction_revision, + }) + } + + /// Re-lookup a needle's data-file offset after compaction may have moved it. + /// + /// Returns `(new_data_file_offset, current_compaction_revision)` or an error + /// if the needle is no longer present / has been deleted. + /// + /// This matches Go's `readNeedleDataInto` behaviour: when the volume's + /// `CompactionRevision` changes between streaming chunks, the needle offset + /// is re-read from the needle map because compaction may have relocated it. + pub fn re_lookup_needle_data_offset( + &self, + needle_id: NeedleId, + ) -> Result<(u64, u16), VolumeError> { + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + let nv = nm.get(needle_id).ok_or(VolumeError::NotFound)?; + if nv.offset.is_zero() { + return Err(VolumeError::NotFound); + } + if nv.size.is_deleted() { + return Err(VolumeError::Deleted); + } + + let offset = nv.offset.to_actual_offset(); + let version = self.version(); + + let data_file_offset = if version == VERSION_1 { + offset as u64 + NEEDLE_HEADER_SIZE as u64 + } else { + offset as u64 + NEEDLE_HEADER_SIZE as u64 + 4 // skip DataSize (4 bytes) + }; + + Ok((data_file_offset, self.super_block.compaction_revision)) + } + + // ---- Write ---- + + /// Write a needle to the volume (synchronous path). + pub fn write_needle( + &mut self, + n: &mut Needle, + check_cookie: bool, + ) -> Result<(u64, Size, bool), VolumeError> { + let _guard = self.data_file_access_control.write_lock(); + if self.is_read_only() { + return Err(VolumeError::ReadOnly); + } + + self.do_write_request(n, check_cookie) + } + + fn do_write_request( + &mut self, + n: &mut Needle, + check_cookie: bool, + ) -> Result<(u64, Size, bool), VolumeError> { + // TTL inheritance from volume (matching Go's writeNeedle2) + { + use crate::storage::needle::ttl::TTL; + let needle_ttl = n.ttl.unwrap_or(TTL::EMPTY); + if needle_ttl == TTL::EMPTY && self.super_block.ttl != TTL::EMPTY { + n.set_has_ttl(); + n.ttl = Some(self.super_block.ttl); + } + } + + // Ensure checksum is computed before dedup check + if n.checksum == crate::storage::needle::crc::CRC(0) && !n.data.is_empty() { + n.checksum = crate::storage::needle::crc::CRC::new(&n.data); + } + + // Dedup check (matches Go: n.DataSize = oldNeedle.DataSize on dedup) + if let Some(old_data_size) = self.is_file_unchanged(n) { + n.data_size = old_data_size; + return Ok((0, Size(n.data_size as i32), true)); + } + + // Cookie validation for existing needle (matches Go: check whenever nm.Get returns ok) + if let Some(nm) = &self.nm { + if let Some(nv) = nm.get(n.id) { + let mut existing = Needle::default(); + // Read only the header to check cookie + self.read_needle_header_unlocked(&mut existing, nv.offset.to_actual_offset())?; + + if n.cookie.0 == 0 && !check_cookie { + n.cookie = existing.cookie; + } + if existing.cookie != n.cookie { + return Err(VolumeError::CookieMismatch(n.cookie.0)); + } + } + } + + // Update append timestamp + n.append_at_ns = get_append_at_ns(self.last_append_at_ns); + + // Append to .dat file + let (offset, _body_size, _actual_size) = self.append_needle(n)?; + self.last_append_at_ns = n.append_at_ns; + + // Update needle map (uses n.size = full body size, matching Go's nm.Put) + let should_update = if let Some(nm) = &self.nm { + match nm.get(n.id) { + Some(nv) => (nv.offset.to_actual_offset() as u64) < offset, + None => true, + } + } else { + true + }; + + if should_update { + if let Some(nm) = &mut self.nm { + nm.put(n.id, Offset::from_actual_offset(offset as i64), n.size)?; + } + } + + if self.last_modified_ts_seconds < n.last_modified { + self.last_modified_ts_seconds = n.last_modified; + } + + // Return Size(n.DataSize) as the logical size, matching Go's doWriteRequest + Ok((offset, Size(n.data_size as i32), false)) + } + + fn read_needle_header_unlocked(&self, n: &mut Needle, offset: i64) -> Result<(), VolumeError> { + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + self.read_exact_at_backend(&mut header, offset as u64)?; + + n.read_header(&header); + Ok(()) + } + + /// Check if the needle is unchanged from the existing one on disk. + /// Returns `Some(old_data_size)` if unchanged, `None` otherwise. + /// Matches Go's isFileUnchanged which also sets n.DataSize = oldNeedle.DataSize. + fn is_file_unchanged(&self, n: &Needle) -> Option { + // Don't dedup for volumes with TTL + if self.super_block.ttl != crate::storage::needle::ttl::TTL::EMPTY { + return None; + } + + if let Some(nm) = &self.nm { + if let Some(nv) = nm.get(n.id) { + if !nv.offset.is_zero() && nv.size.is_valid() { + let mut old = Needle::default(); + let mut ro = ReadOption::default(); + if self + .read_needle_data_at_unlocked( + &mut old, + nv.offset.to_actual_offset(), + nv.size, + &mut ro, + ) + .is_ok() + { + if old.cookie == n.cookie + && old.checksum == n.checksum + && old.data == n.data + { + return Some(old.data_size); + } + } + } + } + } + None + } + + /// Append a needle to the .dat file. Returns (offset, size, actual_size). + fn append_needle(&mut self, n: &mut Needle) -> Result<(u64, Size, i64), VolumeError> { + let version = self.version(); + let bytes = n.write_bytes(version); + let actual_size = bytes.len() as i64; + + let dat_file = self.dat_file.as_mut().ok_or_else(|| { + VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open")) + })?; + + let offset = dat_file.seek(SeekFrom::End(0))?; + + // Check volume size limit before writing (matching Go's Append) + if offset >= MAX_POSSIBLE_VOLUME_SIZE && !n.data.is_empty() { + return Err(VolumeError::SizeLimitExceeded { + current: offset, + limit: MAX_POSSIBLE_VOLUME_SIZE, + }); + } + + if let Err(e) = dat_file.write_all(&bytes) { + // Truncate back to pre-write position on error (matching Go) + let _ = dat_file.set_len(offset); + self.check_read_write_error(Some(&e)); + return Err(VolumeError::Io(e)); + } + self.check_read_write_error(None); + + Ok((offset, n.size, actual_size)) + } + + // ---- Delete ---- + + /// Delete a needle from the volume. + pub fn delete_needle(&mut self, n: &mut Needle) -> Result { + let _guard = self.data_file_access_control.write_lock(); + if self.no_write_or_delete { + return Err(VolumeError::ReadOnly); + } + self.do_delete_request(n) + } + + fn do_delete_request(&mut self, n: &mut Needle) -> Result { + let (found, size, _stored_offset) = if let Some(nm) = &self.nm { + if let Some(nv) = nm.get(n.id) { + if !nv.size.is_deleted() { + (true, nv.size, nv.offset) + } else { + (false, Size(0), Offset::default()) + } + } else { + (false, Size(0), Offset::default()) + } + } else { + return Ok(Size(0)); + }; + + if !found { + return Ok(Size(0)); + } + + // Write tombstone: append needle with empty data + n.data = vec![]; + n.append_at_ns = get_append_at_ns(self.last_append_at_ns); + + let offset = if !self.has_remote_file { + // Normal volume: append tombstone to .dat file + let (offset, _, _) = self.append_needle(n)?; + offset + } else { + // Remote-tiered volume: skip .dat append, use offset 0 + 0 + }; + self.last_append_at_ns = n.append_at_ns; + + // Update index + if let Some(nm) = &mut self.nm { + nm.delete(n.id, Offset::from_actual_offset(offset as i64))?; + } + + Ok(size) + } + + // ---- Metrics ---- + + pub fn content_size(&self) -> u64 { + self.nm.as_ref().map_or(0, |nm| nm.content_size()) + } + + pub fn deleted_size(&self) -> u64 { + self.nm.as_ref().map_or(0, |nm| nm.deleted_size()) + } + + pub fn file_count(&self) -> i64 { + self.nm.as_ref().map_or(0, |nm| nm.file_count()) + } + + pub fn deleted_count(&self) -> i64 { + self.nm.as_ref().map_or(0, |nm| nm.deleted_count()) + } + + pub fn max_file_key(&self) -> NeedleId { + self.nm.as_ref().map_or(NeedleId(0), |nm| nm.max_file_key()) + } + + pub fn is_read_only(&self) -> bool { + self.no_write_or_delete + || self.no_write_can_delete + || self.location_disk_space_low.load(Ordering::Relaxed) + } + + pub fn is_no_write_or_delete(&self) -> bool { + self.no_write_or_delete + } + + pub fn is_no_write_can_delete(&self) -> bool { + self.no_write_can_delete + } + + pub fn last_compact_revision(&self) -> u16 { + self.last_compact_revision + } + + pub fn last_modified_ts(&self) -> u64 { + self.last_modified_ts_seconds + } + + pub fn is_expired(&self, volume_size: u64, volume_size_limit: u64) -> bool { + if volume_size_limit == 0 { + return false; + } + if volume_size <= SUPER_BLOCK_SIZE as u64 { + return false; + } + let ttl_minutes = self.super_block.ttl.minutes(); + if ttl_minutes == 0 { + return false; + } + let lived_minutes = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + .saturating_sub(self.last_modified_ts_seconds) + / 60; + (ttl_minutes as u64) < lived_minutes + } + + pub fn is_expired_long_enough(&self, max_delay_minutes: u32) -> bool { + let ttl_minutes = self.super_block.ttl.minutes(); + if ttl_minutes == 0 { + return false; + } + let removal_delay = std::cmp::min(ttl_minutes / 10, max_delay_minutes); + ((ttl_minutes + removal_delay) as u64) * 60 + self.last_modified_ts_seconds + < SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + } + + /// Read all live needles from the volume (for ReadAllNeedles streaming RPC). + pub fn read_all_needles(&self) -> Result, VolumeError> { + let _guard = self.data_file_access_control.read_lock(); + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + let version = self.version(); + let dat_size = self.current_dat_file_size()? as i64; + let mut needles = Vec::new(); + let mut offset = self.super_block.block_size() as i64; + + while offset < dat_size { + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + match self.read_exact_at_backend(&mut header, offset as u64) { + Ok(()) => {} + Err(VolumeError::Io(e)) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e), + } + + let (_cookie, key, size) = Needle::parse_header(&header); + if size.0 == 0 && key.is_empty() { + break; + } + + let body_length = needle::needle_body_length(size, version); + let total_size = NEEDLE_HEADER_SIZE as i64 + body_length as i64; + + if size.is_deleted() || size.0 <= 0 { + offset += total_size; + continue; + } + + let Some(nv) = nm.get(key) else { + offset += total_size; + continue; + }; + if nv.offset.to_actual_offset() != offset { + offset += total_size; + continue; + } + + let mut n = Needle { + id: key, + ..Needle::default() + }; + let mut read_option = ReadOption::default(); + self.read_needle_data_at_unlocked(&mut n, offset, size, &mut read_option)?; + needles.push(n); + + offset += total_size; + } + Ok(needles) + } + + /// Check volume data integrity by verifying the last index entries against the .dat file. + /// Matches Go's CheckVolumeDataIntegrity (volume_checking.go L117-141). + /// Reads the last few index entries, verifies each needle header is readable and + /// consistent. On failure, marks the volume read-only. + fn check_volume_data_integrity(&mut self) -> Result<(), VolumeError> { + let idx_path = self.file_name(".idx"); + if !Path::new(&idx_path).exists() { + return Ok(()); + } + + let idx_size = fs::metadata(&idx_path).map(|m| m.len()).unwrap_or(0) as i64; + if idx_size == 0 { + return Ok(()); + } + if idx_size % NEEDLE_MAP_ENTRY_SIZE as i64 != 0 { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "index file's size is {} bytes, maybe corrupted", + idx_size + ), + ))); + } + + let version = self.version(); + + // Check last 10 index entries (matching Go's CheckVolumeDataIntegrity). + // Go starts healthyIndexSize = indexSize and reduces on EOF. + // On success: break (err != ErrorSizeMismatch when err == nil). + // On EOF: set healthyIndexSize = position of corrupt entry, continue. + // On ErrorSizeMismatch: continue (try next entry). + // After loop: if healthyIndexSize < indexSize → error. + let mut idx_file = File::open(&idx_path)?; + let max_entries = std::cmp::min(10, idx_size / NEEDLE_MAP_ENTRY_SIZE as i64); + let mut healthy_index_size: i64 = idx_size; + + for i in 1..=max_entries { + let entry_offset = idx_size - i * NEEDLE_MAP_ENTRY_SIZE as i64; + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_file.seek(SeekFrom::Start(entry_offset as u64))?; + idx_file.read_exact(&mut buf)?; + + let (key, offset, size) = idx_entry_from_bytes(&buf); + if offset.is_zero() { + continue; + } + + let actual_offset = offset.to_actual_offset() as u64; + + // Read needle header at the offset + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + match self.read_exact_at_backend(&mut header, actual_offset) { + Ok(()) => {} + Err(VolumeError::Io(e)) if e.kind() == io::ErrorKind::UnexpectedEof => { + // Match Go: on EOF, mark this entry as corrupt and continue + // checking earlier entries (healthyIndexSize tracks the boundary). + healthy_index_size = entry_offset; + continue; + } + Err(e) => return Err(e), + } + + let (_cookie, needle_id, needle_size) = Needle::parse_header(&header); + + // Verify the needle ID matches the index entry + if !key.is_empty() && needle_id != key { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "index key {:?} does not match needle Id {:?} at offset {}", + key, needle_id, actual_offset + ), + ))); + } + + // For non-deleted entries, verify the size matches + if !size.is_deleted() && size.0 > 0 && needle_size.0 != size.0 { + // Try with MaxPossibleVolumeSize offset adjustment (Go parity) + let alt_offset = actual_offset + MAX_POSSIBLE_VOLUME_SIZE as u64; + let mut alt_header = [0u8; NEEDLE_HEADER_SIZE]; + if self + .read_exact_at_backend(&mut alt_header, alt_offset) + .is_ok() + { + let (_, _, alt_size) = Needle::parse_header(&alt_header); + if alt_size.0 == size.0 { + continue; + } + } + // Match Go: ErrorSizeMismatch breaks out of the loop + break; + } + + // If V3, try to read the append timestamp from the last verified entry. + // Go reads AppendAtNs from both live and deleted (tombstone) entries + // via verifyNeedleIntegrity and verifyDeletedNeedleIntegrity. + if version == VERSION_3 { + // For tombstones (deleted), body size on disk is 0. + // For live entries, body size is size.0. + let body_size = if size.is_deleted() { 0u64 } else { size.0 as u64 }; + let ts_offset = + actual_offset + NEEDLE_HEADER_SIZE as u64 + body_size + 4; // skip checksum + let mut ts_buf = [0u8; 8]; + if self.read_exact_at_backend(&mut ts_buf, ts_offset).is_ok() { + let ts = u64::from_be_bytes(ts_buf); + if ts > 0 { + self.last_append_at_ns = ts; + } + } + } + } + + // Match Go: if healthyIndexSize < indexSize, trailing entries are corrupt + if healthy_index_size < idx_size { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "healthy index size {} is less than expected {}", + healthy_index_size, idx_size + ), + ))); + } + + Ok(()) + } + + /// Scrub the volume index by verifying each needle map entry against the dat file. + /// For each entry, reads only the 16-byte needle header at the given offset to verify: + /// correct needle ID, correct cookie (non-zero), and valid size. + /// Does NOT read/verify the full needle data or CRC. + /// Returns (files_checked, broken_needles) tuple. + pub fn scrub_index(&self) -> Result<(u64, Vec), VolumeError> { + if self.dat_file.is_none() && self.remote_dat_file.is_none() { + return Err(VolumeError::NotFound); + } + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + let dat_size = self.dat_file_size().map_err(VolumeError::Io)?; + + let mut files_checked: u64 = 0; + let mut broken = Vec::new(); + + for (needle_id, nv) in nm.iter_entries() { + if nv.offset.is_zero() || nv.size.is_deleted() { + continue; + } + + let offset = nv.offset.to_actual_offset(); + if offset < 0 || offset as u64 >= dat_size { + broken.push(format!( + "needle {} offset {} out of range (dat_size={})", + needle_id.0, offset, dat_size + )); + continue; + } + + // Read only the 16-byte needle header to verify ID, cookie, and size + let mut header_buf = [0u8; NEEDLE_HEADER_SIZE]; + match self.read_exact_at_backend(&mut header_buf, offset as u64) { + Ok(()) => { + let (cookie, id, size) = Needle::parse_header(&header_buf); + if id != needle_id { + broken.push(format!( + "needle {} header id mismatch: expected {}, got {}", + needle_id.0, needle_id.0, id.0 + )); + } else if cookie.0 == 0 { + broken.push(format!( + "needle {} has zero cookie at offset {}", + needle_id.0, offset + )); + } else if size.0 <= 0 && !nv.size.is_deleted() { + broken.push(format!( + "needle {} has invalid size {} at offset {}", + needle_id.0, size.0, offset + )); + } + } + Err(e) => { + broken.push(format!("needle {} read header error: {}", needle_id.0, e)); + } + } + + files_checked += 1; + } + + Ok((files_checked, broken)) + } + + /// Scrub the volume by reading and verifying all needles. + /// Returns (files_checked, broken_needles) tuple. + /// Each needle is read from disk and its CRC checksum is verified. + pub fn scrub(&self) -> Result<(u64, Vec), VolumeError> { + if self.dat_file.is_none() && self.remote_dat_file.is_none() { + return Err(VolumeError::NotFound); + } + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + + let dat_size = self.dat_file_size().map_err(|e| VolumeError::Io(e))?; + let version = self.version(); + + let mut files_checked: u64 = 0; + let mut broken = Vec::new(); + let mut total_read: i64 = 0; + + for (needle_id, nv) in nm.iter_entries() { + if nv.offset.is_zero() { + continue; + } + + // Accumulate actual needle size for ALL entries including deleted ones + // (matches Go: deleted needles still occupy space in the .dat file). + total_read += get_actual_size(nv.size, version); + + if nv.size.is_deleted() { + continue; + } + + let offset = nv.offset.to_actual_offset(); + if offset < 0 || offset as u64 >= dat_size { + broken.push(format!( + "needle {} offset {} out of range (dat_size={})", + needle_id.0, offset, dat_size + )); + continue; + } + + // Read and verify the needle (read_needle_data_at checks CRC via read_bytes/read_tail) + let mut n = Needle { + id: needle_id, + ..Needle::default() + }; + match self.read_needle_data_at(&mut n, offset, nv.size) { + Ok(_) => {} + Err(e) => { + broken.push(format!("needle {} error: {}", needle_id.0, e)); + } + } + + files_checked += 1; + } + + // Validate total data size against .dat file size (matches Go's scrubVolumeData) + let expected_size = total_read + SUPER_BLOCK_SIZE as i64; + if (dat_size as i64) < expected_size { + broken.push(format!( + "dat file size {} is smaller than expected {} (total_read {} + super_block {})", + dat_size, expected_size, total_read, SUPER_BLOCK_SIZE + )); + } else if dat_size as i64 != expected_size { + broken.push(format!( + "warning: dat file size {} does not match expected {} (total_read {} + super_block {})", + dat_size, expected_size, total_read, SUPER_BLOCK_SIZE + )); + } + + Ok((files_checked, broken)) + } + + /// Scan raw needle entries from the .dat file starting at `from_offset`. + /// Returns (needle_header_bytes, needle_body_bytes, append_at_ns) for each needle. + /// Used by VolumeTailSender to stream raw bytes. + pub fn scan_raw_needles_from( + &self, + from_offset: u64, + ) -> Result, Vec, u64)>, VolumeError> { + let version = self.version(); + let dat_size = self.current_dat_file_size()?; + let mut entries = Vec::new(); + let mut offset = from_offset; + + while offset < dat_size { + // Read needle header (16 bytes) + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + match self.read_exact_at_backend(&mut header, offset) { + Ok(()) => {} + Err(VolumeError::Io(e)) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e.into()), + } + + let (_cookie, _id, size) = Needle::parse_header(&header); + if size.0 == 0 && _id.is_empty() { + break; + } + + let body_length = needle::needle_body_length(size, version); + let total_size = NEEDLE_HEADER_SIZE as u64 + body_length as u64; + + // Match Go's ScanVolumeFileFrom: visit ALL needles including deleted ones. + // This is critical for incremental copy where tombstones must be propagated. + + // Read body bytes + let mut body = vec![0u8; body_length as usize]; + match self.read_exact_at_backend(&mut body, offset + NEEDLE_HEADER_SIZE as u64) { + Ok(()) => {} + Err(VolumeError::Io(e)) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e.into()), + } + + // Parse the needle to get append_at_ns + let mut full = vec![0u8; total_size as usize]; + full[..NEEDLE_HEADER_SIZE].copy_from_slice(&header); + full[NEEDLE_HEADER_SIZE..].copy_from_slice(&body); + let mut n = Needle::default(); + let _ = n.read_bytes(&full, offset as i64, size, version); + + entries.push((header.to_vec(), body, n.append_at_ns)); + offset += total_size; + } + + Ok(entries) + } + + /// Insert or update a needle index entry (for low-level blob writes). + pub fn put_needle_index( + &mut self, + key: NeedleId, + offset: Offset, + size: Size, + ) -> Result<(), VolumeError> { + if let Some(ref mut nm) = self.nm { + nm.put(key, offset, size).map_err(VolumeError::Io)?; + } + Ok(()) + } + + /// Mark this volume as read-only (no writes or deletes). + /// If `persist` is true, the readonly state is saved to the .vif file. + pub fn set_read_only(&mut self) -> Result<(), VolumeError> { + self.no_write_or_delete = true; + self.save_vif() + } + + /// Mark this volume as read-only, optionally persisting to .vif. + pub fn set_read_only_persist(&mut self, persist: bool) -> Result<(), VolumeError> { + self.no_write_or_delete = true; + if persist { + self.save_vif()?; + } + Ok(()) + } + + /// Mark this volume as writable (allow writes and deletes). + pub fn set_writable(&mut self) -> Result<(), VolumeError> { + self.no_write_or_delete = false; + self.save_vif() + } + + /// Recompute the Go-style write/delete mode from the current remote tier state. + pub fn refresh_remote_write_mode(&mut self) { + self.has_remote_file = !self.volume_info.files.is_empty(); + if self.has_remote_file { + self.no_write_can_delete = true; + self.no_write_or_delete = false; + } else { + self.no_write_can_delete = false; + } + } + + /// Close the local .dat file handle (matches Go's v.DataBackend.Close() in LoadRemoteFile). + /// Called after tier-upload when the local file is being replaced by remote storage. + pub fn close_local_dat_backend(&mut self) { + self.dat_file = None; + } + + /// Close the remote dat file backend (matches Go's v.DataBackend.Close(); v.DataBackend = nil). + /// Called after tier-download when the remote backend is being replaced by local storage. + pub fn close_remote_dat_backend(&mut self) { + self.remote_dat_file = None; + } + + /// Path to .vif file. + fn vif_path(&self) -> String { + format!("{}.vif", self.data_file_name()) + } + + /// Load volume info from .vif file. + /// Supports both the protobuf-JSON format (Go-compatible) and legacy JSON. + /// Returns true if a .vif file was found and successfully loaded. + fn load_vif(&mut self) -> Result { + let path = self.vif_path(); + if let Ok(content) = fs::read_to_string(&path) { + if content.trim().is_empty() { + return Ok(false); + } + // Try protobuf-JSON (Go-compatible VolumeInfo via VifVolumeInfo) + if let Ok(vif_info) = serde_json::from_str::(&content) { + let pb_info = vif_info.to_pb(); + if pb_info.read_only { + self.no_write_or_delete = true; + } + self.volume_info = pb_info; + self.refresh_remote_write_mode(); + if self.volume_info.version == 0 { + self.volume_info.version = Version::current().0 as u32; + } + if !self.has_remote_file && self.volume_info.bytes_offset == 0 { + self.volume_info.bytes_offset = OFFSET_SIZE as u32; + } + if self.volume_info.bytes_offset != 0 + && self.volume_info.bytes_offset != OFFSET_SIZE as u32 + { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "bytes_offset mismatch in {}: found {}, expected {}", + path, self.volume_info.bytes_offset, OFFSET_SIZE + ), + ))); + } + return Ok(true); + } + // Fall back to OldVersionVolumeInfo (Go's tryOldVersionVolumeInfo): + // maps DestroyTime -> expire_at_sec + if let Ok(old_info) = serde_json::from_str::(&content) { + let vif_info = old_info.to_vif(); + let pb_info = vif_info.to_pb(); + if pb_info.read_only { + self.no_write_or_delete = true; + } + self.volume_info = pb_info; + self.refresh_remote_write_mode(); + if self.volume_info.version == 0 { + self.volume_info.version = Version::current().0 as u32; + } + if !self.has_remote_file && self.volume_info.bytes_offset == 0 { + self.volume_info.bytes_offset = OFFSET_SIZE as u32; + } + if self.volume_info.bytes_offset != 0 + && self.volume_info.bytes_offset != OFFSET_SIZE as u32 + { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "bytes_offset mismatch in {}: found {}, expected {}", + path, self.volume_info.bytes_offset, OFFSET_SIZE + ), + ))); + } + return Ok(true); + } + // Fall back to legacy format + if let Ok(info) = serde_json::from_str::(&content) { + if info.read_only { + self.no_write_or_delete = true; + } + return Ok(true); + } + } + Ok(false) + } + + /// Save volume info to .vif file in protobuf-JSON format (Go-compatible). + /// Matches Go's SaveVolumeInfo: checks writability before writing and propagates errors. + fn save_vif(&self) -> Result<(), VolumeError> { + let vif_path = self.vif_path(); + + // Match Go: if file exists but is not writable, return an error + let path = std::path::Path::new(&vif_path); + if path.exists() { + let metadata = fs::metadata(path)?; + if metadata.permissions().readonly() { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::PermissionDenied, + format!("failed to check {} not writable", vif_path), + ))); + } + } + + let mut vif = VifVolumeInfo::from_pb(&self.volume_info); + vif.read_only = self.no_write_or_delete; + + // Match Go's SaveVolumeInfo: compute ExpireAtSec from TTL + let ttl_seconds = self.super_block.ttl.to_seconds(); + if ttl_seconds > 0 { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + vif.expire_at_sec = now + ttl_seconds; + } + + let content = serde_json::to_string_pretty(&vif) + .map_err(|e| VolumeError::Io(io::Error::new(io::ErrorKind::Other, e.to_string())))?; + fs::write(&vif_path, content)?; + Ok(()) + } + + /// Save full VolumeInfo to .vif file (for tiered storage). + /// Matches Go's SaveVolumeInfo which computes ExpireAtSec from TTL. + pub fn save_volume_info(&mut self) -> Result<(), VolumeError> { + self.volume_info.read_only = self.no_write_or_delete; + + // Compute ExpireAtSec from TTL (matches Go's SaveVolumeInfo) + let ttl_seconds = self.super_block.ttl.to_seconds(); + if ttl_seconds > 0 { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + self.volume_info.expire_at_sec = now + ttl_seconds; + } + + let vif = VifVolumeInfo::from_pb(&self.volume_info); + let content = serde_json::to_string_pretty(&vif) + .map_err(|e| VolumeError::Io(io::Error::new(io::ErrorKind::Other, e.to_string())))?; + fs::write(&self.vif_path(), content)?; + Ok(()) + } + + /// Get the remote storage backend name and key from this volume .vif. + pub fn remote_storage_name_key(&self) -> (String, String) { + if self.volume_info.files.is_empty() { + return (String::new(), String::new()); + } + let rf = &self.volume_info.files[0]; + let backend_name = if rf.backend_id.is_empty() { + rf.backend_type.clone() + } else { + format!("{}.{}", rf.backend_type, rf.backend_id) + }; + (backend_name, rf.key.clone()) + } + + /// Get the dat file path for this volume. + pub fn dat_path(&self) -> String { + self.file_name(".dat") + } + + /// Get the directory this volume is stored in. + pub fn dir(&self) -> &str { + &self.dir + } + + /// Throttle IO during compaction to avoid saturating disk. + pub fn maybe_throttle_compaction(&self, bytes_written: u64) { + if self.compaction_byte_per_second <= 0 || !self.is_compacting { + return; + } + // Simple throttle: sleep based on bytes written vs allowed rate + let sleep_us = + (bytes_written as f64 / self.compaction_byte_per_second as f64 * 1_000_000.0) as u64; + if sleep_us > 0 { + std::thread::sleep(std::time::Duration::from_micros(sleep_us)); + } + } + + /// Change the replication placement and rewrite the super block. + pub fn set_replica_placement(&mut self, rp: ReplicaPlacement) -> Result<(), VolumeError> { + self.super_block.replica_placement = rp; + let bytes = self.super_block.to_bytes(); + let dat_file = self.dat_file.as_mut().ok_or_else(|| { + VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open")) + })?; + dat_file.seek(SeekFrom::Start(0))?; + dat_file.write_all(&bytes)?; + dat_file.sync_all()?; + Ok(()) + } + + // ---- Binary search for incremental copy ---- + + /// Read a single index entry's offset from the .idx file by entry index. + fn read_offset_from_index(&self, m: i64) -> Result { + let idx_path = self.file_name(".idx"); + let idx_file = File::open(&idx_path)?; + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + let file_offset = m as u64 * NEEDLE_MAP_ENTRY_SIZE as u64; + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + idx_file.read_exact_at(&mut buf, file_offset)?; + } + #[cfg(not(unix))] + { + let mut f = idx_file; + f.seek(SeekFrom::Start(file_offset))?; + std::io::Read::read_exact(&mut f, &mut buf)?; + } + let (_key, offset, _size) = idx_entry_from_bytes(&buf); + Ok(offset) + } + + /// Read the append_at_ns timestamp from a needle at the given offset in the .dat file. + /// Go reads the full needle body for ALL entries including tombstones to get the + /// actual AppendAtNs timestamp, which is needed for correct binary search during + /// incremental copy. + fn read_append_at_ns(&self, offset: Offset) -> Result { + let actual_offset = offset.to_actual_offset() as u64; + let version = self.version(); + + let mut header_buf = [0u8; NEEDLE_HEADER_SIZE]; + self.read_exact_at_backend(&mut header_buf, actual_offset)?; + + let (_cookie, _id, size) = Needle::parse_header(&header_buf); + + let actual_size = get_actual_size(size, version); + if actual_size <= 0 { + return Ok(0); + } + let mut buf = vec![0u8; actual_size as usize]; + self.read_exact_at_backend(&mut buf, actual_offset)?; + + let mut n = Needle::default(); + n.read_bytes_meta_only(&mut buf, offset.to_actual_offset(), size, version)?; + Ok(n.append_at_ns) + } + + /// Search right from position m to find the first non-deleted entry. + fn read_right_ns(&self, m: i64, max: i64) -> Result<(i64, Offset, u64), VolumeError> { + let mut index = m; + loop { + index += 1; + if index >= max { + return Ok((index, Offset::default(), 0)); + } + let offset = self.read_offset_from_index(index)?; + if !offset.is_zero() { + let ts = self.read_append_at_ns(offset)?; + return Ok((index, offset, ts)); + } + } + } + + /// Search left from position m to find the first non-deleted entry. + fn read_left_ns(&self, m: i64) -> Result<(i64, Offset, u64), VolumeError> { + let mut index = m; + loop { + index -= 1; + if index < 0 { + return Ok((index, Offset::default(), 0)); + } + let offset = self.read_offset_from_index(index)?; + if !offset.is_zero() { + let ts = self.read_append_at_ns(offset)?; + return Ok((index, offset, ts)); + } + } + } + + /// Binary search through the .idx file to find the first needle + /// with append_at_ns > since_ns. Returns (offset, is_last). + /// Matches Go's BinarySearchByAppendAtNs in volume_backup.go. + pub fn binary_search_by_append_at_ns( + &self, + since_ns: u64, + ) -> Result<(Offset, bool), VolumeError> { + let file_size = self.idx_file_size() as i64; + if file_size % NEEDLE_MAP_ENTRY_SIZE as i64 != 0 { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!("unexpected idx file size: {}", file_size), + ))); + } + + let entry_count = file_size / NEEDLE_MAP_ENTRY_SIZE as i64; + let mut l: i64 = 0; + let mut h: i64 = entry_count; + + while l < h { + let m = (l + h) / 2; + + if m == entry_count { + return Ok((Offset::default(), true)); + } + + let offset = self.read_offset_from_index(m)?; + + if offset.is_zero() { + let (left_index, _left_offset, left_ns) = self.read_left_ns(m)?; + let (right_index, right_offset, right_ns) = self.read_right_ns(m, entry_count)?; + + if right_ns <= since_ns { + l = right_index; + if l == entry_count { + return Ok((Offset::default(), true)); + } else { + continue; + } + } + if since_ns < left_ns { + h = left_index + 1; + continue; + } + return Ok((right_offset, false)); + } + + let m_ns = self.read_append_at_ns(offset)?; + + if m_ns <= since_ns { + l = m + 1; + } else { + h = m; + } + } + + if l == entry_count { + return Ok((Offset::default(), true)); + } + + let offset = self.read_offset_from_index(l)?; + Ok((offset, false)) + } + + /// Write a raw needle blob at a specific offset in the .dat file. + pub fn write_needle_blob( + &mut self, + offset: i64, + needle_blob: &[u8], + ) -> Result<(), VolumeError> { + if self.is_read_only() { + return Err(VolumeError::ReadOnly); + } + let dat_file = self.dat_file.as_mut().ok_or_else(|| { + VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open")) + })?; + dat_file.seek(SeekFrom::Start(offset as u64))?; + dat_file.write_all(needle_blob)?; + Ok(()) + } + + /// Write a needle blob and update the needle map index. + /// Matches Go's Volume.WriteNeedleBlob which appends to dat and calls nm.Put. + pub fn write_needle_blob_and_index( + &mut self, + needle_id: NeedleId, + needle_blob: &[u8], + size: Size, + ) -> Result<(), VolumeError> { + // Dedup check: if the same needle already exists with matching content, skip the write. + // Matches Go's WriteNeedleBlob which reads existing needle and compares cookie+checksum+data. + if let Some(nm) = &self.nm { + if let Some(nv) = nm.get(needle_id) { + if nv.size == size { + let version = self.version(); + // Read existing needle from disk + let mut old_needle = Needle::default(); + let mut ro = ReadOption::default(); + if self + .read_needle_data_at_unlocked( + &mut old_needle, + nv.offset.to_actual_offset(), + nv.size, + &mut ro, + ) + .is_ok() + { + // Parse the incoming blob into a needle + let mut new_needle = Needle::default(); + if new_needle + .read_bytes(needle_blob, nv.offset.to_actual_offset(), size, version) + .is_ok() + { + if old_needle.cookie == new_needle.cookie + && old_needle.checksum == new_needle.checksum + && old_needle.data == new_needle.data + { + return Ok(()); + } + } + } + } + } + } + + // Check volume size limit + let content_size = self.content_size(); + if MAX_POSSIBLE_VOLUME_SIZE < content_size + needle_blob.len() as u64 { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!( + "volume size limit {} exceeded! current size is {}", + MAX_POSSIBLE_VOLUME_SIZE, content_size + ), + ))); + } + + // Compute monotonic appendAtNs (matches Go: needle.GetAppendAtNs(v.lastAppendAtNs)) + let append_at_ns = get_append_at_ns(self.last_append_at_ns); + + // Patch appendAtNs timestamp into V3 blobs (matches Go WriteNeedleBlob L64-77) + let mut blob_buf; + let blob_to_write = if self.version() == VERSION_3 { + let ts_offset = + NEEDLE_HEADER_SIZE + size.0 as usize + NEEDLE_CHECKSUM_SIZE; + if ts_offset + TIMESTAMP_SIZE > needle_blob.len() { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "needle blob buffer too small: need {} bytes, have {}", + ts_offset + TIMESTAMP_SIZE, + needle_blob.len() + ), + ))); + } + blob_buf = needle_blob.to_vec(); + blob_buf[ts_offset..ts_offset + TIMESTAMP_SIZE] + .copy_from_slice(&append_at_ns.to_be_bytes()); + &blob_buf[..] + } else { + needle_blob + }; + + // Append blob at end of dat file + let dat_size = self.dat_file_size()? as i64; + self.write_needle_blob(dat_size, blob_to_write)?; + + // Update lastAppendAtNs (matches Go L352: v.lastAppendAtNs = appendAtNs) + self.last_append_at_ns = append_at_ns; + + // Update needle map index + let offset = Offset::from_actual_offset(dat_size); + if let Some(ref mut nm) = self.nm { + nm.put(needle_id, offset, size)?; + } + + Ok(()) + } + + pub fn needs_replication(&self) -> bool { + self.super_block.replica_placement.get_copy_count() > 1 + } + + /// Garbage ratio: deleted_size / content_size (matching Go's garbageLevel). + /// content_size is the additive-only FileByteCounter. + /// + /// When DeletedCount > 0 but DeletedSize == 0 (e.g. .sdx converted back to + /// normal .idx where deleted entry sizes are missing), falls back to + /// computing deleted bytes as (datFileSize - contentSize - SuperBlockSize) + /// and uses datFileSize as the denominator. + pub fn garbage_level(&self) -> f64 { + let content = self.content_size(); + if content == 0 { + return 0.0; + } + let mut deleted = self.deleted_size(); + let mut file_size = content; + + if self.deleted_count() > 0 && deleted == 0 { + // This happens for .sdx converted back to normal .idx + // where deleted entry size is missing + let dat_file_size = self.dat_file_size().unwrap_or(0); + deleted = dat_file_size.saturating_sub(content).saturating_sub(SUPER_BLOCK_SIZE as u64); + file_size = dat_file_size; + } + + if file_size == 0 { + return 0.0; + } + deleted as f64 / file_size as f64 + } + + pub fn dat_file_size(&self) -> io::Result { + self.current_dat_file_size() + } + + /// Get the modification time of the .dat file as Unix seconds. + pub fn dat_file_mod_time(&self) -> u64 { + if let Some(dat_file) = self.dat_file.as_ref() { + dat_file + .metadata() + .ok() + .and_then(|m| m.modified().ok()) + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|d| d.as_secs()) + .unwrap_or(0) + } else { + self.remote_dat_file + .as_ref() + .map(|remote_dat_file| remote_dat_file.modified_time) + .unwrap_or(0) + } + } + + pub fn idx_file_size(&self) -> u64 { + self.nm.as_ref().map_or(0, |nm| nm.index_file_size()) + } + + // ---- Compaction / Vacuum ---- + + /// Compact the volume by copying only live needles to new .cpd/.cpx files. + /// This reads from the current .dat/.idx and writes to .cpd/.cpx. + /// Call `commit_compact()` after to swap the files. + pub fn compact_by_index( + &mut self, + _preallocate: u64, + _max_bytes_per_second: i64, + progress_fn: F, + ) -> Result<(), VolumeError> + where + F: Fn(i64) -> bool, + { + if self.is_compacting { + return Ok(()); // already compacting + } + self.is_compacting = true; + + let result = self.do_compact_by_index(progress_fn); + + self.is_compacting = false; + result + } + + fn do_compact_by_index(&mut self, progress_fn: F) -> Result<(), VolumeError> + where + F: Fn(i64) -> bool, + { + // Guard against nil needle map (matches Go's nil check before compaction sync) + if self.nm.is_none() { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!("volume {} needle map is nil", self.id), + ))); + } + + // Record state before compaction for makeupDiff + self.last_compact_index_offset = self.nm.as_ref().map_or(0, |nm| nm.index_file_size()); + self.last_compact_revision = self.super_block.compaction_revision; + + // Sync current data + self.sync_to_disk()?; + + let cpd_path = self.file_name(".cpd"); + let cpx_path = self.file_name(".cpx"); + let version = self.version(); + + // Write new super block with incremented compaction revision + let mut new_sb = self.super_block.clone(); + new_sb.compaction_revision += 1; + let sb_bytes = new_sb.to_bytes(); + + let mut dst = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(&cpd_path)?; + dst.write_all(&sb_bytes)?; + let mut new_offset = sb_bytes.len() as i64; + + // Build new index in memory + let mut new_nm = CompactNeedleMap::new(); + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + // Collect live entries from needle map (sorted ascending) + let nm = self.nm.as_ref().ok_or(VolumeError::NotInitialized)?; + let mut entries: Vec<(NeedleId, Offset, Size)> = Vec::new(); + for (id, nv) in nm.iter_entries() { + if nv.offset.is_zero() || nv.size.is_deleted() { + continue; + } + entries.push((id, nv.offset, nv.size)); + } + entries.sort_by_key(|(_, offset, _)| *offset); + + for (id, offset, size) in entries { + // Progress callback + if !progress_fn(offset.to_actual_offset()) { + // Interrupted + let _ = fs::remove_file(&cpd_path); + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Interrupted, + "compaction interrupted", + ))); + } + + // Read needle from source + let mut n = Needle { + id, + ..Needle::default() + }; + self.read_needle_data_at(&mut n, offset.to_actual_offset(), size)?; + + // Skip TTL-expired needles using the volume's TTL (matches Go's volume_vacuum.go) + if n.has_ttl() { + let ttl_minutes = self.super_block.ttl.minutes(); + if ttl_minutes > 0 && n.last_modified > 0 { + let expire_at = n.last_modified + (ttl_minutes as u64) * 60; + if now >= expire_at { + continue; + } + } + } + + // Write needle to destination + let bytes = n.write_bytes(version); + dst.write_all(&bytes)?; + + // Update new index + new_nm.put(id, Offset::from_actual_offset(new_offset), n.size)?; + new_offset += bytes.len() as i64; + } + + dst.sync_all()?; + + // Save new index + new_nm.save_to_idx(&cpx_path)?; + + Ok(()) + } + + /// Commit a previously completed compaction: swap .cpd/.cpx to .dat/.idx and reload. + /// Matches Go's isCompactionInProgress CompareAndSwap guard. + pub fn commit_compact(&mut self) -> Result<(), VolumeError> { + if self.is_compacting { + return Ok(()); // already compacting, silently skip (matches Go) + } + self.is_compacting = true; + + let result = self.do_commit_compact(); + + self.is_compacting = false; + result + } + + fn do_commit_compact(&mut self) -> Result<(), VolumeError> { + if let Err(e) = self.makeup_diff() { + warn!("makeup_diff failed: {}", e); + // Match Go: clean up .cpd/.cpx on makeup_diff failure + let cpd = self.file_name(".cpd"); + let cpx = self.file_name(".cpx"); + let _ = fs::remove_file(&cpd); + let _ = fs::remove_file(&cpx); + return Err(e); + } + + // Close current files + if let Some(ref mut nm) = self.nm { + nm.close(); + } + self.nm = None; + if let Some(ref dat_file) = self.dat_file { + let _ = dat_file.sync_all(); + } + self.dat_file = None; + self.remote_dat_file = None; + + let cpd_path = self.file_name(".cpd"); + let cpx_path = self.file_name(".cpx"); + let dat_path = self.file_name(".dat"); + let idx_path = self.file_name(".idx"); + + // Check that compact files exist + if !Path::new(&cpd_path).exists() || !Path::new(&cpx_path).exists() { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + "compact files (.cpd/.cpx) not found", + ))); + } + + // Swap files: .cpd → .dat, .cpx → .idx + fs::rename(&cpd_path, &dat_path)?; + fs::rename(&cpx_path, &idx_path)?; + + // Remove any leveldb/redb index files (rebuilt from .idx on reload) + let ldb_path = self.file_name(".ldb"); + let _ = fs::remove_dir_all(&ldb_path); + let rdb_path = self.file_name(".rdb"); + let _ = fs::remove_file(&rdb_path); + + // Reload + self.load(true, false, 0, self.version())?; + + Ok(()) + } + + /// Clean up leftover compaction files (.cpd, .cpx). + pub fn cleanup_compact(&self) -> Result<(), VolumeError> { + let cpd_path = self.file_name(".cpd"); + let cpx_path = self.file_name(".cpx"); + let cpldb_path = self.file_name(".cpldb"); + + let e1 = fs::remove_file(&cpd_path); + let e2 = fs::remove_file(&cpx_path); + let e3 = fs::remove_dir_all(&cpldb_path); + + // Ignore NotFound errors + if let Err(e) = e1 { + if e.kind() != io::ErrorKind::NotFound { + return Err(e.into()); + } + } + if let Err(e) = e2 { + if e.kind() != io::ErrorKind::NotFound { + return Err(e.into()); + } + } + if let Err(e) = e3 { + if e.kind() != io::ErrorKind::NotFound { + return Err(e.into()); + } + } + + Ok(()) + } + + /// Read any new needles appended during compaction and append them to .cpd/.cpx + fn makeup_diff(&mut self) -> Result<(), VolumeError> { + let old_idx_size = self.nm.as_ref().map_or(0, |nm| nm.index_file_size()); + if old_idx_size == 0 || old_idx_size <= self.last_compact_index_offset { + return Ok(()); + } + + let old_super_block = &self.super_block; + if old_super_block.compaction_revision != self.last_compact_revision { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!( + "current old dat file's compact revision {} is not the expected one {}", + old_super_block.compaction_revision, self.last_compact_revision + ), + ))); + } + + // Read the new .cpd file's super block and verify its compaction revision is old + 1 + let cpd_path_check = self.file_name(".cpd"); + let mut cpd_file_check = File::open(&cpd_path_check)?; + let mut sb_buf = [0u8; SUPER_BLOCK_SIZE]; + cpd_file_check.read_exact(&mut sb_buf)?; + let new_super_block = SuperBlock::from_bytes(&sb_buf)?; + let old_compact_revision = old_super_block.compaction_revision; + let new_compact_revision = new_super_block.compaction_revision; + if old_compact_revision + 1 != new_compact_revision { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!( + "old dat file's compact revision {} + 1 does not equal new dat file's compact revision {}", + old_compact_revision, new_compact_revision + ), + ))); + } + + let old_idx_path = self.file_name(".idx"); + let mut old_idx_file = File::open(&old_idx_path)?; + + // Read new entries from .idx + let mut incremented_entries = std::collections::HashMap::new(); + let offset = self.last_compact_index_offset; + + old_idx_file.seek(SeekFrom::Start(offset))?; + let entry_count = (old_idx_size - offset) / NEEDLE_MAP_ENTRY_SIZE as u64; + for _ in 0..entry_count { + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + old_idx_file.read_exact(&mut buf)?; + let (key, needle_offset, size) = crate::storage::types::idx_entry_from_bytes(&buf); + incremented_entries.insert(key, (needle_offset, size)); + } + + if incremented_entries.is_empty() { + return Ok(()); + } + + let cpd_path = self.file_name(".cpd"); + let cpx_path = self.file_name(".cpx"); + + let mut dst_dat = OpenOptions::new().read(true).write(true).open(&cpd_path)?; + let mut dst_idx = OpenOptions::new() + .write(true) + .append(true) + .open(&cpx_path)?; + + let mut dat_offset = dst_dat.seek(SeekFrom::End(0))?; + let padding_rem = dat_offset % NEEDLE_PADDING_SIZE as u64; + if padding_rem != 0 { + dat_offset += NEEDLE_PADDING_SIZE as u64 - padding_rem; + dst_dat.seek(SeekFrom::Start(dat_offset))?; + } + + let version = self.version(); + let old_dat_path = self.file_name(".dat"); + let old_dat_file = File::open(&old_dat_path)?; + + for (key, (needle_offset, size)) in incremented_entries { + if !needle_offset.is_zero() && !size.is_deleted() && size.0 > 0 { + let actual_size = crate::storage::needle::needle::get_actual_size(size, version); + let mut blob = vec![0u8; actual_size as usize]; + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + old_dat_file + .read_exact_at(&mut blob, needle_offset.to_actual_offset() as u64)?; + } + #[cfg(windows)] + { + crate::storage::volume::read_exact_at( + &old_dat_file, + &mut blob, + needle_offset.to_actual_offset() as u64, + )?; + } + + dst_dat.write_all(&blob)?; + + let mut idx_entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + crate::storage::types::idx_entry_to_bytes( + &mut idx_entry_buf, + key, + Offset::from_actual_offset(dat_offset as i64), + size, + ); + dst_idx.write_all(&idx_entry_buf)?; + + dat_offset += actual_size as u64; + } else { + let mut fake_del_needle = Needle { + id: key, + cookie: Cookie(0x12345678), + append_at_ns: SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() as u64, + ..Needle::default() + }; + let bytes = fake_del_needle.write_bytes(version); + dst_dat.write_all(&bytes)?; + + let mut idx_entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + crate::storage::types::idx_entry_to_bytes( + &mut idx_entry_buf, + key, + Offset::from_actual_offset(0), + Size(crate::storage::types::TOMBSTONE_FILE_SIZE.into()), + ); + dst_idx.write_all(&idx_entry_buf)?; + + dat_offset += bytes.len() as u64; + } + } + + dst_dat.sync_all()?; + dst_idx.sync_all()?; + + Ok(()) + } + + // ---- Sync / Close ---- + + pub fn sync_to_disk(&mut self) -> io::Result<()> { + if let Some(ref dat_file) = self.dat_file { + dat_file.sync_all()?; + } + if let Some(ref nm) = self.nm { + nm.sync()?; + } + Ok(()) + } + + pub fn close(&mut self) { + if let Some(ref dat_file) = self.dat_file { + let _ = dat_file.sync_all(); + } + self.dat_file = None; + self.remote_dat_file = None; + if let Some(ref nm) = self.nm { + let _ = nm.sync(); + } + self.nm = None; + } + + /// Remove all volume files from disk. + pub fn destroy(&mut self, only_empty: bool) -> Result<(), VolumeError> { + if only_empty && self.file_count() > 0 { + return Err(VolumeError::NotEmpty); + } + if self.is_compacting { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!("volume {} is compacting", self.id), + ))); + } + + let (storage_name, storage_key) = self.remote_storage_name_key(); + if self.has_remote_file && !storage_name.is_empty() && !storage_key.is_empty() { + let backend = crate::remote_storage::s3_tier::global_s3_tier_registry() + .read() + .unwrap() + .get(&storage_name); + if let Some(backend) = backend { + if let Err(e) = backend.delete_file_blocking(&storage_key) { + warn!( + volume_id = self.id.0, + storage_name, + storage_key, + error = %e, + "failed to delete remote tier file during destroy" + ); + } + } else { + warn!( + volume_id = self.id.0, + storage_name, storage_key, "remote tier backend not found during destroy" + ); + } + } + + self.close(); + remove_volume_files(&self.data_file_name()); + remove_volume_files(&self.index_file_name()); + Ok(()) + } + + /// Check if an I/O error is EIO (errno 5) and record it for health monitoring. + /// On success (None), clears any previously recorded EIO error. + /// Matches Go's `checkReadWriteError` in volume_write.go. + fn check_read_write_error(&self, err: Option<&io::Error>) { + if let Some(e) = err { + if e.raw_os_error() == Some(5) { + // EIO — record it + if let Ok(mut guard) = self.last_io_error.lock() { + *guard = Some(e.to_string()); + } + } + } else { + // Success — clear any previous EIO + if let Ok(mut guard) = self.last_io_error.lock() { + if guard.is_some() { + *guard = None; + } + } + } + } + + /// Returns the last recorded I/O error string, if any. + #[allow(dead_code)] + pub fn last_io_error(&self) -> Option { + self.last_io_error.lock().ok()?.clone() + } + + #[cfg(test)] + pub(crate) fn set_last_io_error_for_test(&self, err: Option<&str>) { + if let Ok(mut guard) = self.last_io_error.lock() { + *guard = err.map(|value| value.to_string()); + } + } + + #[cfg(test)] + pub(crate) fn set_last_modified_ts_for_test(&mut self, ts_seconds: u64) { + self.last_modified_ts_seconds = ts_seconds; + } +} + +// ============================================================================ +// Helpers +// ============================================================================ + +/// Generate volume file base name: dir/collection_id or dir/id +pub fn volume_file_name(dir: &str, collection: &str, id: VolumeId) -> String { + if collection.is_empty() { + format!("{}/{}", dir, id.0) + } else { + format!("{}/{}_{}", dir, collection, id.0) + } +} + +/// Generate a monotonically increasing append timestamp. +fn get_append_at_ns(last: u64) -> u64 { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + if now <= last { + last + 1 + } else { + now + } +} + +/// Remove all files associated with a volume. +pub(crate) fn remove_volume_files(base: &str) { + for ext in &[ + ".dat", ".idx", ".vif", ".sdx", ".cpd", ".cpx", ".note", ".rdb", + ] { + let _ = fs::remove_file(format!("{}{}", base, ext)); + } + // leveldb uses a directory + let _ = fs::remove_dir_all(format!("{}.ldb", base)); +} + +// ============================================================================ +// ScanVolumeFile — iterate all needles in a .dat file +// ============================================================================ + +/// Callback for scanning needles in a volume file. +pub trait VolumeFileVisitor { + fn visit_super_block(&mut self, sb: &SuperBlock) -> Result<(), VolumeError>; + fn read_needle_body(&self) -> bool; + fn visit_needle(&mut self, n: &Needle, offset: i64) -> Result<(), VolumeError>; +} + +/// Scan all needles in a volume's .dat file. +pub fn scan_volume_file( + dat_path: &str, + visitor: &mut dyn VolumeFileVisitor, +) -> Result<(), VolumeError> { + let mut file = File::open(dat_path)?; + + // Read super block + let mut sb_buf = [0u8; SUPER_BLOCK_SIZE]; + file.read_exact(&mut sb_buf)?; + let sb = SuperBlock::from_bytes(&sb_buf)?; + visitor.visit_super_block(&sb)?; + + let version = sb.version; + let mut offset = sb.block_size() as i64; + + loop { + // Read needle header + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + file.seek(SeekFrom::Start(offset as u64))?; + match file.read_exact(&mut header) { + Ok(()) => {} + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e.into()), + } + + let (_cookie, _id, size) = Needle::parse_header(&header); + + if size.0 == 0 && _id.is_empty() { + break; // end of valid data + } + + let body_length = needle::needle_body_length(size, version); + let total_size = NEEDLE_HEADER_SIZE as i64 + body_length; + + // Skip full body parsing for deleted needles (tombstone or negative size) + if size.is_deleted() || size.0 <= 0 { + let mut n = Needle::default(); + n.read_header(&header); + visitor.visit_needle(&n, offset)?; + } else if visitor.read_needle_body() { + let mut buf = vec![0u8; total_size as usize]; + file.seek(SeekFrom::Start(offset as u64))?; + file.read_exact(&mut buf)?; + + let mut n = Needle::default(); + n.read_bytes(&buf, offset, size, version)?; + visitor.visit_needle(&n, offset)?; + } else { + let mut n = Needle::default(); + n.read_header(&header); + visitor.visit_needle(&n, offset)?; + } + + offset += total_size; + } + + Ok(()) +} + +/// Reserve disk blocks for a file without changing its visible size. +/// On Linux, uses `fallocate(FALLOC_FL_KEEP_SIZE)` to actually reserve blocks. +/// On other platforms, this is a no-op. +fn preallocate_file(file: &File, size: u64) { + #[cfg(target_os = "linux")] + { + use std::os::unix::io::AsRawFd; + let fd = file.as_raw_fd(); + // FALLOC_FL_KEEP_SIZE = 1: allocate blocks without changing file size + let ret = unsafe { libc::fallocate(fd, 1, 0, size as libc::off_t) }; + if ret == 0 { + tracing::info!(bytes = size, "preallocated disk space"); + } else { + tracing::warn!( + bytes = size, + error = %io::Error::last_os_error(), + "fallocate failed" + ); + } + } + #[cfg(not(target_os = "linux"))] + { + let _ = (file, size); + tracing::debug!(bytes = size, "preallocation not supported on this platform"); + } +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::needle::crc::CRC; + use tempfile::TempDir; + + fn spawn_fake_s3_server(body: Vec) -> (String, tokio::sync::oneshot::Sender<()>) { + use axum::http::{header, HeaderMap, HeaderValue, StatusCode}; + use axum::routing::any; + use axum::Router; + + let body = Arc::new(body); + let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + listener.set_nonblocking(true).unwrap(); + let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>(); + + std::thread::spawn(move || { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + runtime.block_on(async move { + let app = Router::new().fallback(any(move |headers: HeaderMap| { + let body = body.clone(); + async move { + let bytes = body.as_ref(); + if let Some(range) = headers + .get(header::RANGE) + .and_then(|value| value.to_str().ok()) + { + if let Some(spec) = range.strip_prefix("bytes=") { + let (start, end) = spec.split_once('-').unwrap(); + let start = start.parse::().unwrap(); + let end = if end.is_empty() { + bytes.len().saturating_sub(1) + } else { + end.parse::().unwrap() + } + .min(bytes.len().saturating_sub(1)); + let chunk = bytes[start..=end].to_vec(); + let mut response_headers = HeaderMap::new(); + response_headers.insert( + header::CONTENT_LENGTH, + HeaderValue::from_str(&chunk.len().to_string()).unwrap(), + ); + response_headers.insert( + header::CONTENT_RANGE, + HeaderValue::from_str(&format!( + "bytes {}-{}/{}", + start, + end, + bytes.len() + )) + .unwrap(), + ); + return (StatusCode::PARTIAL_CONTENT, response_headers, chunk); + } + } + + let mut response_headers = HeaderMap::new(); + response_headers.insert( + header::CONTENT_LENGTH, + HeaderValue::from_str(&bytes.len().to_string()).unwrap(), + ); + (StatusCode::OK, response_headers, bytes.to_vec()) + } + })); + + let listener = tokio::net::TcpListener::from_std(listener).unwrap(); + axum::serve(listener, app) + .with_graceful_shutdown(async move { + let _ = shutdown_rx.await; + }) + .await + .unwrap(); + }); + }); + + (format!("http://{}", addr), shutdown_tx) + } + + fn make_test_volume(dir: &str) -> Volume { + Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap() + } + + #[test] + fn test_data_file_access_control_blocks_writer_until_reader_releases() { + let control = Arc::new(DataFileAccessControl::default()); + let read_lease = control.read_lock(); + let writer_control = control.clone(); + let acquired = Arc::new(std::sync::atomic::AtomicBool::new(false)); + let acquired_clone = acquired.clone(); + + let writer = std::thread::spawn(move || { + let _write_lease = writer_control.write_lock(); + acquired_clone.store(true, std::sync::atomic::Ordering::Relaxed); + }); + + std::thread::sleep(std::time::Duration::from_millis(50)); + assert!(!acquired.load(std::sync::atomic::Ordering::Relaxed)); + + drop(read_lease); + writer.join().unwrap(); + + assert!(acquired.load(std::sync::atomic::Ordering::Relaxed)); + } + + #[test] + fn test_volume_file_name() { + assert_eq!(volume_file_name("/data", "", VolumeId(1)), "/data/1"); + assert_eq!( + volume_file_name("/data", "pics", VolumeId(42)), + "/data/pics_42" + ); + } + + #[test] + fn test_volume_create_and_load() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let v = make_test_volume(dir); + assert_eq!(v.version(), VERSION_3); + assert_eq!(v.file_count(), 0); + assert_eq!(v.content_size(), 0); + + // .dat and .idx files should exist + assert!(Path::new(&v.file_name(".dat")).exists()); + assert!(Path::new(&v.file_name(".idx")).exists()); + } + + #[test] + fn test_volume_write_read() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + // Write a needle + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0x12345678), + data: b"hello world".to_vec(), + data_size: 11, + flags: 0, + ..Needle::default() + }; + let (offset, size, unchanged) = v.write_needle(&mut n, true).unwrap(); + assert!(!unchanged); + assert!(offset > 0); // after superblock + assert!(size.0 > 0); + assert_eq!(v.file_count(), 1); + + // Read it back + let mut read_n = Needle { + id: NeedleId(1), + ..Needle::default() + }; + let count = v.read_needle(&mut read_n).unwrap(); + assert_eq!(count, 11); + assert_eq!(read_n.data, b"hello world"); + assert_eq!(read_n.cookie, Cookie(0x12345678)); + } + + #[test] + fn test_volume_write_dedup() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0xaa), + data: b"same data".to_vec(), + data_size: 9, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + // Write same needle again — should be unchanged + let mut n2 = Needle { + id: NeedleId(1), + cookie: Cookie(0xaa), + data: b"same data".to_vec(), + data_size: 9, + ..Needle::default() + }; + n2.checksum = CRC::new(&n2.data); + let (_, _, unchanged) = v.write_needle(&mut n2, true).unwrap(); + assert!(unchanged); + } + + #[test] + fn test_volume_delete() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0xbb), + data: b"delete me".to_vec(), + data_size: 9, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + assert_eq!(v.file_count(), 1); + + let deleted_size = v + .delete_needle(&mut Needle { + id: NeedleId(1), + cookie: Cookie(0xbb), + ..Needle::default() + }) + .unwrap(); + assert!(deleted_size.0 > 0); + // Additive-only: file_count stays at 1 after delete + assert_eq!(v.file_count(), 1); + assert_eq!(v.deleted_count(), 1); + + // Read should fail with Deleted + let mut read_n = Needle { + id: NeedleId(1), + ..Needle::default() + }; + let err = v.read_needle(&mut read_n).unwrap_err(); + assert!(matches!(err, VolumeError::Deleted)); + } + + #[test] + fn test_volume_multiple_needles() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + for i in 1..=10 { + let data = format!("needle data {}", i); + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: data.as_bytes().to_vec(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + + assert_eq!(v.file_count(), 10); + assert_eq!(v.max_file_key(), NeedleId(10)); + + // Read back needle 5 + let mut n = Needle { + id: NeedleId(5), + ..Needle::default() + }; + v.read_needle(&mut n).unwrap(); + assert_eq!(n.data, b"needle data 5"); + } + + #[test] + fn test_volume_reload_from_disk() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Write some needles + { + let mut v = make_test_volume(dir); + for i in 1..=3 { + let data = format!("data {}", i); + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: data.as_bytes().to_vec(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + v.sync_to_disk().unwrap(); + } + + // Reload and verify + let v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + assert_eq!(v.file_count(), 3); + + let mut n = Needle { + id: NeedleId(2), + ..Needle::default() + }; + v.read_needle(&mut n).unwrap(); + assert_eq!(std::str::from_utf8(&n.data).unwrap(), "data 2"); + } + + #[test] + fn test_volume_cookie_mismatch() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0xaa), + data: b"original".to_vec(), + data_size: 8, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + // Write with wrong cookie + let mut n2 = Needle { + id: NeedleId(1), + cookie: Cookie(0xbb), + data: b"overwrite".to_vec(), + data_size: 9, + ..Needle::default() + }; + let err = v.write_needle(&mut n2, true).unwrap_err(); + assert!(matches!(err, VolumeError::CookieMismatch(_))); + } + + #[test] + fn test_volume_destroy() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let dat_path; + let idx_path; + + { + let mut v = make_test_volume(dir); + dat_path = v.file_name(".dat"); + idx_path = v.file_name(".idx"); + assert!(Path::new(&dat_path).exists()); + v.destroy(false).unwrap(); + } + + assert!(!Path::new(&dat_path).exists()); + assert!(!Path::new(&idx_path).exists()); + } + + #[test] + fn test_read_all_needles_uses_dat_order_for_live_offsets() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + let mut first = Needle { + id: NeedleId(10), + cookie: Cookie(0x11223344), + data: b"first".to_vec(), + data_size: 5, + ..Needle::default() + }; + v.write_needle(&mut first, true).unwrap(); + + let mut second = Needle { + id: NeedleId(20), + cookie: Cookie(0x55667788), + data: b"second".to_vec(), + data_size: 6, + ..Needle::default() + }; + v.write_needle(&mut second, true).unwrap(); + + let mut first_overwrite = Needle { + id: NeedleId(10), + cookie: Cookie(0x11223344), + data: b"first-overwrite".to_vec(), + data_size: 15, + ..Needle::default() + }; + v.write_needle(&mut first_overwrite, true).unwrap(); + + let needles = v.read_all_needles().unwrap(); + let ids: Vec = needles.iter().map(|n| u64::from(n.id)).collect(); + let bodies: Vec<&[u8]> = needles.iter().map(|n| n.data.as_slice()).collect(); + + assert_eq!(ids, vec![20, 10]); + assert_eq!(bodies, vec![b"second".as_slice(), b"first-overwrite".as_slice()]); + } + + #[test] + fn test_get_append_at_ns() { + let t1 = get_append_at_ns(0); + assert!(t1 > 0); + let t2 = get_append_at_ns(t1); + assert!(t2 > t1); + // If we pass a future timestamp, should return last+1 + let future = u64::MAX - 1; + let t3 = get_append_at_ns(future); + assert_eq!(t3, future + 1); + } + + #[test] + fn test_volume_compact() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + // Write 3 needles + for i in 1..=3u64 { + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: format!("data-{}", i).into_bytes(), + data_size: format!("data-{}", i).len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + assert_eq!(v.file_count(), 3); + + // Delete needle 2 + let mut del = Needle { + id: NeedleId(2), + cookie: Cookie(2), + ..Needle::default() + }; + v.delete_needle(&mut del).unwrap(); + // Additive-only: file_count stays at 3 after delete + assert_eq!(v.file_count(), 3); + assert_eq!(v.deleted_count(), 1); + + let dat_size_before = v.dat_file_size().unwrap(); + + // Compact + v.compact_by_index(0, 0, |_| true).unwrap(); + + // Verify compact files exist + assert!(Path::new(&v.file_name(".cpd")).exists()); + assert!(Path::new(&v.file_name(".cpx")).exists()); + + // Commit: swap files and reload + v.commit_compact().unwrap(); + + // After compaction: 2 live needles, 0 deleted + assert_eq!(v.file_count(), 2); + assert_eq!(v.deleted_count(), 0); + + // Dat should be smaller (deleted needle removed) + let dat_size_after = v.dat_file_size().unwrap(); + assert!( + dat_size_after < dat_size_before, + "dat should shrink after compact" + ); + + // Read back live needles + let mut n1 = Needle { + id: NeedleId(1), + ..Needle::default() + }; + v.read_needle(&mut n1).unwrap(); + assert_eq!(n1.data, b"data-1"); + + let mut n3 = Needle { + id: NeedleId(3), + ..Needle::default() + }; + v.read_needle(&mut n3).unwrap(); + assert_eq!(n3.data, b"data-3"); + + // Needle 2 should not exist + let mut n2 = Needle { + id: NeedleId(2), + ..Needle::default() + }; + assert!(v.read_needle(&mut n2).is_err()); + + // Compact files should not exist after commit + assert!(!Path::new(&v.file_name(".cpd")).exists()); + assert!(!Path::new(&v.file_name(".cpx")).exists()); + + // Cleanup should be a no-op + v.cleanup_compact().unwrap(); + } + + #[test] + fn test_compaction_revision_relookup() { + // Verifies that re_lookup_needle_data_offset returns the correct data offset + // and compaction revision, and that after compaction the offset changes. + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + // Write two needles + let mut n1 = Needle { + id: NeedleId(1), + cookie: Cookie(0xAABBCCDD), + data: b"first-needle-data".to_vec(), + data_size: 17, + ..Needle::default() + }; + v.write_needle(&mut n1, true).unwrap(); + + let mut n2 = Needle { + id: NeedleId(2), + cookie: Cookie(0x11223344), + data: b"second-needle-data".to_vec(), + data_size: 18, + ..Needle::default() + }; + v.write_needle(&mut n2, true).unwrap(); + + // Get initial revision and offset for needle 1 + let initial_rev = v.super_block.compaction_revision; + let (initial_offset, rev) = v.re_lookup_needle_data_offset(NeedleId(1)).unwrap(); + assert_eq!(rev, initial_rev); + assert!(initial_offset > 0, "data offset should be positive"); + + // Delete needle 2 so compaction removes it + let mut del_n2 = Needle { + id: NeedleId(2), + cookie: Cookie(0x11223344), + ..Needle::default() + }; + v.delete_needle(&mut del_n2).unwrap(); + + // Compact the volume — this increments compaction_revision and may move needles + v.compact_by_index(0, 0, |_| true).unwrap(); + v.commit_compact().unwrap(); + + // After compaction, the revision should have changed + let new_rev = v.super_block.compaction_revision; + assert_eq!( + new_rev, + initial_rev + 1, + "compaction should increment revision" + ); + + // Re-lookup needle 1 — should still be found with the new revision + let (new_offset, relookup_rev) = v.re_lookup_needle_data_offset(NeedleId(1)).unwrap(); + assert_eq!(relookup_rev, new_rev); + assert!(new_offset > 0, "data offset should still be positive"); + + // The data should still be readable correctly after compaction + let mut read_n1 = Needle { + id: NeedleId(1), + ..Needle::default() + }; + v.read_needle(&mut read_n1).unwrap(); + assert_eq!(read_n1.data, b"first-needle-data"); + + // Deleted needle should not be found + let result = v.re_lookup_needle_data_offset(NeedleId(2)); + assert!( + result.is_err(), + "deleted needle should not be found after compaction" + ); + } + + #[test] + fn test_stream_info_includes_compaction_revision() { + // Verifies that NeedleStreamInfo carries the volume's compaction revision + // so that StreamingBody can detect when compaction has occurred. + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + // Write a needle large enough to have meaningful data + let data = vec![0xABu8; 2048]; + let mut n = Needle { + id: NeedleId(42), + cookie: Cookie(0xDEADBEEF), + data: data.clone(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + // Read stream info + let mut read_n = Needle { + id: NeedleId(42), + cookie: Cookie(0xDEADBEEF), + ..Needle::default() + }; + let info = v.read_needle_stream_info(&mut read_n, false).unwrap(); + + assert_eq!(info.volume_id, VolumeId(1)); + assert_eq!(info.needle_id, NeedleId(42)); + assert_eq!(info.compaction_revision, v.super_block.compaction_revision); + assert_eq!(info.data_size, data.len() as u32); + assert!(info.data_file_offset > 0); + } + + #[test] + fn test_remote_vif_load_blocks_writes_but_allows_delete() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let dat_size_before_reload = { + let mut v = make_test_volume(dir); + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0x1234), + data: b"remote".to_vec(), + data_size: 6, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + let vif = VifVolumeInfo { + files: vec![VifRemoteFile { + backend_type: "s3".to_string(), + backend_id: "default".to_string(), + key: "remote-key".to_string(), + offset: 0, + file_size: v.dat_file_size().unwrap(), + modified_time: 123, + extension: ".dat".to_string(), + }], + version: v.version().0 as u32, + ..VifVolumeInfo::default() + }; + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + + v.dat_file_size().unwrap() + }; + + let mut v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + assert!(v.is_read_only()); + assert!(!v.no_write_or_delete); + assert!(v.no_write_can_delete); + + let err = v + .write_needle( + &mut Needle { + id: NeedleId(2), + cookie: Cookie(0x5678), + data: b"blocked".to_vec(), + data_size: 7, + ..Needle::default() + }, + true, + ) + .unwrap_err(); + assert!(matches!(err, VolumeError::ReadOnly)); + + let deleted_size = v + .delete_needle(&mut Needle { + id: NeedleId(1), + cookie: Cookie(0x1234), + ..Needle::default() + }) + .unwrap(); + assert!(deleted_size.0 > 0); + assert_eq!(v.dat_file_size().unwrap(), dat_size_before_reload); + } + + #[test] + fn test_set_writable_keeps_remote_delete_only_mode() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + v.volume_info.files.push(PbRemoteFile { + backend_type: "s3".to_string(), + backend_id: "default".to_string(), + key: "remote-key".to_string(), + offset: 0, + file_size: v.dat_file_size().unwrap(), + modified_time: 123, + extension: ".dat".to_string(), + }); + v.refresh_remote_write_mode(); + v.set_writable().unwrap(); + + assert!(v.is_read_only()); + assert!(!v.no_write_or_delete); + assert!(v.no_write_can_delete); + } + + #[test] + fn test_load_vif_defaults_local_version_and_bytes_offset() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + { + let _v = make_test_volume(dir); + let vif = VifVolumeInfo::default(); + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + } + + let v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + assert_eq!(v.volume_info.version, Version::current().0 as u32); + assert_eq!(v.volume_info.bytes_offset, OFFSET_SIZE as u32); + assert_eq!(v.version(), Version::current()); + } + + #[test] + fn test_version_superblock_overrides_vif_version() { + // Go behavior: after reading the superblock, volumeInfo.Version is set + // to SuperBlock.Version, overriding whatever was in the .vif file. + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + { + let _v = make_test_volume(dir); + // Write a .vif with version=2, but the .dat superblock is version=3 + let vif = VifVolumeInfo { + version: VERSION_2.0 as u32, + bytes_offset: OFFSET_SIZE as u32, + ..VifVolumeInfo::default() + }; + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + } + + let v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + // Superblock version (3) overrides the .vif version (2) + assert_eq!(v.volume_info.version, VERSION_3.0 as u32); + assert_eq!(v.version(), VERSION_3); + } + + #[test] + fn test_load_vif_rejects_bytes_offset_mismatch() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + { + let _v = make_test_volume(dir); + let vif = VifVolumeInfo { + version: Version::current().0 as u32, + bytes_offset: (OFFSET_SIZE as u32) + 1, + ..VifVolumeInfo::default() + }; + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + } + + let result = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ); + + match result { + Ok(_) => panic!("expected bytes_offset mismatch to fail"), + Err(VolumeError::Io(io_err)) => { + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); + assert!(io_err.to_string().contains("bytes_offset mismatch")); + } + Err(other) => panic!("unexpected error: {other:?}"), + } + } + + #[test] + fn test_remote_only_volume_load_reads_from_tier_backend() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let dat_bytes = { + let mut v = make_test_volume(dir); + let mut n = Needle { + id: NeedleId(7), + cookie: Cookie(0x7788), + data: b"remote-only".to_vec(), + data_size: 11, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + v.sync_to_disk().unwrap(); + std::fs::read(v.file_name(".dat")).unwrap() + }; + + let dat_path = format!("{}/1.dat", dir); + std::fs::remove_file(&dat_path).unwrap(); + + let (endpoint, shutdown_tx) = spawn_fake_s3_server(dat_bytes.clone()); + crate::remote_storage::s3_tier::global_s3_tier_registry() + .write() + .unwrap() + .clear(); + let tier_config = crate::remote_storage::s3_tier::S3TierConfig { + access_key: "access".to_string(), + secret_key: "secret".to_string(), + region: "us-east-1".to_string(), + bucket: "bucket-a".to_string(), + endpoint, + storage_class: "STANDARD".to_string(), + force_path_style: true, + }; + { + let mut registry = crate::remote_storage::s3_tier::global_s3_tier_registry() + .write() + .unwrap(); + registry.register( + "s3.default".to_string(), + crate::remote_storage::s3_tier::S3TierBackend::new(&tier_config), + ); + registry.register( + "s3".to_string(), + crate::remote_storage::s3_tier::S3TierBackend::new(&tier_config), + ); + } + + let vif = VifVolumeInfo { + files: vec![VifRemoteFile { + backend_type: "s3".to_string(), + backend_id: "default".to_string(), + key: "remote-key".to_string(), + offset: 0, + file_size: dat_bytes.len() as u64, + modified_time: 123, + extension: ".dat".to_string(), + }], + version: Version::current().0 as u32, + bytes_offset: OFFSET_SIZE as u32, + dat_file_size: dat_bytes.len() as i64, + ..VifVolumeInfo::default() + }; + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + + let v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + assert!(v.has_remote_file); + assert!(v.dat_file.is_none()); + assert!(v.remote_dat_file.is_some()); + + let mut n = Needle { + id: NeedleId(7), + ..Needle::default() + }; + let size = v.read_needle(&mut n).unwrap(); + assert_eq!(size, 11); + assert_eq!(n.data, b"remote-only"); + + let mut meta = Needle { + id: NeedleId(7), + ..Needle::default() + }; + let info = v.read_needle_stream_info(&mut meta, false).unwrap(); + assert!(matches!(info.source, NeedleStreamSource::Remote(_))); + let mut streamed = vec![0u8; info.data_size as usize]; + info.source + .read_exact_at(&mut streamed, info.data_file_offset) + .unwrap(); + assert_eq!(streamed, b"remote-only"); + assert_eq!(meta.data_size, 11); + + let _ = shutdown_tx.send(()); + } + + /// Volume destroy removes .vif alongside the primary data files. + #[test] + fn test_destroy_removes_vif() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let mut v = make_test_volume(dir); + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(1), + data: b"test".to_vec(), + data_size: 4, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + // Write a .vif file (as EC encode would) + let vif_path = format!("{}/1.vif", dir); + std::fs::write(&vif_path, r#"{"version":3}"#).unwrap(); + assert!(std::path::Path::new(&vif_path).exists()); + + // .dat and .idx should exist + let dat_path = format!("{}/1.dat", dir); + let idx_path = format!("{}/1.idx", dir); + assert!(std::path::Path::new(&dat_path).exists()); + assert!(std::path::Path::new(&idx_path).exists()); + + // Destroy the volume + v.destroy(false).unwrap(); + + // .dat and .idx should be gone + assert!( + !std::path::Path::new(&dat_path).exists(), + ".dat should be removed" + ); + assert!( + !std::path::Path::new(&idx_path).exists(), + ".idx should be removed" + ); + + assert!( + !std::path::Path::new(&vif_path).exists(), + ".vif should be removed" + ); + } + + /// Volume destroy with separate idx directory must clean up both dirs and .vif. + #[test] + fn test_destroy_with_separate_idx_dir() { + let dat_tmp = TempDir::new().unwrap(); + let idx_tmp = TempDir::new().unwrap(); + let dat_dir = dat_tmp.path().to_str().unwrap(); + let idx_dir = idx_tmp.path().to_str().unwrap(); + + let mut v = Volume::new( + dat_dir, + idx_dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(1), + data: b"hello".to_vec(), + data_size: 5, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + // Write .vif in data dir (as EC encode would) + let vif_path = format!("{}/1.vif", dat_dir); + std::fs::write(&vif_path, r#"{"version":3}"#).unwrap(); + + let dat_path = format!("{}/1.dat", dat_dir); + let idx_path = format!("{}/1.idx", idx_dir); + assert!(std::path::Path::new(&dat_path).exists()); + assert!(std::path::Path::new(&idx_path).exists()); + + v.destroy(false).unwrap(); + + assert!( + !std::path::Path::new(&dat_path).exists(), + ".dat removed from data dir" + ); + assert!( + !std::path::Path::new(&idx_path).exists(), + ".idx removed from idx dir" + ); + assert!( + !std::path::Path::new(&vif_path).exists(), + ".vif removed from data dir" + ); + } +} diff --git a/seaweed-volume/src/version.rs b/seaweed-volume/src/version.rs new file mode 100644 index 000000000..413a526b1 --- /dev/null +++ b/seaweed-volume/src/version.rs @@ -0,0 +1,79 @@ +//! Version helpers aligned with Go's util/version package. + +use std::sync::OnceLock; + +#[cfg(feature = "5bytes")] +const SIZE_LIMIT: &str = "8000GB"; // Matches Go production builds (5BytesOffset) +#[cfg(not(feature = "5bytes"))] +const SIZE_LIMIT: &str = "30GB"; // Matches Go default build (!5BytesOffset) + +pub fn size_limit() -> &'static str { + SIZE_LIMIT +} + +pub fn commit() -> &'static str { + option_env!("SEAWEEDFS_COMMIT") + .or(option_env!("GIT_COMMIT")) + .or(option_env!("GIT_SHA")) + .unwrap_or("") +} + +pub fn version_number() -> &'static str { + static VERSION_NUMBER: OnceLock = OnceLock::new(); + VERSION_NUMBER + .get_or_init(|| { + parse_go_version_number().unwrap_or_else(|| env!("CARGO_PKG_VERSION").to_string()) + }) + .as_str() +} + +pub fn version() -> &'static str { + static VERSION: OnceLock = OnceLock::new(); + VERSION + .get_or_init(|| format!("{} {}", size_limit(), version_number())) + .as_str() +} + +pub fn full_version() -> &'static str { + static FULL: OnceLock = OnceLock::new(); + FULL.get_or_init(|| format!("{} {}", version(), commit())) + .as_str() +} + +pub fn server_header() -> &'static str { + static HEADER: OnceLock = OnceLock::new(); + HEADER + .get_or_init(|| format!("SeaweedFS Volume {}", version())) + .as_str() +} + +fn parse_go_version_number() -> Option { + let src = include_str!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/util/version/constants.go" + )); + let mut major: Option = None; + let mut minor: Option = None; + for line in src.lines() { + let l = line.trim(); + if l.starts_with("MAJOR_VERSION") { + major = parse_int32_line(l); + } else if l.starts_with("MINOR_VERSION") { + minor = parse_int32_line(l); + } + if major.is_some() && minor.is_some() { + break; + } + } + match (major, minor) { + (Some(maj), Some(min)) => Some(format!("{}.{}", maj, format!("{:02}", min))), + _ => None, + } +} + +fn parse_int32_line(line: &str) -> Option { + let start = line.find("int32(")? + "int32(".len(); + let rest = &line[start..]; + let end = rest.find(')')?; + rest[..end].trim().parse::().ok() +} diff --git a/seaweed-volume/tests/http_integration.rs b/seaweed-volume/tests/http_integration.rs new file mode 100644 index 000000000..c1a69248f --- /dev/null +++ b/seaweed-volume/tests/http_integration.rs @@ -0,0 +1,677 @@ +//! Integration tests for the volume server HTTP handlers. +//! +//! Uses axum's Router with tower::ServiceExt::oneshot to test +//! end-to-end without starting a real TCP server. + +use std::sync::{Arc, RwLock}; + +use axum::body::Body; +use axum::extract::connect_info::ConnectInfo; +use axum::http::{Request, StatusCode}; +use tower::ServiceExt; // for `oneshot` + +use seaweed_volume::security::{Guard, SigningKey}; +use seaweed_volume::server::volume_server::{ + build_admin_router, build_admin_router_with_ui, build_metrics_router, build_public_router, + VolumeServerState, +}; +use seaweed_volume::storage::needle_map::NeedleMapKind; +use seaweed_volume::storage::store::Store; +use seaweed_volume::storage::types::{DiskType, Version, VolumeId}; + +use tempfile::TempDir; + +/// Create a test VolumeServerState with a temp directory, a single disk +/// location, and one pre-created volume (VolumeId 1). +fn test_state() -> (Arc, TempDir) { + test_state_with_guard(Vec::new(), Vec::new()) +} + +fn test_state_with_signing_key(signing_key: Vec) -> (Arc, TempDir) { + test_state_with_guard(Vec::new(), signing_key) +} + +fn test_state_with_whitelist(whitelist: Vec) -> (Arc, TempDir) { + test_state_with_guard(whitelist, Vec::new()) +} + +fn test_state_with_guard( + whitelist: Vec, + signing_key: Vec, +) -> (Arc, TempDir) { + let tmp = TempDir::new().expect("failed to create temp dir"); + let dir = tmp.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 10, + DiskType::HardDrive, + seaweed_volume::config::MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .expect("failed to add location"); + store + .add_volume( + VolumeId(1), + "", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .expect("failed to create volume"); + + let guard = Guard::new( + &whitelist, + SigningKey(signing_key), + 0, + SigningKey(vec![]), + 0, + ); + let state = Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(guard), + is_stopping: RwLock::new(false), + maintenance: std::sync::atomic::AtomicBool::new(false), + state_version: std::sync::atomic::AtomicU32::new(0), + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(60), + inflight_download_data_timeout: std::time::Duration::from_secs(60), + inflight_upload_bytes: std::sync::atomic::AtomicI64::new(0), + inflight_download_bytes: std::sync::atomic::AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: String::new(), + rack: String::new(), + file_size_limit_bytes: 0, + maintenance_byte_per_second: 0, + is_heartbeating: std::sync::atomic::AtomicBool::new(true), + has_master: false, + pre_stop_seconds: 0, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new( + seaweed_volume::remote_storage::s3_tier::S3TierRegistry::new(), + ), + read_mode: seaweed_volume::config::ReadMode::Local, + master_url: String::new(), + master_urls: Vec::new(), + self_url: String::new(), + http_client: reqwest::Client::new(), + outgoing_http_scheme: "http".to_string(), + outgoing_grpc_tls: None, + metrics_runtime: std::sync::RwLock::new( + seaweed_volume::server::volume_server::RuntimeMetricsConfig::default(), + ), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: false, + has_slow_read: false, + read_buffer_size_bytes: 1024 * 1024, + security_file: String::new(), + cli_white_list: vec![], + state_file_path: String::new(), + }); + (state, tmp) +} + +/// Helper: read the entire response body as bytes. +async fn body_bytes(response: axum::response::Response) -> Vec { + let body = response.into_body(); + axum::body::to_bytes(body, usize::MAX) + .await + .expect("failed to read body") + .to_vec() +} + +fn with_remote_addr(request: Request, remote_addr: &str) -> Request { + let mut request = request; + let remote_addr = remote_addr + .parse::() + .expect("invalid socket address"); + request.extensions_mut().insert(ConnectInfo(remote_addr)); + request +} + +// ============================================================================ +// 1. GET /healthz returns 200 when server is running +// ============================================================================ + +#[tokio::test] +async fn healthz_returns_200_when_running() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/healthz") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); +} + +// ============================================================================ +// 2. GET /healthz returns 503 when is_stopping=true +// ============================================================================ + +#[tokio::test] +async fn healthz_returns_503_when_stopping() { + let (state, _tmp) = test_state(); + *state.is_stopping.write().unwrap() = true; + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/healthz") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE); +} + +// ============================================================================ +// 3. GET /status returns JSON with version and volumes array +// ============================================================================ + +#[tokio::test] +async fn status_returns_json_with_version_and_volumes() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/status") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + + let body = body_bytes(response).await; + let json: serde_json::Value = + serde_json::from_slice(&body).expect("response is not valid JSON"); + + assert!(json.get("Version").is_some(), "missing 'Version' field"); + assert!(json["Version"].is_string(), "'Version' should be a string"); + + assert!(json.get("Volumes").is_some(), "missing 'Volumes' field"); + assert!(json["Volumes"].is_array(), "'Volumes' should be an array"); + + // We created one volume in test_state, so the array should have one entry + let volumes = json["Volumes"].as_array().unwrap(); + assert_eq!(volumes.len(), 1, "expected 1 volume"); + assert_eq!(volumes[0]["Id"], 1); +} + +#[tokio::test] +async fn admin_router_does_not_expose_metrics() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/metrics") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn metrics_router_serves_metrics() { + let app = build_metrics_router(); + + let response = app + .oneshot( + Request::builder() + .uri("/metrics") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); +} + +#[tokio::test] +async fn admin_router_rejects_non_whitelisted_uploads() { + let (state, _tmp) = test_state_with_whitelist(vec!["127.0.0.1".to_string()]); + let app = build_admin_router(state); + + let response = app + .oneshot(with_remote_addr( + Request::builder() + .method("POST") + .uri("/1,000000000000000001") + .body(Body::from("blocked")) + .unwrap(), + "10.0.0.9:12345", + )) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::UNAUTHORIZED); +} + +#[tokio::test] +async fn admin_router_rejects_non_whitelisted_deletes() { + let (state, _tmp) = test_state_with_whitelist(vec!["127.0.0.1".to_string()]); + let app = build_admin_router(state); + + let response = app + .oneshot(with_remote_addr( + Request::builder() + .method("DELETE") + .uri("/1,000000000000000001") + .body(Body::empty()) + .unwrap(), + "10.0.0.9:12345", + )) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::UNAUTHORIZED); +} + +// Go's volume_server.go has /stats/* endpoints commented out (L130-134). +// Requests to /stats/counter fall through to the store handler which returns 400. +#[tokio::test] +async fn admin_router_does_not_expose_stats_routes() { + let (state, _tmp) = test_state_with_whitelist(vec!["127.0.0.1".to_string()]); + let app = build_admin_router_with_ui(state, true); + + let response = app + .oneshot(with_remote_addr( + Request::builder() + .uri("/stats/counter") + .body(Body::empty()) + .unwrap(), + "127.0.0.1:12345", + )) + .await + .unwrap(); + + // Falls through to store handler → 400 (bad volume id) + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +// ============================================================================ +// 4. POST writes data, then GET reads it back +// ============================================================================ + +#[tokio::test] +async fn write_then_read_needle() { + let (state, _tmp) = test_state(); + + // The fid "01637037d6" encodes NeedleId=0x01, Cookie=0x637037d6 + let uri = "/1,01637037d6"; + let payload = b"hello, seaweedfs!"; + + // --- POST (write) --- + let app = build_admin_router(state.clone()); + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(uri) + .body(Body::from(payload.to_vec())) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!( + response.status(), + StatusCode::CREATED, + "POST should return 201 Created" + ); + + let body = body_bytes(response).await; + let json: serde_json::Value = + serde_json::from_slice(&body).expect("POST response is not valid JSON"); + assert_eq!(json["size"], payload.len() as u64); + + // --- GET (read back) --- + let app = build_admin_router(state.clone()); + let response = app + .oneshot(Request::builder().uri(uri).body(Body::empty()).unwrap()) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK, "GET should return 200"); + + let body = body_bytes(response).await; + assert_eq!(body, payload, "GET body should match written data"); +} + +// ============================================================================ +// 5. DELETE deletes a needle, subsequent GET returns 404 +// ============================================================================ + +#[tokio::test] +async fn delete_then_get_returns_404() { + let (state, _tmp) = test_state(); + let uri = "/1,01637037d6"; + let payload = b"to be deleted"; + + // Write the needle first + let app = build_admin_router(state.clone()); + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(uri) + .body(Body::from(payload.to_vec())) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::CREATED); + + // Delete + let app = build_admin_router(state.clone()); + let response = app + .oneshot( + Request::builder() + .method("DELETE") + .uri(uri) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!( + response.status(), + StatusCode::ACCEPTED, + "DELETE should return 202 Accepted" + ); + + // GET should now return 404 + let app = build_admin_router(state.clone()); + let response = app + .oneshot(Request::builder().uri(uri).body(Body::empty()).unwrap()) + .await + .unwrap(); + assert_eq!( + response.status(), + StatusCode::NOT_FOUND, + "GET after DELETE should return 404" + ); +} + +// ============================================================================ +// 6. HEAD returns headers without body +// ============================================================================ + +#[tokio::test] +async fn head_returns_headers_without_body() { + let (state, _tmp) = test_state(); + let uri = "/1,01637037d6"; + let payload = b"head test data"; + + // Write needle + let app = build_admin_router(state.clone()); + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(uri) + .body(Body::from(payload.to_vec())) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::CREATED); + + // HEAD + let app = build_admin_router(state.clone()); + let response = app + .oneshot( + Request::builder() + .method("HEAD") + .uri(uri) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK, "HEAD should return 200"); + + // Content-Length header should be present + let content_length = response + .headers() + .get("content-length") + .expect("HEAD should include Content-Length header"); + let len: usize = content_length + .to_str() + .unwrap() + .parse() + .expect("Content-Length should be a number"); + assert_eq!( + len, + payload.len(), + "Content-Length should match payload size" + ); + + // Body should be empty for HEAD + let body = body_bytes(response).await; + assert!(body.is_empty(), "HEAD body should be empty"); +} + +// ============================================================================ +// 7. Invalid URL path returns 400 +// ============================================================================ + +#[tokio::test] +async fn invalid_url_path_returns_400() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + // "invalidpath" has no comma or slash separator so parse_url_path returns None + let response = app + .oneshot( + Request::builder() + .uri("/invalidpath") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!( + response.status(), + StatusCode::BAD_REQUEST, + "invalid URL path should return 400" + ); +} + +#[tokio::test] +async fn deep_invalid_url_path_returns_400() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/not/a/valid/volume/path") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn admin_root_get_returns_400() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot(Request::builder().uri("/").body(Body::empty()).unwrap()) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn public_root_get_returns_400() { + let (state, _tmp) = test_state(); + let app = build_public_router(state); + + let response = app + .oneshot(Request::builder().uri("/").body(Body::empty()).unwrap()) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn public_router_does_not_expose_healthz() { + let (state, _tmp) = test_state(); + let app = build_public_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/healthz") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +// Go's volume_server.go has /stats/* endpoints commented out (L130-134). +#[tokio::test] +async fn admin_router_stats_routes_not_registered() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/stats/counter") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + // Falls through to store handler → 400 (bad volume id) + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn admin_router_hides_ui_when_write_jwt_is_configured() { + let (state, _tmp) = test_state_with_signing_key(b"secret".to_vec()); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/ui/index.html") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn admin_router_can_expose_ui_with_explicit_override() { + let (state, _tmp) = test_state_with_signing_key(b"secret".to_vec()); + let app = build_admin_router_with_ui(state, true); + + let response = app + .oneshot( + Request::builder() + .uri("/ui/index.html") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + let body = body_bytes(response).await; + let html = String::from_utf8(body).unwrap(); + assert!(html.contains("Disk Stats")); + assert!(html.contains("System Stats")); + assert!(html.contains("Volumes")); +} + +#[tokio::test] +async fn admin_router_ui_override_ignores_read_jwt_checks() { + let (state, _tmp) = test_state_with_signing_key(b"write-secret".to_vec()); + state.guard.write().unwrap().read_signing_key = SigningKey(b"read-secret".to_vec()); + let app = build_admin_router_with_ui(state, true); + + let response = app + .oneshot( + Request::builder() + .uri("/ui/index.html") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); +} + +#[tokio::test] +async fn admin_router_serves_volume_ui_static_assets() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/seaweedfsstatic/bootstrap/3.3.1/css/bootstrap.min.css") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + assert_eq!( + response + .headers() + .get("content-type") + .and_then(|value| value.to_str().ok()), + Some("text/css; charset=utf-8") + ); + let body = body_bytes(response).await; + assert!(body.len() > 1000); +} diff --git a/seaweed-volume/tools/generate_go_volume_docs.go b/seaweed-volume/tools/generate_go_volume_docs.go new file mode 100644 index 000000000..cdc1cb7dc --- /dev/null +++ b/seaweed-volume/tools/generate_go_volume_docs.go @@ -0,0 +1,1172 @@ +package main + +import ( + "bytes" + "flag" + "fmt" + "go/ast" + "go/parser" + "go/printer" + "go/token" + "io/fs" + "os" + "path/filepath" + "sort" + "strings" + "time" +) + +type FileDoc struct { + RelPath string + PackageName string + AbsPath string + LineCount int + Imports []string + TopLevelDecls []DeclInfo + Functions []*FunctionInfo + RustCounterpart []string +} + +type DeclInfo struct { + Kind string + Names []string + StartLine int + EndLine int + Summary string + Details []string +} + +type FunctionInfo struct { + ID string + PackageName string + FileRelPath string + Name string + Receiver string + ReceiverType string + Signature string + DocComment string + StartLine int + EndLine int + Effect string + CallNames []string + CallDisplay []string + PotentialLocal []string + ExternalCalls []string + PossibleCallers []string + ControlFlow []string + Literals []LiteralInfo + Statements []StmtInfo + SourceLines []string +} + +type LiteralInfo struct { + Line int + Value string + Kind string +} + +type StmtInfo struct { + StartLine int + EndLine int + Kind string + Summary string +} + +type funcIndex struct { + ByPackage map[string]map[string][]string + ByName map[string][]string + Defs map[string]*FunctionInfo +} + +func main() { + rootFlag := flag.String("root", ".", "repository root") + outFlag := flag.String("out", "seaweed-volume/docs/go-volume-server", "output directory for generated markdown") + flag.Parse() + + root, err := filepath.Abs(*rootFlag) + if err != nil { + fail("resolve root", err) + } + outDir := filepath.Join(root, *outFlag) + + paths, err := collectSourceFiles(root) + if err != nil { + fail("collect source files", err) + } + + docs, idx, err := parseFiles(root, paths) + if err != nil { + fail("parse source files", err) + } + + linkCallers(idx) + + if err := os.RemoveAll(outDir); err != nil { + fail("clear output directory", err) + } + if err := os.MkdirAll(outDir, 0o755); err != nil { + fail("create output directory", err) + } + + for _, doc := range docs { + target := filepath.Join(outDir, filepath.FromSlash(doc.RelPath+".md")) + if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil { + fail("create doc parent", err) + } + content := renderFileDoc(doc) + if err := os.WriteFile(target, []byte(content), 0o644); err != nil { + fail("write doc file", err) + } + } + + readme := renderIndexReadme(*outFlag, docs) + if err := os.WriteFile(filepath.Join(outDir, "README.md"), []byte(readme), 0o644); err != nil { + fail("write index", err) + } + + fmt.Printf("Generated %d Markdown files under %s\n", len(docs)+1, outDir) +} + +func fail(action string, err error) { + fmt.Fprintf(os.Stderr, "%s: %v\n", action, err) + os.Exit(1) +} + +func collectSourceFiles(root string) ([]string, error) { + var files []string + err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + base := filepath.Base(path) + if base == ".git" || base == "target" || base == "vendor" { + return filepath.SkipDir + } + return nil + } + if filepath.Ext(path) != ".go" || strings.HasSuffix(path, "_test.go") { + return nil + } + rel, err := filepath.Rel(root, path) + if err != nil { + return err + } + rel = filepath.ToSlash(rel) + if shouldInclude(rel) { + files = append(files, rel) + } + return nil + }) + sort.Strings(files) + return files, err +} + +func shouldInclude(rel string) bool { + switch { + case rel == "weed/command/volume.go": + return true + case rel == "weed/server/common.go": + return true + case rel == "weed/server/constants/volume.go": + return true + case rel == "weed/server/volume_server_ui/templates.go": + return true + case strings.HasPrefix(rel, "weed/server/volume") && strings.HasSuffix(rel, ".go"): + return true + case strings.HasPrefix(rel, "weed/storage/"): + return true + case strings.HasPrefix(rel, "weed/images/"): + return true + case strings.HasPrefix(rel, "weed/security/"): + return true + case strings.HasPrefix(rel, "weed/stats/"): + return true + default: + return false + } +} + +func parseFiles(root string, relPaths []string) ([]*FileDoc, *funcIndex, error) { + fset := token.NewFileSet() + var docs []*FileDoc + index := &funcIndex{ + ByPackage: map[string]map[string][]string{}, + ByName: map[string][]string{}, + Defs: map[string]*FunctionInfo{}, + } + + for _, rel := range relPaths { + abs := filepath.Join(root, filepath.FromSlash(rel)) + src, err := os.ReadFile(abs) + if err != nil { + return nil, nil, err + } + fileAst, err := parser.ParseFile(fset, abs, src, parser.ParseComments) + if err != nil { + return nil, nil, err + } + + lines := splitLines(string(src)) + doc := &FileDoc{ + RelPath: rel, + PackageName: fileAst.Name.Name, + AbsPath: abs, + LineCount: len(lines), + Imports: collectImports(fileAst), + TopLevelDecls: collectDecls(fset, fileAst), + RustCounterpart: rustCounterparts(rel), + } + + for _, decl := range fileAst.Decls { + funcDecl, ok := decl.(*ast.FuncDecl) + if !ok { + continue + } + info := collectFunctionInfo(fset, rel, fileAst.Name.Name, funcDecl, lines) + doc.Functions = append(doc.Functions, info) + index.Defs[info.ID] = info + + if index.ByPackage[info.PackageName] == nil { + index.ByPackage[info.PackageName] = map[string][]string{} + } + index.ByPackage[info.PackageName][info.Name] = append(index.ByPackage[info.PackageName][info.Name], info.ID) + index.ByName[info.Name] = append(index.ByName[info.Name], info.ID) + } + + sort.Slice(doc.Functions, func(i, j int) bool { + return doc.Functions[i].StartLine < doc.Functions[j].StartLine + }) + docs = append(docs, doc) + } + + sort.Slice(docs, func(i, j int) bool { return docs[i].RelPath < docs[j].RelPath }) + for _, ids := range index.ByName { + sort.Strings(ids) + } + for _, byName := range index.ByPackage { + for _, ids := range byName { + sort.Strings(ids) + } + } + return docs, index, nil +} + +func collectImports(fileAst *ast.File) []string { + var imports []string + for _, imp := range fileAst.Imports { + path := strings.Trim(imp.Path.Value, "\"") + if imp.Name != nil { + imports = append(imports, imp.Name.Name+" "+path) + } else { + imports = append(imports, path) + } + } + sort.Strings(imports) + return imports +} + +func collectDecls(fset *token.FileSet, fileAst *ast.File) []DeclInfo { + var decls []DeclInfo + for _, decl := range fileAst.Decls { + genDecl, ok := decl.(*ast.GenDecl) + if !ok { + continue + } + info := DeclInfo{ + Kind: strings.ToLower(genDecl.Tok.String()), + StartLine: fset.Position(genDecl.Pos()).Line, + EndLine: fset.Position(genDecl.End()).Line, + } + for _, spec := range genDecl.Specs { + switch s := spec.(type) { + case *ast.TypeSpec: + info.Names = append(info.Names, s.Name.Name) + info.Details = append(info.Details, summarizeTypeSpec(fset, s)...) + case *ast.ValueSpec: + for i, name := range s.Names { + info.Names = append(info.Names, name.Name) + value := "" + if i < len(s.Values) { + value = nodeString(fset, s.Values[i]) + } + switch { + case value != "": + info.Details = append(info.Details, fmt.Sprintf("L%d `%s` = `%s`", fset.Position(name.Pos()).Line, name.Name, sanitizeInline(value))) + case s.Type != nil: + info.Details = append(info.Details, fmt.Sprintf("L%d `%s` has declared type `%s`", fset.Position(name.Pos()).Line, name.Name, sanitizeInline(nodeString(fset, s.Type)))) + default: + info.Details = append(info.Details, fmt.Sprintf("L%d `%s` is declared without an inline initializer", fset.Position(name.Pos()).Line, name.Name)) + } + } + } + } + if len(info.Names) == 0 { + info.Names = []string{""} + } + info.Summary = fmt.Sprintf("%s declaration covering %s", info.Kind, strings.Join(info.Names, ", ")) + decls = append(decls, info) + } + return decls +} + +func summarizeTypeSpec(fset *token.FileSet, spec *ast.TypeSpec) []string { + var details []string + switch t := spec.Type.(type) { + case *ast.StructType: + for _, field := range t.Fields.List { + names := []string{""} + if len(field.Names) > 0 { + names = nil + for _, name := range field.Names { + names = append(names, name.Name) + } + } + line := fset.Position(field.Pos()).Line + msg := fmt.Sprintf("L%d fields `%s` have type `%s`", line, strings.Join(names, "`, `"), sanitizeInline(nodeString(fset, field.Type))) + if field.Tag != nil { + msg += fmt.Sprintf(" with tag `%s`", sanitizeInline(field.Tag.Value)) + } + details = append(details, msg) + } + case *ast.InterfaceType: + for _, field := range t.Methods.List { + names := []string{""} + if len(field.Names) > 0 { + names = nil + for _, name := range field.Names { + names = append(names, name.Name) + } + } + details = append(details, fmt.Sprintf("L%d interface item `%s` has type `%s`", fset.Position(field.Pos()).Line, strings.Join(names, "`, `"), sanitizeInline(nodeString(fset, field.Type)))) + } + default: + details = append(details, fmt.Sprintf("L%d `%s` resolves to `%s`", fset.Position(spec.Pos()).Line, spec.Name.Name, sanitizeInline(nodeString(fset, spec.Type)))) + } + return details +} + +func collectFunctionInfo(fset *token.FileSet, relPath, pkgName string, decl *ast.FuncDecl, lines []string) *FunctionInfo { + startLine := fset.Position(decl.Pos()).Line + endLine := fset.Position(decl.End()).Line + if endLine > len(lines) { + endLine = len(lines) + } + sourceLines := make([]string, 0, endLine-startLine+1) + for i := startLine; i <= endLine; i++ { + sourceLines = append(sourceLines, lines[i-1]) + } + + info := &FunctionInfo{ + PackageName: pkgName, + FileRelPath: relPath, + Name: decl.Name.Name, + StartLine: startLine, + EndLine: endLine, + DocComment: cleanDocComment(decl.Doc), + Signature: buildSignature(fset, decl), + SourceLines: sourceLines, + } + if decl.Recv != nil && len(decl.Recv.List) > 0 { + field := decl.Recv.List[0] + info.ReceiverType = nodeString(fset, field.Type) + if len(field.Names) > 0 { + info.Receiver = field.Names[0].Name + } + info.ID = pkgName + "::" + normalizeReceiverType(info.ReceiverType) + "." + info.Name + } else { + info.ID = pkgName + "::" + info.Name + } + + if decl.Body != nil { + callNames, callDisplay := collectCalls(fset, decl.Body) + info.CallNames = callNames + info.CallDisplay = callDisplay + info.ControlFlow = collectControlFlow(fset, decl.Body) + info.Literals = collectLiterals(fset, decl.Body) + info.Statements = collectStatements(fset, decl.Body) + } + + info.Effect = deriveEffect(info, decl) + return info +} + +func buildSignature(fset *token.FileSet, decl *ast.FuncDecl) string { + typeText := sanitizeInline(strings.TrimSpace(nodeString(fset, decl.Type))) + typeText = strings.TrimPrefix(typeText, "func") + typeText = strings.TrimSpace(typeText) + if decl.Recv != nil { + return fmt.Sprintf("func (%s) %s%s", sanitizeInline(fieldListString(fset, decl.Recv)), decl.Name.Name, typeText) + } + return fmt.Sprintf("func %s%s", decl.Name.Name, typeText) +} + +func collectCalls(fset *token.FileSet, body *ast.BlockStmt) ([]string, []string) { + nameSet := map[string]struct{}{} + displaySet := map[string]struct{}{} + ast.Inspect(body, func(n ast.Node) bool { + call, ok := n.(*ast.CallExpr) + if !ok { + return true + } + name := simpleCallName(call.Fun) + if name != "" { + nameSet[name] = struct{}{} + } + display := sanitizeInline(nodeString(fset, call.Fun)) + if display != "" { + displaySet[display] = struct{}{} + } + return true + }) + return sortedKeys(nameSet), sortedKeys(displaySet) +} + +func collectControlFlow(fset *token.FileSet, body *ast.BlockStmt) []string { + var items []string + ast.Inspect(body, func(n ast.Node) bool { + switch s := n.(type) { + case *ast.IfStmt: + msg := fmt.Sprintf("L%d branches when `%s`", fset.Position(s.Pos()).Line, sanitizeInline(nodeString(fset, s.Cond))) + if s.Init != nil { + msg += fmt.Sprintf(" after `%s`", sanitizeInline(nodeString(fset, s.Init))) + } + items = append(items, msg) + case *ast.ForStmt: + cond := "forever" + if s.Cond != nil { + cond = sanitizeInline(nodeString(fset, s.Cond)) + } + items = append(items, fmt.Sprintf("L%d loops while `%s`", fset.Position(s.Pos()).Line, cond)) + case *ast.RangeStmt: + items = append(items, fmt.Sprintf("L%d ranges `%s` over `%s`", fset.Position(s.Pos()).Line, sanitizeInline(nodeString(fset, s.Key)), sanitizeInline(nodeString(fset, s.X)))) + case *ast.SwitchStmt: + tag := "" + if s.Tag != nil { + tag = sanitizeInline(nodeString(fset, s.Tag)) + } + items = append(items, fmt.Sprintf("L%d switches on `%s`", fset.Position(s.Pos()).Line, tag)) + case *ast.TypeSwitchStmt: + items = append(items, fmt.Sprintf("L%d performs a type switch on `%s`", fset.Position(s.Pos()).Line, sanitizeInline(nodeString(fset, s.Assign)))) + case *ast.SelectStmt: + items = append(items, fmt.Sprintf("L%d selects across channel cases", fset.Position(s.Pos()).Line)) + case *ast.DeferStmt: + items = append(items, fmt.Sprintf("L%d defers `%s`", fset.Position(s.Pos()).Line, sanitizeInline(nodeString(fset, s.Call)))) + case *ast.GoStmt: + items = append(items, fmt.Sprintf("L%d launches goroutine `%s`", fset.Position(s.Pos()).Line, sanitizeInline(nodeString(fset, s.Call)))) + case *ast.ReturnStmt: + items = append(items, fmt.Sprintf("L%d returns `%s`", fset.Position(s.Pos()).Line, sanitizeInline(joinNodes(fset, s.Results)))) + } + return true + }) + return dedupeKeepOrder(items) +} + +func collectLiterals(fset *token.FileSet, body *ast.BlockStmt) []LiteralInfo { + var literals []LiteralInfo + seen := map[string]struct{}{} + ast.Inspect(body, func(n ast.Node) bool { + switch lit := n.(type) { + case *ast.BasicLit: + item := LiteralInfo{ + Line: fset.Position(lit.Pos()).Line, + Value: lit.Value, + Kind: lit.Kind.String(), + } + key := fmt.Sprintf("%d|%s|%s", item.Line, item.Kind, item.Value) + if _, ok := seen[key]; !ok { + literals = append(literals, item) + seen[key] = struct{}{} + } + case *ast.Ident: + if lit.Name != "true" && lit.Name != "false" && lit.Name != "nil" { + return true + } + item := LiteralInfo{ + Line: fset.Position(lit.Pos()).Line, + Value: lit.Name, + Kind: "keyword", + } + key := fmt.Sprintf("%d|%s|%s", item.Line, item.Kind, item.Value) + if _, ok := seen[key]; !ok { + literals = append(literals, item) + seen[key] = struct{}{} + } + } + return true + }) + sort.Slice(literals, func(i, j int) bool { + if literals[i].Line == literals[j].Line { + if literals[i].Kind == literals[j].Kind { + return literals[i].Value < literals[j].Value + } + return literals[i].Kind < literals[j].Kind + } + return literals[i].Line < literals[j].Line + }) + return literals +} + +func collectStatements(fset *token.FileSet, body *ast.BlockStmt) []StmtInfo { + var items []StmtInfo + var walkBlock func([]ast.Stmt) + walkBlock = func(stmts []ast.Stmt) { + for _, stmt := range stmts { + info := summarizeStmt(fset, stmt) + if info.Kind != "" { + items = append(items, info) + } + switch s := stmt.(type) { + case *ast.BlockStmt: + walkBlock(s.List) + case *ast.IfStmt: + walkBlock(s.Body.List) + switch elseNode := s.Else.(type) { + case *ast.BlockStmt: + walkBlock(elseNode.List) + case *ast.IfStmt: + walkBlock([]ast.Stmt{elseNode}) + } + case *ast.ForStmt: + walkBlock(s.Body.List) + case *ast.RangeStmt: + walkBlock(s.Body.List) + case *ast.SwitchStmt: + for _, stmt := range s.Body.List { + if clause, ok := stmt.(*ast.CaseClause); ok { + items = append(items, summarizeStmt(fset, clause)) + walkBlock(clause.Body) + } + } + case *ast.TypeSwitchStmt: + for _, stmt := range s.Body.List { + if clause, ok := stmt.(*ast.CaseClause); ok { + items = append(items, summarizeStmt(fset, clause)) + walkBlock(clause.Body) + } + } + case *ast.SelectStmt: + for _, stmt := range s.Body.List { + if clause, ok := stmt.(*ast.CommClause); ok { + items = append(items, summarizeStmt(fset, clause)) + walkBlock(clause.Body) + } + } + case *ast.LabeledStmt: + walkBlock([]ast.Stmt{s.Stmt}) + } + } + } + walkBlock(body.List) + sort.Slice(items, func(i, j int) bool { + if items[i].StartLine == items[j].StartLine { + if items[i].EndLine == items[j].EndLine { + return items[i].Summary < items[j].Summary + } + return items[i].EndLine < items[j].EndLine + } + return items[i].StartLine < items[j].StartLine + }) + return items +} + +func summarizeStmt(fset *token.FileSet, stmt ast.Stmt) StmtInfo { + info := StmtInfo{ + StartLine: fset.Position(stmt.Pos()).Line, + EndLine: fset.Position(stmt.End()).Line, + } + switch s := stmt.(type) { + case *ast.AssignStmt: + info.Kind = "assign" + lhs := sanitizeInline(joinNodes(fset, s.Lhs)) + rhs := sanitizeInline(joinNodes(fset, s.Rhs)) + info.Summary = fmt.Sprintf("assigns `%s` %s `%s`", lhs, s.Tok.String(), rhs) + case *ast.ExprStmt: + info.Kind = "expr" + info.Summary = fmt.Sprintf("executes `%s`", sanitizeInline(nodeString(fset, s.X))) + case *ast.IfStmt: + info.Kind = "if" + info.Summary = fmt.Sprintf("checks `%s`", sanitizeInline(nodeString(fset, s.Cond))) + case *ast.ForStmt: + info.Kind = "for" + cond := "true" + if s.Cond != nil { + cond = sanitizeInline(nodeString(fset, s.Cond)) + } + info.Summary = fmt.Sprintf("loops while `%s`", cond) + case *ast.RangeStmt: + info.Kind = "range" + target := sanitizeInline(nodeString(fset, s.X)) + left := sanitizeInline(joinNodes(fset, []ast.Expr{exprOrBlank(s.Key), exprOrBlank(s.Value)})) + info.Summary = fmt.Sprintf("ranges `%s` over `%s`", left, target) + case *ast.ReturnStmt: + info.Kind = "return" + info.Summary = fmt.Sprintf("returns `%s`", sanitizeInline(joinNodes(fset, s.Results))) + case *ast.DeferStmt: + info.Kind = "defer" + info.Summary = fmt.Sprintf("defers `%s`", sanitizeInline(nodeString(fset, s.Call))) + case *ast.GoStmt: + info.Kind = "go" + info.Summary = fmt.Sprintf("launches goroutine `%s`", sanitizeInline(nodeString(fset, s.Call))) + case *ast.SwitchStmt: + info.Kind = "switch" + tag := "true" + if s.Tag != nil { + tag = sanitizeInline(nodeString(fset, s.Tag)) + } + info.Summary = fmt.Sprintf("switches on `%s`", tag) + case *ast.TypeSwitchStmt: + info.Kind = "type-switch" + info.Summary = fmt.Sprintf("type-switches on `%s`", sanitizeInline(nodeString(fset, s.Assign))) + case *ast.SelectStmt: + info.Kind = "select" + info.Summary = "selects across channel operations" + case *ast.CaseClause: + info.Kind = "case" + if len(s.List) == 0 { + info.Summary = "default case" + } else { + info.Summary = fmt.Sprintf("case `%s`", sanitizeInline(joinNodes(fset, s.List))) + } + case *ast.CommClause: + info.Kind = "comm" + if s.Comm == nil { + info.Summary = "default communication case" + } else { + info.Summary = fmt.Sprintf("communication case `%s`", sanitizeInline(nodeString(fset, s.Comm))) + } + case *ast.BranchStmt: + info.Kind = "branch" + if s.Label != nil { + info.Summary = fmt.Sprintf("%s to label `%s`", strings.ToLower(s.Tok.String()), s.Label.Name) + } else { + info.Summary = strings.ToLower(s.Tok.String()) + } + case *ast.SendStmt: + info.Kind = "send" + info.Summary = fmt.Sprintf("sends `%s` to `%s`", sanitizeInline(nodeString(fset, s.Value)), sanitizeInline(nodeString(fset, s.Chan))) + case *ast.IncDecStmt: + info.Kind = "incdec" + info.Summary = fmt.Sprintf("%s `%s`", strings.ToLower(s.Tok.String()), sanitizeInline(nodeString(fset, s.X))) + case *ast.DeclStmt: + info.Kind = "decl" + info.Summary = fmt.Sprintf("declares `%s`", sanitizeInline(nodeString(fset, s.Decl))) + case *ast.LabeledStmt: + info.Kind = "label" + info.Summary = fmt.Sprintf("label `%s`", s.Label.Name) + default: + info.Kind = strings.ToLower(strings.TrimSuffix(strings.TrimPrefix(fmt.Sprintf("%T", stmt), "*ast."), "Stmt")) + info.Summary = sanitizeInline(nodeString(fset, stmt)) + } + return info +} + +func exprOrBlank(expr ast.Expr) ast.Expr { + if expr == nil { + return &ast.Ident{Name: "_"} + } + return expr +} + +func deriveEffect(info *FunctionInfo, decl *ast.FuncDecl) string { + if info.DocComment != "" { + return info.DocComment + } + name := info.Name + switch { + case strings.HasPrefix(name, "New"): + return fmt.Sprintf("Constructs and returns `%s`-related state.", strings.TrimPrefix(name, "New")) + case strings.HasPrefix(name, "Get"): + return "Retrieves or serves the requested resource and returns the outcome." + case strings.HasPrefix(name, "Read"): + return "Reads storage or request data and converts it into the function's return or streamed response." + case strings.HasPrefix(name, "Write"): + return "Writes state, file data, or response output and reports the result." + case strings.HasPrefix(name, "Delete"): + return "Deletes the targeted state or storage entries and returns status." + case strings.HasPrefix(name, "Update"): + return "Updates existing state in place, usually based on request or runtime conditions." + case strings.HasPrefix(name, "Load"): + return "Loads persisted state or configuration into runtime structures." + case strings.HasPrefix(name, "Save"): + return "Persists runtime state or derived data." + case strings.HasPrefix(name, "parse"), strings.HasPrefix(name, "Parse"): + return "Parses inbound text or binary input into structured values." + case strings.HasSuffix(name, "Handler"): + return "Handles an HTTP endpoint and writes the response side effects directly." + case strings.Contains(name, "Heartbeat"): + return "Maintains master/volume heartbeat state and its side effects." + case strings.Contains(name, "Vacuum"): + return "Runs or coordinates vacuum/compaction related work." + case strings.Contains(name, "Copy"): + return "Copies data between storage locations or peer volume servers." + case strings.Contains(name, "Scrub"): + return "Validates stored data and surfaces corruption or mismatch details." + case strings.Contains(name, "Mount"): + return "Attaches runtime-visible storage or shard state." + case strings.Contains(name, "Unmount"): + return "Detaches runtime-visible storage or shard state." + case strings.Contains(name, "Needle"): + return "Manipulates or transports SeaweedFS needle state." + default: + if info.ReceiverType != "" { + return fmt.Sprintf("Implements `%s` behavior on receiver `%s`.", name, sanitizeInline(info.ReceiverType)) + } + return fmt.Sprintf("Implements `%s` for package `%s`.", name, info.PackageName) + } +} + +func linkCallers(idx *funcIndex) { + for _, fn := range idx.Defs { + var local []string + var external []string + for _, display := range fn.CallDisplay { + simple := simpleNameFromDisplay(display) + if ids, ok := idx.ByPackage[fn.PackageName][simple]; ok && len(ids) > 0 { + local = append(local, display) + } else { + external = append(external, display) + } + } + fn.PotentialLocal = dedupeKeepOrder(local) + fn.ExternalCalls = dedupeKeepOrder(external) + + var callers []string + if ids, ok := idx.ByName[fn.Name]; ok { + for _, candidateID := range ids { + if candidateID == fn.ID { + continue + } + candidate := idx.Defs[candidateID] + for _, callName := range candidate.CallNames { + if callName == fn.Name { + callers = append(callers, candidateID) + break + } + } + } + } + sort.Strings(callers) + fn.PossibleCallers = callers + } +} + +func renderFileDoc(doc *FileDoc) string { + var b strings.Builder + b.WriteString("# " + doc.RelPath + "\n\n") + b.WriteString("- Source file: `" + doc.RelPath + "`\n") + b.WriteString("- Package: `" + doc.PackageName + "`\n") + b.WriteString(fmt.Sprintf("- Total lines: `%d`\n", doc.LineCount)) + if len(doc.RustCounterpart) > 0 { + b.WriteString("- Rust counterpart candidates: `" + strings.Join(doc.RustCounterpart, "`, `") + "`\n") + } else { + b.WriteString("- Rust counterpart candidates: none mapped directly; behavior may still be folded into adjacent Rust modules.\n") + } + b.WriteString("\n## Imports\n\n") + if len(doc.Imports) == 0 { + b.WriteString("This file has no imports.\n") + } else { + for _, imp := range doc.Imports { + b.WriteString("- `" + imp + "`\n") + } + } + + b.WriteString("\n## Top-Level Declarations\n\n") + if len(doc.TopLevelDecls) == 0 { + b.WriteString("No package-level const/var/type declarations in this file.\n") + } else { + for _, decl := range doc.TopLevelDecls { + b.WriteString(fmt.Sprintf("### `%s` `%s`\n\n", decl.Kind, strings.Join(decl.Names, "`, `"))) + b.WriteString(fmt.Sprintf("- Lines: `%d-%d`\n", decl.StartLine, decl.EndLine)) + b.WriteString("- Role: " + decl.Summary + "\n") + if len(decl.Details) > 0 { + b.WriteString("- Details:\n") + for _, detail := range decl.Details { + b.WriteString(" - " + detail + "\n") + } + } + b.WriteString("\n") + } + } + + b.WriteString("## Function Inventory\n\n") + if len(doc.Functions) == 0 { + b.WriteString("No functions or methods are declared in this file.\n") + return b.String() + } + for _, fn := range doc.Functions { + receiver := "" + if fn.ReceiverType != "" { + receiver = " receiver `" + sanitizeInline(fn.ReceiverType) + "`" + } + b.WriteString(fmt.Sprintf("- `%s`%s at lines `%d-%d`\n", fn.Name, receiver, fn.StartLine, fn.EndLine)) + } + + for _, fn := range doc.Functions { + b.WriteString("\n## `" + fn.Name + "`\n\n") + b.WriteString("- Signature: `" + fn.Signature + "`\n") + b.WriteString(fmt.Sprintf("- Lines: `%d-%d`\n", fn.StartLine, fn.EndLine)) + if fn.ReceiverType != "" { + b.WriteString("- Receiver: `" + sanitizeInline(fn.ReceiverType) + "`") + if fn.Receiver != "" { + b.WriteString(fmt.Sprintf(" bound as `%s`", fn.Receiver)) + } + b.WriteString("\n") + } + b.WriteString("- Effect: " + fn.Effect + "\n") + if fn.DocComment != "" { + b.WriteString("- Native doc comment: `" + sanitizeInline(fn.DocComment) + "`\n") + } + + b.WriteString("\n### Relations\n\n") + if len(fn.PotentialLocal) > 0 { + b.WriteString("- Local package calls: `" + strings.Join(fn.PotentialLocal, "`, `") + "`\n") + } else { + b.WriteString("- Local package calls: none detected from simple call-name matching.\n") + } + if len(fn.ExternalCalls) > 0 { + b.WriteString("- External or unresolved calls: `" + strings.Join(fn.ExternalCalls, "`, `") + "`\n") + } else { + b.WriteString("- External or unresolved calls: none detected.\n") + } + if len(fn.PossibleCallers) > 0 { + b.WriteString("- Possible name-matched callers in scanned scope: `" + strings.Join(fn.PossibleCallers, "`, `") + "`\n") + } else { + b.WriteString("- Possible name-matched callers in scanned scope: none detected.\n") + } + + b.WriteString("\n### Control Flow\n\n") + if len(fn.ControlFlow) == 0 { + b.WriteString("No notable branch/loop/defer/return items were extracted.\n") + } else { + for _, item := range fn.ControlFlow { + b.WriteString("- " + item + "\n") + } + } + + b.WriteString("\n### Literal And Keyword Touchpoints\n\n") + if len(fn.Literals) == 0 { + b.WriteString("No literals or keyword literals (`true`, `false`, `nil`) were extracted from the body.\n") + } else { + for _, lit := range fn.Literals { + b.WriteString(fmt.Sprintf("- L%d `%s` = `%s`\n", lit.Line, lit.Kind, sanitizeInline(lit.Value))) + } + } + + b.WriteString("\n### Line-Level Operating Logic\n\n") + lineNotes := lineNoteMap(fn.Statements) + for offset, raw := range fn.SourceLines { + lineNo := fn.StartLine + offset + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + continue + } + note := explainLine(trimmed, lineNo, lineNotes) + b.WriteString(fmt.Sprintf("- L%d: `%s`", lineNo, sanitizeInline(trimmed))) + if note != "" { + b.WriteString(" -> " + note) + } + b.WriteString("\n") + } + } + + return b.String() +} + +func renderIndexReadme(outRel string, docs []*FileDoc) string { + var b strings.Builder + b.WriteString("# Go Volume Server Translation Docs\n\n") + b.WriteString("Generated reference set for translating the Go SeaweedFS volume server into the Rust `seaweed-volume` crate.\n\n") + b.WriteString("- Generated at: `" + time.Now().Format(time.RFC3339) + "`\n") + b.WriteString(fmt.Sprintf("- Markdown files: `%d`\n", len(docs))) + b.WriteString("- Scope: `weed/command/volume.go`, selected `weed/server` volume-server files, and runtime files under `weed/storage`, `weed/images`, `weed/security`, and `weed/stats`.\n") + b.WriteString("- Output root: `" + sanitizeInline(outRel) + "`\n\n") + + groups := map[string][]*FileDoc{} + groupOrder := []string{"command", "server", "storage", "images", "security", "stats"} + for _, doc := range docs { + group := strings.Split(doc.RelPath, "/")[1] + groups[group] = append(groups[group], doc) + } + + for _, group := range groupOrder { + items := groups[group] + if len(items) == 0 { + continue + } + sort.Slice(items, func(i, j int) bool { return items[i].RelPath < items[j].RelPath }) + b.WriteString("## " + strings.Title(group) + "\n\n") + for _, doc := range items { + target := filepath.ToSlash(filepath.Join(outRel, doc.RelPath+".md")) + b.WriteString("- `" + doc.RelPath + "` -> `" + target + "`") + if len(doc.RustCounterpart) > 0 { + b.WriteString(" | Rust: `" + strings.Join(doc.RustCounterpart, "`, `") + "`") + } + b.WriteString("\n") + } + b.WriteString("\n") + } + return b.String() +} + +func lineNoteMap(statements []StmtInfo) map[int]string { + notes := map[int]string{} + for _, stmt := range statements { + if stmt.Summary != "" { + notes[stmt.StartLine] = stmt.Summary + } + } + return notes +} + +func explainLine(line string, lineNo int, notes map[int]string) string { + if note, ok := notes[lineNo]; ok { + return note + } + switch { + case strings.HasPrefix(line, "func "): + return "function signature header" + case strings.HasPrefix(line, "//"): + return "comment line" + case strings.HasPrefix(line, "/*") || strings.HasPrefix(line, "*/"): + return "comment block boundary" + case line == "{" || line == "}" || line == "})" || line == "};": + return "block boundary" + case strings.HasPrefix(line, "else"): + return "alternate control-flow branch" + case strings.HasPrefix(line, "case "): + return "switch/select case label" + case strings.HasPrefix(line, "default:"): + return "default case label" + case strings.HasPrefix(line, "return"): + return "returns from the function" + case strings.HasPrefix(line, "defer "): + return "registers deferred work for function exit" + case strings.HasPrefix(line, "go "): + return "starts a goroutine" + case strings.HasPrefix(line, "if "): + return "conditional check" + case strings.HasPrefix(line, "for "): + return "loop header" + case strings.HasPrefix(line, "switch "): + return "switch header" + case strings.HasPrefix(line, "select "): + return "channel select header" + case strings.Contains(line, ":="): + return "declares and assigns local state" + case looksLikeAssignment(line): + return "updates existing state" + case strings.HasSuffix(line, ")") || strings.HasSuffix(line, "},") || strings.HasSuffix(line, "})"): + return "executes a call or composite literal line" + default: + return "continuation or structural line" + } +} + +func looksLikeAssignment(line string) bool { + if strings.Contains(line, "==") || strings.Contains(line, ">=") || strings.Contains(line, "<=") || strings.Contains(line, "!=") { + return false + } + if strings.Contains(line, "=") { + return true + } + return false +} + +func rustCounterparts(rel string) []string { + switch { + case rel == "weed/command/volume.go": + return []string{"seaweed-volume/src/config.rs", "seaweed-volume/src/main.rs"} + case strings.HasPrefix(rel, "weed/images/"): + return []string{"seaweed-volume/src/images.rs"} + case strings.HasPrefix(rel, "weed/security/"): + return []string{"seaweed-volume/src/security.rs"} + case strings.HasPrefix(rel, "weed/stats/"): + return []string{"seaweed-volume/src/metrics.rs"} + case rel == "weed/server/common.go": + return []string{"seaweed-volume/src/server/handlers.rs", "seaweed-volume/src/server/volume_server.rs", "seaweed-volume/src/main.rs"} + case rel == "weed/server/constants/volume.go": + return []string{"seaweed-volume/src/server/mod.rs", "seaweed-volume/src/server/volume_server.rs"} + case rel == "weed/server/volume_server.go": + return []string{"seaweed-volume/src/server/volume_server.rs", "seaweed-volume/src/server/heartbeat.rs", "seaweed-volume/src/main.rs"} + case strings.HasPrefix(rel, "weed/server/volume_server_handlers"): + return []string{"seaweed-volume/src/server/handlers.rs", "seaweed-volume/src/server/volume_server.rs"} + case strings.HasPrefix(rel, "weed/server/volume_grpc_"): + return []string{"seaweed-volume/src/server/grpc_server.rs", "seaweed-volume/src/server/heartbeat.rs"} + case rel == "weed/server/volume_server_ui/templates.go": + return []string{"seaweed-volume/src/server/volume_server.rs"} + case rel == "weed/storage/disk_location.go" || rel == "weed/storage/disk_location_ec.go": + return []string{"seaweed-volume/src/storage/disk_location.rs"} + case strings.HasPrefix(rel, "weed/storage/erasure_coding/"): + name := filepath.Base(rel) + switch name { + case "ec_decoder.go": + return []string{"seaweed-volume/src/storage/erasure_coding/ec_decoder.rs"} + case "ec_encoder.go": + return []string{"seaweed-volume/src/storage/erasure_coding/ec_encoder.rs"} + case "ec_locate.go": + return []string{"seaweed-volume/src/storage/erasure_coding/ec_locate.rs"} + case "ec_shard.go", "ec_shard_info.go", "ec_shards_info.go": + return []string{"seaweed-volume/src/storage/erasure_coding/ec_shard.rs"} + default: + return []string{"seaweed-volume/src/storage/erasure_coding/ec_volume.rs", "seaweed-volume/src/storage/erasure_coding/mod.rs"} + } + case strings.HasPrefix(rel, "weed/storage/needle/"): + name := filepath.Base(rel) + switch name { + case "crc.go": + return []string{"seaweed-volume/src/storage/needle/crc.rs"} + case "volume_ttl.go": + return []string{"seaweed-volume/src/storage/needle/ttl.rs"} + default: + return []string{"seaweed-volume/src/storage/needle/needle.rs", "seaweed-volume/src/storage/needle/mod.rs"} + } + case rel == "weed/storage/needle_map.go" || strings.HasPrefix(rel, "weed/storage/needle_map/") || strings.HasPrefix(rel, "weed/storage/needle_map_"): + name := filepath.Base(rel) + if strings.Contains(name, "compact_map") { + return []string{"seaweed-volume/src/storage/needle_map/compact_map.rs"} + } + return []string{"seaweed-volume/src/storage/needle_map.rs"} + case strings.HasPrefix(rel, "weed/storage/store"): + return []string{"seaweed-volume/src/storage/store.rs"} + case strings.HasPrefix(rel, "weed/storage/super_block/"): + return []string{"seaweed-volume/src/storage/super_block.rs"} + case strings.HasPrefix(rel, "weed/storage/types/"): + return []string{"seaweed-volume/src/storage/types.rs"} + case strings.HasPrefix(rel, "weed/storage/backend/s3_backend/"): + return []string{"seaweed-volume/src/remote_storage/s3.rs", "seaweed-volume/src/remote_storage/s3_tier.rs"} + case strings.HasPrefix(rel, "weed/storage/backend/"): + return []string{"seaweed-volume/src/storage/volume.rs", "seaweed-volume/src/storage/mod.rs"} + case strings.HasPrefix(rel, "weed/storage/idx/"): + return []string{"seaweed-volume/src/storage/idx/mod.rs"} + case strings.HasPrefix(rel, "weed/storage/volume"): + return []string{"seaweed-volume/src/storage/volume.rs"} + case strings.HasPrefix(rel, "weed/storage/"): + return []string{"seaweed-volume/src/storage/mod.rs"} + default: + return nil + } +} + +func cleanDocComment(group *ast.CommentGroup) string { + if group == nil { + return "" + } + text := strings.TrimSpace(group.Text()) + return strings.Join(strings.Fields(text), " ") +} + +func nodeString(fset *token.FileSet, node any) string { + if node == nil { + return "" + } + var buf bytes.Buffer + if err := printer.Fprint(&buf, fset, node); err != nil { + return "" + } + return buf.String() +} + +func fieldListString(fset *token.FileSet, fields *ast.FieldList) string { + if fields == nil { + return "" + } + var parts []string + for _, field := range fields.List { + names := make([]string, 0, len(field.Names)) + for _, name := range field.Names { + names = append(names, name.Name) + } + typeText := sanitizeInline(nodeString(fset, field.Type)) + if len(names) == 0 { + parts = append(parts, typeText) + continue + } + parts = append(parts, strings.Join(names, ", ")+" "+typeText) + } + return strings.Join(parts, ", ") +} + +func joinNodes(fset *token.FileSet, nodes []ast.Expr) string { + parts := make([]string, 0, len(nodes)) + for _, node := range nodes { + if node == nil { + continue + } + text := nodeString(fset, node) + if text != "" { + parts = append(parts, text) + } + } + return strings.Join(parts, ", ") +} + +func splitLines(src string) []string { + src = strings.ReplaceAll(src, "\r\n", "\n") + src = strings.ReplaceAll(src, "\r", "\n") + return strings.Split(src, "\n") +} + +func normalizeReceiverType(receiver string) string { + receiver = strings.TrimPrefix(receiver, "*") + receiver = strings.ReplaceAll(receiver, " ", "") + return receiver +} + +func simpleCallName(expr ast.Expr) string { + switch e := expr.(type) { + case *ast.Ident: + return e.Name + case *ast.SelectorExpr: + return e.Sel.Name + case *ast.IndexExpr: + return simpleCallName(e.X) + case *ast.IndexListExpr: + return simpleCallName(e.X) + case *ast.ParenExpr: + return simpleCallName(e.X) + default: + return "" + } +} + +func simpleNameFromDisplay(display string) string { + if strings.Contains(display, ".") { + parts := strings.Split(display, ".") + return parts[len(parts)-1] + } + if strings.Contains(display, "(") { + return strings.TrimSpace(strings.Split(display, "(")[0]) + } + return display +} + +func sortedKeys(set map[string]struct{}) []string { + items := make([]string, 0, len(set)) + for key := range set { + items = append(items, key) + } + sort.Strings(items) + return items +} + +func dedupeKeepOrder(items []string) []string { + var out []string + seen := map[string]struct{}{} + for _, item := range items { + if _, ok := seen[item]; ok { + continue + } + out = append(out, item) + seen[item] = struct{}{} + } + return out +} + +func sanitizeInline(s string) string { + s = strings.TrimSpace(s) + s = strings.ReplaceAll(s, "\n", " ") + s = strings.Join(strings.Fields(s), " ") + s = strings.ReplaceAll(s, "`", "'") + return s +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/.cargo-ok b/seaweed-volume/vendor/reed-solomon-erasure/.cargo-ok new file mode 100644 index 000000000..5f8b79583 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/.cargo-ok @@ -0,0 +1 @@ +{"v":1} \ No newline at end of file diff --git a/seaweed-volume/vendor/reed-solomon-erasure/.cargo_vcs_info.json b/seaweed-volume/vendor/reed-solomon-erasure/.cargo_vcs_info.json new file mode 100644 index 000000000..9df5c3075 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "a1ca49de5384445b68ade7d72f31f0379c199943" + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/seaweed-volume/vendor/reed-solomon-erasure/.gitattributes b/seaweed-volume/vendor/reed-solomon-erasure/.gitattributes new file mode 100644 index 000000000..1af754e96 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/.gitattributes @@ -0,0 +1,3 @@ +BackBlaze_JavaReedSolomon/* linguist-vendored +KlausPost_reedsolomon/* linguist-vendored +NicolasT_reedsolomon/* linguist-vendored \ No newline at end of file diff --git a/seaweed-volume/vendor/reed-solomon-erasure/.gitignore b/seaweed-volume/vendor/reed-solomon-erasure/.gitignore new file mode 100644 index 000000000..e9e21997b --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/.gitignore @@ -0,0 +1,2 @@ +/target/ +/Cargo.lock diff --git a/seaweed-volume/vendor/reed-solomon-erasure/CHANGELOG.md b/seaweed-volume/vendor/reed-solomon-erasure/CHANGELOG.md new file mode 100644 index 000000000..5a86a1329 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/CHANGELOG.md @@ -0,0 +1,181 @@ +## 6.0.0 +- Use LruCache instead of InversionTree for caching data decode matrices + - See [PR #104](https://github.com/rust-rse/reed-solomon-erasure/pull/104) +- Minor code duplication + - See [PR #102](https://github.com/rust-rse/reed-solomon-erasure/pull/102) +- Dependencies update + - Updated `smallvec` from `0.6.1` to `1.8.0` + +## 5.0.3 +- Fixed cross build bug for aarch64 with simd-accel + - See [PR #100](https://github.com/rust-rse/reed-solomon-erasure/pull/100) + +## 5.0.2 +* Add support for `RUST_REED_SOLOMON_ERASURE_ARCH` environment variable and stop using `native` architecture for SIMD code + - See [PR #98](https://github.com/rust-rse/reed-solomon-erasure/pull/98) + +## 5.0.1 +- The `simd-accel` feature now builds on M1 Macs + - See [PR #92](https://github.com/rust-rse/reed-solomon-erasure/pull/92) +- Minor code cleanup + +## 5.0.0 +- Merged several PRs +- Not fully reviewed as I am no longer maintaining this crate + +## 4.0.2 +- Updated build.rs to respect RUSTFLAGS's target-cpu if available + - See [PR #75](https://github.com/darrenldl/reed-solomon-erasure/pull/75) +- Added AVX512 support + - See [PR #69](https://github.com/darrenldl/reed-solomon-erasure/pull/69) +- Disabled SIMD acceleration when MSVC is being used to build the library + - See [PR #67](https://github.com/darrenldl/reed-solomon-erasure/pull/67) +- Dependencies update + - Updated `smallvec` from `0.6` to `1.2` + +## 4.0.1 +- Updated SIMD C code for Windows compatibility + - Removed include of `unistd.h` in `simd_c/reedsolomon.c` + - Removed GCC `nonnull` attribute in `simd_c/reedsolomon.h` + - See PR [#63](https://github.com/darrenldl/reed-solomon-erasure/pull/63) [#64](https://github.com/darrenldl/reed-solomon-erasure/pull/64) for details +- Replaced use of `libc::uint8_t` in `src/galois_8.rs` with `u8` + +## 4.0.0 +- Major API restructure: removed `Shard` type in favor of generic functions +- The logic of this crate is now generic over choice of finite field +- The SIMD acceleration feature for GF(2^8) is now activated with the `simd-accel` Cargo feature. Pure-rust behavior is default. +- Ran rustfmt +- Adds a GF(2^16) implementation + +## 3.1.2 (not published) +- Doc fix + - Added space before parantheses in code comments and documentation +- Disabled SIMD C code for Android and iOS targets entirely + +## 3.1.1 +- Fixed `Matrix::augment` + - The error checking code was incorrect + - Since this method is used in internal code only, and the only use case is a correct use case, the error did not lead to any bugs +- Fixed benchmark data + - Previously used MB=10^6 bytes while I should have used MB=2^20 bytes + - Table in README has been updated accordingly + - The `>= 2.1.0` data is obtained by measuring again with the corrected `rse-benchmark` code + - The `2.0.X` and `1.X.X` data are simply adjusted by mutiplying `10^6` then dividing by `2^20` +- Dependencies update + - Updated `rand` from `0.4` to `0.5.4` +- Added special handling in `build.rs` for CC options on Android and iOS + - `-march=native` is not available for GCC on Android, see issue #23 + +## 3.1.0 +- Impl'd `std::error::Error` for `reed_solomon_erasure::Error` and `reed_solomon_erasure::SBSError` + - See issue [#17](https://github.com/darrenldl/reed-solomon-erasure/issues/17), suggested by [DrPeterVanNostrand](https://github.com/DrPeterVanNostrand) +- Added fuzzing suite + - No code changes due to this as no bugs were found +- Upgraded InversionTree QuickCheck test + - No code changes due to this as no bugs were found +- Upgraded test suite for main codec methods (e.g. encode, reconstruct) + - A lot of heavy QuickCheck tests were added + - No code changes due to this as no bugs were found +- Upgraded test suite for ShardByShard methods + - A lot of heavy QuickCheck tests were added + - No code changes due to this as no bugs were found +- Minor code refactoring in `reconstruct_internal` method + - This means `reconstruct` and related methods are slightly more optimized + +## 3.0.3 +- Added QuickCheck tests to the test suite + - InversionTree is heavily tested now +- No code changes as no bugs were found +- Deps update + - Updated rayon from 0.9 to 1.0 + +## 3.0.2 +- Same as 3.0.1, but 3.0.1 had unapplied changes + +## 3.0.1 (yanked) +- Updated doc for `with_buffer` variants of verifying methods + - Stated explicitly that the buffer contains the correct parity shards after a successful call +- Added tests for the above statement + +## 3.0.0 +- Added `with_buffer` variants for verifying methods + - This gives user the option of reducing heap allocation(s) +- Core code clean up, improvements, and review, added more AUDIT comments +- Improved shard utils +- Added code to remove leftover parity shards in `reconstruct_data_shards` + - This means one fewer gotcha of using the methods +- `ShardByShard` code review and overhaul +- `InversionTree` code review and improvements + +## 2.4.0 +- Added more flexibility for `convert_2D_slices` macro + - Now accepts expressions rather than just identifiers + - The change requires change of syntax + +## 2.3.3 +- Replaced all slice splitting functions in `misc_utils` with std lib ones or rayon ones + - This means there are fewer heap allocations in general + +## 2.3.2 +- Made `==`(`eq`) for `ReedSolomon` more reasonable + - Previously `==` would compare + - data shard count + - parity shard count + - total shard count + - internal encoding matrix + - internal `ParallelParam` + - Now it only compares + - data shard count + - parity shard count + +## 2.3.1 +- Added info on encoding behaviour to doc + +## 2.3.0 +- Made Reed-Solomon codec creation methods return error instead of panic when shard numbers are not correct + +## 2.2.0 +- Fixed SBS error checking code +- Documentation fixes and polishing +- Renamed `Error::InvalidShardsIndicator` to `Error::InvalidShardFlags` +- Added more details to documentation on error handling +- Error handling code overhaul and checks for all method variants +- Dead commented out code cleanup and indent fix + +## 2.1.0 +- Added Nicolas's SIMD C code files, gaining major speedup on supported CPUs +- Added support for "shard by shard" encoding, allowing easier streamed encoding +- Added functions for shard by shard encoding + +## 2.0.0 +- Complete rewrite of most code following Klaus Post's design +- Added optimsations (parallelism, loop unrolling) +- 4-5x faster than `1.X.X` + +## 1.1.1 +- Documentation polish +- Added documentation badge to README +- Optimised internal matrix related operations + - This largely means `decode_missing` is faster + +## 1.1.0 +- Added more helper functions +- Added more tests + +## 1.0.1 +- Added more tests +- Fixed decode_missing + - Previously may reconstruct the missing shards with incorrect length + +## 1.0.0 +- Added more tests +- Added integration with Codecov (via kcov) +- Code refactoring +- Added integration with Coveralls (via kcov) + +## 0.9.1 +- Code restructuring +- Added documentation + +## 0.9.0 +- Base version diff --git a/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml b/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml new file mode 100644 index 000000000..a6171580a --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml @@ -0,0 +1,87 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2018" +name = "reed-solomon-erasure" +version = "6.0.0" +authors = ["Darren Ldl "] +build = "build.rs" +exclude = [ + "appveyor.yml", + ".travis.yml", +] +description = "Rust implementation of Reed-Solomon erasure coding" +homepage = "https://github.com/darrenldl/reed-solomon-erasure" +documentation = "https://docs.rs/reed-solomon-erasure" +readme = "README.md" +keywords = [ + "reed-solomon", + "erasure", +] +categories = ["encoding"] +license = "MIT" +repository = "https://github.com/darrenldl/reed-solomon-erasure" + +[[bench]] +name = "reconstruct" + +[dependencies.libc] +version = "0.2" +optional = true + +[dependencies.libm] +version = "0.2.1" + +[dependencies.lru] +version = "0.16.3" + +[dependencies.parking_lot] +version = "0.11.2" +optional = true + +[dependencies.smallvec] +version = "1.2" + +[dependencies.spin] +version = "0.9.2" +features = ["spin_mutex"] +default-features = false + +[dev-dependencies.quickcheck] +version = "0.9" + +[dev-dependencies.rand] +version = "0.7.2" + +[build-dependencies.cc] +version = "1.0" +optional = true + +[features] +default = ["std"] +simd-accel = [ + "cc", + "libc", +] +std = ["parking_lot"] + +[badges.appveyor] +repository = "darrenldl/reed-solomon-erasure" + +[badges.codecov] +repository = "darrenldl/reed-solomon-erasure" + +[badges.coveralls] +repository = "darrenldl/reed-solomon-erasure" + +[badges.travis-ci] +repository = "darrenldl/reed-solomon-erasure" diff --git a/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml.orig b/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml.orig new file mode 100644 index 000000000..e9cbc8cf9 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml.orig @@ -0,0 +1,56 @@ +[package] +name= "reed-solomon-erasure" +version = "6.0.0" +authors = ["Darren Ldl "] +edition = "2018" +build = "build.rs" +exclude = [ + "appveyor.yml", + ".travis.yml" +] + +description = "Rust implementation of Reed-Solomon erasure coding" + +documentation = "https://docs.rs/reed-solomon-erasure" +homepage= "https://github.com/darrenldl/reed-solomon-erasure" +repository= "https://github.com/darrenldl/reed-solomon-erasure" + +readme= "README.md" + +keywords= ["reed-solomon", "erasure"] + +categories= ["encoding"] + +license = "MIT" + +[features] +default = ["std"] # simd off by default +std = ["parking_lot"] +simd-accel = ["cc", "libc"] + +[badges] +travis-ci = { repository = "darrenldl/reed-solomon-erasure" } +appveyor= { repository = "darrenldl/reed-solomon-erasure" } +codecov = { repository = "darrenldl/reed-solomon-erasure" } +coveralls = { repository = "darrenldl/reed-solomon-erasure" } + +[dependencies] +libc = { version = "0.2", optional = true } +# `log2()` impl for `no_std` +libm = "0.2.1" +lru = "0.16.3" +# Efficient `Mutex` implementation for `std` environment +parking_lot = { version = "0.11.2", optional = true } +smallvec = "1.2" +# `Mutex` implementation for `no_std` environment with the same high-level API as `parking_lot` +spin = { version = "0.9.2", default-features = false, features = ["spin_mutex"] } + +[dev-dependencies] +rand = "0.7.2" +quickcheck = "0.9" + +[build-dependencies] +cc = { version = "1.0", optional = true } + +[[bench]] +name = "reconstruct" diff --git a/seaweed-volume/vendor/reed-solomon-erasure/LICENSE b/seaweed-volume/vendor/reed-solomon-erasure/LICENSE new file mode 100644 index 000000000..87c0c3787 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/LICENSE @@ -0,0 +1,24 @@ +MIT License + +Copyright (c) 2017 Darren Ldl +Copyright (c) 2015, 2016 Nicolas Trangez +Copyright (c) 2015 Klaus Post +Copyright (c) 2015 Backblaze + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/seaweed-volume/vendor/reed-solomon-erasure/README.md b/seaweed-volume/vendor/reed-solomon-erasure/README.md new file mode 100644 index 000000000..5d79fab7c --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/README.md @@ -0,0 +1,166 @@ +# reed-solomon-erasure +[![Build Status](https://travis-ci.org/darrenldl/reed-solomon-erasure.svg?branch=master)](https://travis-ci.org/darrenldl/reed-solomon-erasure) +[![Build status](https://ci.appveyor.com/api/projects/status/47c0emjoa9bhpjlb/branch/master?svg=true)](https://ci.appveyor.com/project/darrenldl/reed-solomon-erasure/branch/master) +[![codecov](https://codecov.io/gh/darrenldl/reed-solomon-erasure/branch/master/graph/badge.svg)](https://codecov.io/gh/darrenldl/reed-solomon-erasure) +[![Coverage Status](https://coveralls.io/repos/github/darrenldl/reed-solomon-erasure/badge.svg?branch=master)](https://coveralls.io/github/darrenldl/reed-solomon-erasure?branch=master) +[![Crates](https://img.shields.io/crates/v/reed-solomon-erasure.svg)](https://crates.io/crates/reed-solomon-erasure) +[![Documentation](https://docs.rs/reed-solomon-erasure/badge.svg)](https://docs.rs/reed-solomon-erasure) +[![dependency status](https://deps.rs/repo/github/darrenldl/reed-solomon-erasure/status.svg)](https://deps.rs/repo/github/darrenldl/reed-solomon-erasure) + +Rust implementation of Reed-Solomon erasure coding + +WASM builds are also available, see section **WASM usage** below for details + +This is a port of [BackBlaze's Java implementation](https://github.com/Backblaze/JavaReedSolomon), [Klaus Post's Go implementation](https://github.com/klauspost/reedsolomon), and [Nicolas Trangez's Haskell implementation](https://github.com/NicolasT/reedsolomon). + +Version `1.X.X` copies BackBlaze's implementation, and is less performant as there were fewer places where parallelism could be added. + +Version `>= 2.0.0` copies Klaus Post's implementation. The SIMD C code is copied from Nicolas Trangez's implementation with minor modifications. + +See [Notes](#notes) and [License](#license) section for details. + +## WASM usage + +See [here](wasm/README.md) for details + +## Rust usage +Add the following to your `Cargo.toml` for the normal version (pure Rust version) +```toml +[dependencies] +reed-solomon-erasure = "4.0" +``` +or the following for the version which tries to utilise SIMD +```toml +[dependencies] +reed-solomon-erasure = { version = "4.0", features = [ "simd-accel" ] } +``` +and the following to your crate root +```rust +extern crate reed_solomon_erasure; +``` + +NOTE: `simd-accel` is tuned for Haswell+ processors on x86-64 and not in any way for other architectures, set +environment variable `RUST_REED_SOLOMON_ERASURE_ARCH` during build to force compilation of C code for specific architecture (`-march` flag in +GCC/Clang). Even on x86-64 you can achieve better performance by setting it to `native`, but it will stop running on +older CPUs, YMMV. + +## Example +```rust +#[macro_use(shards)] +extern crate reed_solomon_erasure; + +use reed_solomon_erasure::galois_8::ReedSolomon; +// or use the following for Galois 2^16 backend +// use reed_solomon_erasure::galois_16::ReedSolomon; + +fn main () { + let r = ReedSolomon::new(3, 2).unwrap(); // 3 data shards, 2 parity shards + + let mut master_copy = shards!( + [0, 1, 2, 3], + [4, 5, 6, 7], + [8, 9, 10, 11], + [0, 0, 0, 0], // last 2 rows are parity shards + [0, 0, 0, 0] + ); + + // Construct the parity shards + r.encode(&mut master_copy).unwrap(); + + // Make a copy and transform it into option shards arrangement + // for feeding into reconstruct_shards + let mut shards: Vec<_> = master_copy.iter().cloned().map(Some).collect(); + + // We can remove up to 2 shards, which may be data or parity shards + shards[0] = None; + shards[4] = None; + + // Try to reconstruct missing shards + r.reconstruct(&mut shards).unwrap(); + + // Convert back to normal shard arrangement + let result: Vec<_> = shards.into_iter().filter_map(|x| x).collect(); + + assert!(r.verify(&result).unwrap()); + assert_eq!(master_copy, result); +} +``` + +## Benchmark it yourself +You can test performance under different configurations quickly (e.g. data parity shards ratio, parallel parameters) +by cloning this repo: https://github.com/darrenldl/rse-benchmark + +`rse-benchmark` contains a copy of this library (usually a fully functional dev version), so you only need to adjust `main.rs` +then do `cargo run --release` to start the benchmark. + +## Performance +Version `1.X.X`, `2.0.0` do not utilise SIMD. + +Version `2.1.0` onward uses Nicolas's C files for SIMD operations. + +Machine: laptop with `Intel(R) Core(TM) i5-3337U CPU @ 1.80GHz (max 2.70GHz) 2 Cores 4 Threads` + +Below shows the result of one of the test configurations, other configurations show similar results in terms of ratio. + +|Configuration| Klaus Post's | >= 2.1.0 && < 4.0.0 | 2.0.X | 1.X.X | +|---|---|---|---|---| +| 10x2x1M | ~7800MB/s |~4500MB/s | ~1000MB/s | ~240MB/s | + +Versions `>= 4.0.0` have not been benchmarked thoroughly yet + +## Changelog +[Changelog](CHANGELOG.md) + +## Contributions +Contributions are welcome. Note that by submitting contributions, you agree to license your work under the same license used by this project as stated in the LICENSE file. + +## Credits +#### Library overhaul and Galois 2^16 backend +Many thanks to the following people for overhaul of the library and introduction of Galois 2^16 backend + + - [@drskalman](https://github.com/drskalman) + + - Jeff Burdges [@burdges](https://github.com/burdges) + + - Robert Habermeier [@rphmeier](https://github.com/rphmeier) + +#### WASM builds +Many thanks to Nazar Mokrynskyi [@nazar-pc](https://github.com/nazar-pc) for submitting his package for WASM builds + +He is the original author of the files stored in `wasm` folder. The files may have been modified by me later. + +#### AVX512 support +Many thanks to [@sakridge](https://github.com/sakridge) for adding support for AVX512 (see [PR #69](https://github.com/darrenldl/reed-solomon-erasure/pull/69)) + +#### build.rs improvements +Many thanks to [@ryoqun](https://github.com/ryoqun) for improving the usability of the library in the context of cross-compilation (see [PR #75](https://github.com/darrenldl/reed-solomon-erasure/pull/75)) + +#### no_std support +Many thanks to Nazar Mokrynskyi [@nazar-pc](https://github.com/nazar-pc) for adding `no_std` support (see [PR #90](https://github.com/darrenldl/reed-solomon-erasure/pull/90)) + +#### Testers +Many thanks to the following people for testing and benchmarking on various platforms + + - Laurențiu Nicola [@lnicola](https://github.com/lnicola/) (platforms: Linux, Intel) + + - Roger Andersen [@hexjelly](https://github.com/hexjelly) (platforms: Windows, AMD) + +## Notes +#### Code quality review +If you'd like to evaluate the quality of this library, you may find audit comments helpful. + +Simply search for "AUDIT" to see the dev notes that are aimed at facilitating code reviews. + +#### Implementation notes +The `1.X.X` implementation mostly copies [BackBlaze's Java implementation](https://github.com/Backblaze/JavaReedSolomon). + +`2.0.0` onward mostly copies [Klaus Post's Go implementation](https://github.com/klauspost/reedsolomon), and copies C files from [Nicolas Trangez's Haskell implementation](https://github.com/NicolasT/reedsolomon). + +The test suite for all versions copies [Klaus Post's Go implementation](https://github.com/klauspost/reedsolomon) as basis. + +## License +#### Nicolas Trangez's Haskell Reed-Solomon implementation +The C files for SIMD operations are copied (with no/minor modifications) from [Nicolas Trangez's Haskell implementation](https://github.com/NicolasT/reedsolomon), and are under the same MIT License as used by NicolasT's project + +#### TL;DR +All files are released under the MIT License diff --git a/seaweed-volume/vendor/reed-solomon-erasure/benches/reconstruct.rs b/seaweed-volume/vendor/reed-solomon-erasure/benches/reconstruct.rs new file mode 100644 index 000000000..e9d6b6f07 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/benches/reconstruct.rs @@ -0,0 +1,108 @@ +#![feature(test)] + +extern crate test; + +use { + rand::{prelude::*, Rng}, + reed_solomon_erasure::galois_8::Field, + test::Bencher, +}; + +type ReedSolomon = reed_solomon_erasure::ReedSolomon; + +const SHARD_SIZE: usize = 1024; + +fn run_reconstruct_bench(bencher: &mut Bencher, num_data_shards: usize, num_parity_shards: usize) { + let mut rng = rand::thread_rng(); + let mut shards = vec![vec![0u8; SHARD_SIZE]; num_data_shards + num_parity_shards]; + for shard in &mut shards[..num_data_shards] { + rng.fill(&mut shard[..]); + } + let reed_solomon = ReedSolomon::new(num_data_shards, num_parity_shards).unwrap(); + reed_solomon.encode(&mut shards[..]).unwrap(); + let shards: Vec<_> = shards.into_iter().map(Some).collect(); + + bencher.iter(|| { + let mut shards = shards.clone(); + for _ in 0..num_parity_shards { + *shards.choose_mut(&mut rng).unwrap() = None; + } + reed_solomon.reconstruct(&mut shards[..]).unwrap(); + assert!(shards.iter().all(Option::is_some)); + }); +} + +#[bench] +fn bench_reconstruct_2_2(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 2, 2) +} + +#[bench] +fn bench_reconstruct_4_2(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 4, 2) +} + +#[bench] +fn bench_reconstruct_4_4(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 4, 4) +} + +#[bench] +fn bench_reconstruct_8_2(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 8, 2) +} + +#[bench] +fn bench_reconstruct_8_4(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 8, 4) +} + +#[bench] +fn bench_reconstruct_8_8(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 8, 8) +} + +#[bench] +fn bench_reconstruct_16_2(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 16, 2) +} + +#[bench] +fn bench_reconstruct_16_4(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 16, 4) +} + +#[bench] +fn bench_reconstruct_16_8(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 16, 8) +} + +#[bench] +fn bench_reconstruct_16_16(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 16, 16) +} + +#[bench] +fn bench_reconstruct_32_2(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 32, 2) +} + +#[bench] +fn bench_reconstruct_32_4(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 32, 4) +} + +#[bench] +fn bench_reconstruct_32_8(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 32, 8) +} + +#[bench] +fn bench_reconstruct_32_16(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 32, 16) +} + +#[bench] +fn bench_reconstruct_32_32(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 32, 32) +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/build.rs b/seaweed-volume/vendor/reed-solomon-erasure/build.rs new file mode 100644 index 000000000..de9c5f18e --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/build.rs @@ -0,0 +1,196 @@ +use std::env; +use std::fs::File; +use std::io::Write; +use std::path::Path; + +#[cfg(feature = "simd-accel")] +extern crate cc; + +const FIELD_SIZE: usize = 256; + +const GENERATING_POLYNOMIAL: usize = 29; + +fn gen_log_table(polynomial: usize) -> [u8; FIELD_SIZE] { + let mut result: [u8; FIELD_SIZE] = [0; FIELD_SIZE]; + let mut b: usize = 1; + + for log in 0..FIELD_SIZE - 1 { + result[b] = log as u8; + + b = b << 1; + + if FIELD_SIZE <= b { + b = (b - FIELD_SIZE) ^ polynomial; + } + } + + result +} + +const EXP_TABLE_SIZE: usize = FIELD_SIZE * 2 - 2; + +fn gen_exp_table(log_table: &[u8; FIELD_SIZE]) -> [u8; EXP_TABLE_SIZE] { + let mut result: [u8; EXP_TABLE_SIZE] = [0; EXP_TABLE_SIZE]; + + for i in 1..FIELD_SIZE { + let log = log_table[i] as usize; + result[log] = i as u8; + result[log + FIELD_SIZE - 1] = i as u8; + } + + result +} + +fn multiply(log_table: &[u8; FIELD_SIZE], exp_table: &[u8; EXP_TABLE_SIZE], a: u8, b: u8) -> u8 { + if a == 0 || b == 0 { + 0 + } else { + let log_a = log_table[a as usize]; + let log_b = log_table[b as usize]; + let log_result = log_a as usize + log_b as usize; + exp_table[log_result] + } +} + +fn gen_mul_table( + log_table: &[u8; FIELD_SIZE], + exp_table: &[u8; EXP_TABLE_SIZE], +) -> [[u8; FIELD_SIZE]; FIELD_SIZE] { + let mut result: [[u8; FIELD_SIZE]; FIELD_SIZE] = [[0; 256]; 256]; + + for a in 0..FIELD_SIZE { + for b in 0..FIELD_SIZE { + result[a][b] = multiply(log_table, exp_table, a as u8, b as u8); + } + } + + result +} + +fn gen_mul_table_half( + log_table: &[u8; FIELD_SIZE], + exp_table: &[u8; EXP_TABLE_SIZE], +) -> ([[u8; 16]; FIELD_SIZE], [[u8; 16]; FIELD_SIZE]) { + let mut low: [[u8; 16]; FIELD_SIZE] = [[0; 16]; FIELD_SIZE]; + let mut high: [[u8; 16]; FIELD_SIZE] = [[0; 16]; FIELD_SIZE]; + + for a in 0..low.len() { + for b in 0..low.len() { + let mut result = 0; + if !(a == 0 || b == 0) { + let log_a = log_table[a]; + let log_b = log_table[b]; + result = exp_table[log_a as usize + log_b as usize]; + } + if (b & 0x0F) == b { + low[a][b] = result; + } + if (b & 0xF0) == b { + high[a][b >> 4] = result; + } + } + } + (low, high) +} + +macro_rules! write_table { + (1D => $file:ident, $table:ident, $name:expr, $type:expr) => {{ + let len = $table.len(); + let mut table_str = String::from(format!("pub static {}: [{}; {}] = [", $name, $type, len)); + + for v in $table.iter() { + let str = format!("{}, ", v); + table_str.push_str(&str); + } + + table_str.push_str("];\n"); + + $file.write_all(table_str.as_bytes()).unwrap(); + }}; + (2D => $file:ident, $table:ident, $name:expr, $type:expr) => {{ + let rows = $table.len(); + let cols = $table[0].len(); + let mut table_str = String::from(format!( + "pub static {}: [[{}; {}]; {}] = [", + $name, $type, cols, rows + )); + + for a in $table.iter() { + table_str.push_str("["); + for b in a.iter() { + let str = format!("{}, ", b); + table_str.push_str(&str); + } + table_str.push_str("],\n"); + } + + table_str.push_str("];\n"); + + $file.write_all(table_str.as_bytes()).unwrap(); + }}; +} + +fn write_tables() { + let log_table = gen_log_table(GENERATING_POLYNOMIAL); + let exp_table = gen_exp_table(&log_table); + let mul_table = gen_mul_table(&log_table, &exp_table); + + let out_dir = env::var("OUT_DIR").unwrap(); + let dest_path = Path::new(&out_dir).join("table.rs"); + let mut f = File::create(&dest_path).unwrap(); + + write_table!(1D => f, log_table, "LOG_TABLE", "u8"); + write_table!(1D => f, exp_table, "EXP_TABLE", "u8"); + write_table!(2D => f, mul_table, "MUL_TABLE", "u8"); + + if cfg!(feature = "simd-accel") { + let (mul_table_low, mul_table_high) = gen_mul_table_half(&log_table, &exp_table); + + write_table!(2D => f, mul_table_low, "MUL_TABLE_LOW", "u8"); + write_table!(2D => f, mul_table_high, "MUL_TABLE_HIGH", "u8"); + } +} + +#[cfg(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +))] +fn compile_simd_c() { + let mut build = cc::Build::new(); + build.opt_level(3); + + match env::var("RUST_REED_SOLOMON_ERASURE_ARCH") { + Ok(arch) => { + // Use explicitly specified environment variable as architecture. + build.flag(&format!("-march={}", arch)); + } + Err(_error) => { + // On x86-64 enabling Haswell architecture unlocks useful instructions and improves performance + // dramatically while allowing it to run ony modern CPU. + match env::var("CARGO_CFG_TARGET_ARCH").unwrap().as_str(){ + "x86_64" => { build.flag(&"-march=haswell"); }, + _ => () + } + } + } + + build + .flag("-std=c11") + .file("simd_c/reedsolomon.c") + .compile("reedsolomon"); +} + +#[cfg(not(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +)))] +fn compile_simd_c() {} + +fn main() { + compile_simd_c(); + write_tables(); +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/sage/galois_ext_test.sage b/seaweed-volume/vendor/reed-solomon-erasure/sage/galois_ext_test.sage new file mode 100644 index 000000000..cab1bdf10 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/sage/galois_ext_test.sage @@ -0,0 +1,26 @@ +GF256. = FiniteField(256) +R. = GF256[x] +ext_poly = R.irreducible_element(2,algorithm="first_lexicographic" ) +ExtField. = GF256.extension(ext_poly) +print ExtField +print len(ExtField) + +x^2 + a*x + a^7 + +e1 = (a^7 + a^6 + a^4 + a)*b + a^3 + a^2 + a + 1 +e2 = (a^7 + a^5 + a^2)*b + a^7 + a^4 + a^3 + a + +print "e1: ", e1 +print "e2: ", e2 + +print "e1 + e2: ", e1 + e2 +#(a^6 + a^5 + a^4 + a^2 + a)*b + a^7 + a^4 + a^2 + 1 + +print "e1 * e2: ", e1 * e2 +#(a^4 + a^2 + a + 1)*b + a^7 + a^5 + a^3 + a + +print "e1 / e2: ", e1 / e2 +#(a^7 + a^6 + a^5 + a^4 + a^3 + a^2 + 1)*b + a^6 + a^3 + a + +print "1/b: ", 1/b +#(a^4 + a^3 + a + 1)*b + a^5 + a^4 + a^2 + a \ No newline at end of file diff --git a/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.c b/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.c new file mode 100644 index 000000000..12a921100 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.c @@ -0,0 +1,574 @@ +/* reedsolomon.c - SIMD-optimized Galois-field multiplication routines + * + * Copyright (c) 2015, 2016 Nicolas Trangez + * Copyright (c) 2015 Klaus Post + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE + */ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include + +//#if defined(__SSE2__) && __SSE2__ && defined(HAVE_EMMINTRIN_H) && HAVE_EMMINTRIN_H +//#ifdef __SSE2__ +#if defined(__SSE2__) && __SSE2__ +# define USE_SSE2 1 +# undef VECTOR_SIZE +# define VECTOR_SIZE 16 +# include +#else +# define USE_SSE2 0 +#endif + +//#if defined(__SSSE3__) && __SSSE3__ && defined(HAVE_TMMINTRIN_H) && HAVE_TMMINTRIN_H +//#ifdef __SSSE3__ +#if defined(__SSSE3__) && __SSSE3__ +# define USE_SSSE3 1 +# undef VECTOR_SIZE +# define VECTOR_SIZE 16 +# include +#else +# define USE_SSSE3 0 +#endif + +//#if defined(__AVX2__) && __AVX2__ && defined(HAVE_IMMINTRIN_H) && HAVE_IMMINTRIN_H +//#ifdef __AVX2__ +#if defined(__AVX2__) && __AVX2__ +# define USE_AVX2 1 +# undef VECTOR_SIZE +# define VECTOR_SIZE 32 +# include +#else +# define USE_AVX2 0 +#endif + + +#if defined(__AVX512F__) && __AVX512F__ +# define USE_AVX512 1 +# undef VECTOR_SIZE +# define VECTOR_SIZE 64 +# include +#else +# define USE_AVX512 0 +#endif + + +/*#if ((defined(__ARM_NEON__) && __ARM_NEON__) \ + || (defined(__ARM_NEON) && __ARM_NEON) \ + || (defined(__aarch64__) && __aarch64__)) \ + && defined(HAVE_ARM_NEON_H) && HAVE_ARM_NEON_H*/ +#if ((defined(__ARM_NEON__) && __ARM_NEON__) \ + || (defined(__ARM_NEON) && __ARM_NEON) \ + || (defined(__aarch64__) && __aarch64__)) +# define USE_ARM_NEON 1 +#undef VECTOR_SIZE +# define VECTOR_SIZE 16 +# include +#else +# define USE_ARM_NEON 0 +#endif + +//#if defined(__ALTIVEC__) && __ALTIVEC__ && defined(HAVE_ALTIVEC_H) && HAVE_ALTIVEC_H +#if defined(__ALTIVEC__) && __ALTIVEC__ +# define USE_ALTIVEC 1 +# undef VECTOR_SIZE +# define VECTOR_SIZE 16 +# include +#else +# define USE_ALTIVEC 0 +#endif + +#ifndef VECTOR_SIZE +/* 'Generic' code */ +# define VECTOR_SIZE 16 +#endif + +# define USE_ALIGNED_ACCESS 0 +# define ALIGNED_ACCESS __attribute__((unused)) +# define UNALIGNED_ACCESS + +#include "reedsolomon.h" + +#if defined(HAVE_FUNC_ATTRIBUTE_HOT) && HAVE_FUNC_ATTRIBUTE_HOT +# define HOT_FUNCTION __attribute__((hot)) +#else +# define HOT_FUNCTION +#endif + +#if defined(HAVE_FUNC_ATTRIBUTE_CONST) && HAVE_FUNC_ATTRIBUTE_CONST +# define CONST_FUNCTION __attribute__((const)) +#else +# define CONST_FUNCTION +#endif + +#if defined(HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE) && HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE +# define ALWAYS_INLINE inline __attribute__((always_inline)) +#else +# define ALWAYS_INLINE inline +#endif + +#if defined(HAVE_FUNC_ATTRIBUTE_FORCE_ALIGN_ARG_POINTER) && HAVE_FUNC_ATTRIBUTE_FORCE_ALIGN_ARG_POINTER +# define FORCE_ALIGN_ARG_POINTER __attribute__((force_align_arg_pointer)) +#else +# define FORCE_ALIGN_ARG_POINTER +#endif + +#define CONCAT_HELPER(a, b) a ## b +#define CONCAT(a, b) CONCAT_HELPER(a, b) + +typedef uint8_t v16u8v __attribute__((vector_size(16), aligned(1))); +typedef uint64_t v2u64v __attribute__((vector_size(16), aligned(1))); + +#define T(t, n) t n[VSIZE / 8 / sizeof(t)] +#define T1(t, n) t n + +#define VSIZE 128 +typedef union { + T(uint8_t, u8); + T(uint64_t, u64); +#if USE_SSE2 + T1(__m128i, m128i); +#endif +#if USE_ARM_NEON + T1(uint8x16_t, uint8x16); + T1(uint8x8x2_t, uint8x8x2); +#endif +#if USE_ALTIVEC + T1(__vector uint8_t, uint8x16); + T1(__vector uint64_t, uint64x2); +#endif + T1(v16u8v, v16u8); + T1(v2u64v, v2u64); +} v128 __attribute__((aligned(1))); +#undef VSIZE + +#define VSIZE 256 +typedef union { + T(uint8_t, u8); +#if USE_AVX2 + __m256i m256i; +#endif +} v256 __attribute__((aligned(1))); +#undef VSIZE + +#define VSIZE 512 +typedef union { + T(uint8_t, u8); +#if USE_AVX512 + __m512i m512i; +#endif +} v512 __attribute__((aligned(1))); + +#undef T +#undef T1 + +#if VECTOR_SIZE == 16 +typedef v128 v; +#elif VECTOR_SIZE == 32 +typedef v256 v; +#elif VECTOR_SIZE == 64 +typedef v512 v; +#else +# error Unsupported VECTOR_SIZE +#endif + +static ALWAYS_INLINE UNALIGNED_ACCESS v128 loadu_v128(const uint8_t *in) { +#if USE_SSE2 + const v128 result = { .m128i = _mm_loadu_si128((const __m128i *)in) }; +#else + v128 result; + memcpy(&result.u64, in, sizeof(result.u64)); +#endif + + return result; +} + +static ALWAYS_INLINE UNALIGNED_ACCESS v loadu_v(const uint8_t *in) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_loadu_si512((const __m512i *)in) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_loadu_si256((const __m256i *)in) }; +#else + const v128 result = loadu_v128(in); +#endif + + return result; +} + +static ALWAYS_INLINE ALIGNED_ACCESS v load_v(const uint8_t *in) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_load_si512((const __m512i *)in) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_load_si256((const __m256i *)in) }; +#elif USE_SSE2 + const v128 result = { .m128i = _mm_load_si128((const __m128i *)in) }; +#elif USE_ARM_NEON + const v128 result = { .uint8x16 = vld1q_u8(in) }; +#elif USE_ALTIVEC + const v128 result = { .uint8x16 = vec_ld(0, in) }; +#else + const v128 result = loadu_v128(in); +#endif + + return result; +} + +static ALWAYS_INLINE CONST_FUNCTION v set1_epi8_v(const uint8_t c) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_set1_epi8(c) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_set1_epi8(c) }; +#elif USE_SSE2 + const v128 result = { .m128i = _mm_set1_epi8(c) }; +#elif USE_ARM_NEON + const v128 result = { .uint8x16 = vdupq_n_u8(c) }; +#elif USE_ALTIVEC + const v128 result = { .uint8x16 = { c, c, c, c, c, c, c, c, + c, c, c, c, c, c, c, c } }; +#else + uint64_t c2 = c, + tmp = (c2 << (7 * 8)) | + (c2 << (6 * 8)) | + (c2 << (5 * 8)) | + (c2 << (4 * 8)) | + (c2 << (3 * 8)) | + (c2 << (2 * 8)) | + (c2 << (1 * 8)) | + (c2 << (0 * 8)); + const v128 result = { .u64 = { tmp, tmp } }; +#endif + + return result; +} + +static ALWAYS_INLINE CONST_FUNCTION v srli_epi64_v(const v in /*, const unsigned int n*/) { + // TODO: Hard code n to 4 to avoid build issues on M1 Macs (the + // `USE_ARM_NEON` path below) where apple clang is failing to + // recognize the constant `n`. + // + // See https://github.com/rust-rse/reed-solomon-erasure/pull/92 + // + #define n 4 +#if USE_AVX512 + const v512 result = { .m512i = _mm512_srli_epi64(in.m512i, n) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_srli_epi64(in.m256i, n) }; +#elif USE_SSE2 + const v128 result = { .m128i = _mm_srli_epi64(in.m128i, n) }; +#elif USE_ARM_NEON + const v128 result = { .uint8x16 = vshrq_n_u8(in.uint8x16, n) }; +#elif USE_ALTIVEC +# if RS_HAVE_VEC_VSRD + const v128 shift = { .v2u64 = { n, n } }, + result = { .uint64x2 = vec_vsrd(in.v2u64, shift.v2u64) }; +# else + const v128 result = { .v2u64 = in.v2u64 >> n }; +# endif +#else + const v128 result = { .u64 = { in.u64[0] >> n, + in.u64[1] >> n } }; +#endif + #undef n + return result; +} + +static ALWAYS_INLINE CONST_FUNCTION v and_v(const v a, const v b) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_and_si512(a.m512i, b.m512i) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_and_si256(a.m256i, b.m256i) }; +#elif USE_SSE2 + const v128 result = { .m128i = _mm_and_si128(a.m128i, b.m128i) }; +#elif USE_ARM_NEON + const v128 result = { .uint8x16 = vandq_u8(a.uint8x16, b.uint8x16) }; +#elif USE_ALTIVEC + const v128 result = { .uint8x16 = vec_and(a.uint8x16, b.uint8x16) }; +#else + const v128 result = { .v2u64 = a.v2u64 & b.v2u64 }; +#endif + + return result; +} + +static ALWAYS_INLINE CONST_FUNCTION v xor_v(const v a, const v b) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_xor_si512(a.m512i, b.m512i) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_xor_si256(a.m256i, b.m256i) }; +#elif USE_SSE2 + const v128 result = { .m128i = _mm_xor_si128(a.m128i, b.m128i) }; +#elif USE_ARM_NEON + const v128 result = { .uint8x16 = veorq_u8(a.uint8x16, b.uint8x16) }; +#elif USE_ALTIVEC + const v128 result = { .uint8x16 = vec_xor(a.uint8x16, b.uint8x16) }; +#else + const v128 result = { .v2u64 = a.v2u64 ^ b.v2u64 }; +#endif + + return result; +} + +static ALWAYS_INLINE CONST_FUNCTION v shuffle_epi8_v(const v vec, const v mask) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_shuffle_epi8(vec.m512i, mask.m512i) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_shuffle_epi8(vec.m256i, mask.m256i) }; +#elif USE_SSSE3 + const v128 result = { .m128i = _mm_shuffle_epi8(vec.m128i, mask.m128i) }; +#elif USE_ARM_NEON +# if defined(RS_HAVE_VQTBL1Q_U8) && RS_HAVE_VQTBL1Q_U8 + const v128 result = { .uint8x16 = vqtbl1q_u8(vec.uint8x16, mask.uint8x16) }; +# else + /* There's no NEON instruction mapping 1-to-1 to _mm_shuffle_epi8, but + * this should have the same result... + */ + const v128 result = { .uint8x16 = vcombine_u8(vtbl2_u8(vec.uint8x8x2, + vget_low_u8(mask.uint8x16)), + vtbl2_u8(vec.uint8x8x2, + vget_high_u8(mask.uint8x16))) }; + +# endif +#elif USE_ALTIVEC + const v128 zeros = set1_epi8_v(0), + result = { .uint8x16 = vec_perm(vec.uint8x16, zeros.uint8x16, mask.uint8x16) }; +#elif defined(RS_HAVE_BUILTIN_SHUFFLE) && RS_HAVE_BUILTIN_SHUFFLE + const v16u8v zeros = { 0, 0, 0, 0, 0, 0, 0, 0 + , 0, 0, 0, 0, 0, 0, 0, 0 }; + const v128 result = { .v16u8 = __builtin_shuffle(vec.v16u8, zeros, mask.v16u8) }; +#else + v128 result = { .u64 = { 0, 0 } }; + +# define DO_BYTE(i) \ + result.u8[i] = mask.u8[i] & 0x80 ? 0 : vec.u8[mask.u8[i] & 0x0F]; + + DO_BYTE( 0); DO_BYTE( 1); DO_BYTE( 2); DO_BYTE( 3); + DO_BYTE( 4); DO_BYTE( 5); DO_BYTE( 6); DO_BYTE( 7); + DO_BYTE( 8); DO_BYTE( 9); DO_BYTE(10); DO_BYTE(11); + DO_BYTE(12); DO_BYTE(13); DO_BYTE(14); DO_BYTE(15); +#endif + + return result; +} + +static ALWAYS_INLINE UNALIGNED_ACCESS void storeu_v(uint8_t *out, const v vec) { +#if USE_AVX512 + _mm512_storeu_si512((__m512i *)out, vec.m512i); +#elif USE_AVX2 + _mm256_storeu_si256((__m256i *)out, vec.m256i); +#elif USE_SSE2 + _mm_storeu_si128((__m128i *)out, vec.m128i); +#else + memcpy(out, &vec.u64, sizeof(vec.u64)); +#endif +} + +static ALWAYS_INLINE ALIGNED_ACCESS void store_v(uint8_t *out, const v vec) { +#if USE_AVX512 + _mm512_store_si512((__m512i *)out, vec.m512i); +#elif USE_AVX2 + _mm256_store_si256((__m256i *)out, vec.m256i); +#elif USE_SSE2 + _mm_store_si128((__m128i *)out, vec.m128i); +#elif USE_ARM_NEON + vst1q_u8(out, vec.uint8x16); +#elif USE_ALTIVEC + vec_st(vec.uint8x16, 0, out); +#else + storeu_v(out, vec); +#endif +} + +static ALWAYS_INLINE CONST_FUNCTION v replicate_v128_v(const v128 vec) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_broadcast_i32x4(vec.m128i) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_broadcastsi128_si256(vec.m128i) }; +#else + const v128 result = vec; +#endif + + return result; +} + + +//+build !noasm !appengine + +// Copyright 2015, Klaus Post, see LICENSE for details. + +// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf +// and http://jerasure.org/jerasure/gf-complete/tree/master + +/* +// func galMulSSSE3Xor(low, high, in, out []byte) +TEXT ·galMulSSSE3Xor(SB), 7, $0 + MOVQ low+0(FP),SI // SI: &low + MOVQ high+24(FP),DX // DX: &high + MOVOU (SI), X6 // X6 low + MOVOU (DX), X7 // X7: high + MOVQ $15, BX // BX: low mask + MOVQ BX, X8 + PXOR X5, X5 + MOVQ in+48(FP),SI // R11: &in + MOVQ in_len+56(FP),R9 // R9: len(in) + MOVQ out+72(FP), DX // DX: &out + PSHUFB X5, X8 // X8: lomask (unpacked) + SHRQ $4, R9 // len(in) / 16 + CMPQ R9 ,$0 + JEQ done_xor +loopback_xor: + MOVOU (SI),X0 // in[x] + MOVOU (DX),X4 // out[x] + MOVOU X0, X1 // in[x] + MOVOU X6, X2 // low copy + MOVOU X7, X3 // high copy + PSRLQ $4, X1 // X1: high input + PAND X8, X0 // X0: low input + PAND X8, X1 // X0: high input + PSHUFB X0, X2 // X2: mul low part + PSHUFB X1, X3 // X3: mul high part + PXOR X2, X3 // X3: Result + PXOR X4, X3 // X3: Result xor existing out + MOVOU X3, (DX) // Store + ADDQ $16, SI // in+=16 + ADDQ $16, DX // out+=16 + SUBQ $1, R9 + JNZ loopback_xor +done_xor: + RET + +// func galMulSSSE3(low, high, in, out []byte) +TEXT ·galMulSSSE3(SB), 7, $0 + MOVQ low+0(FP),SI // SI: &low + MOVQ high+24(FP),DX // DX: &high + MOVOU (SI), X6 // X6 low + MOVOU (DX), X7 // X7: high + MOVQ $15, BX // BX: low mask + MOVQ BX, X8 + PXOR X5, X5 + MOVQ in+48(FP),SI // R11: &in + MOVQ in_len+56(FP),R9 // R9: len(in) + MOVQ out+72(FP), DX // DX: &out + PSHUFB X5, X8 // X8: lomask (unpacked) + SHRQ $4, R9 // len(in) / 16 + CMPQ R9 ,$0 + JEQ done +loopback: + MOVOU (SI),X0 // in[x] + MOVOU X0, X1 // in[x] + MOVOU X6, X2 // low copy + MOVOU X7, X3 // high copy + PSRLQ $4, X1 // X1: high input + PAND X8, X0 // X0: low input + PAND X8, X1 // X0: high input + PSHUFB X0, X2 // X2: mul low part + PSHUFB X1, X3 // X3: mul high part + PXOR X2, X3 // X3: Result + MOVOU X3, (DX) // Store + ADDQ $16, SI // in+=16 + ADDQ $16, DX // out+=16 + SUBQ $1, R9 + JNZ loopback +done: + RET +*/ + +static ALWAYS_INLINE v reedsolomon_gal_mul_v( + const v low_mask_unpacked, + const v low_vector, + const v high_vector, + + v (*modifier)(const v new, const v old), + + const v in_x, + const v old) { + const v low_input = and_v(in_x, low_mask_unpacked), + in_x_shifted = srli_epi64_v(in_x /*, 4*/), + high_input = and_v(in_x_shifted, low_mask_unpacked), + + mul_low_part = shuffle_epi8_v(low_vector, low_input), + mul_high_part = shuffle_epi8_v(high_vector, high_input), + + new = xor_v(mul_low_part, mul_high_part), + result = modifier(new, old); + + return result; +} + +static ALWAYS_INLINE PROTO_RETURN reedsolomon_gal_mul_impl( + PROTO_ARGS, + v (*modifier)(const v new, const v old)) { + const v low_mask_unpacked = set1_epi8_v(0x0f); + + const v128 low_vector128 = loadu_v128(low), + high_vector128 = loadu_v128(high); + const v low_vector = replicate_v128_v(low_vector128), + high_vector = replicate_v128_v(high_vector128); + + size_t done = 0; + +#if USE_ALIGNED_ACCESS +# define LOAD(addr) load_v(addr) +# define STORE(addr, vec) store_v(addr, vec) +#else +# define LOAD(addr) loadu_v(addr) +# define STORE(addr, vec) storeu_v(addr, vec) +#endif + +#if RS_HAVE_CLANG_LOOP_UNROLL +# pragma clang loop unroll(enable) +#endif + for(size_t x = 0; x < len / sizeof(v); x++) { + const v in_x = LOAD(&in[done]), + old = LOAD(&out[done]), + result = reedsolomon_gal_mul_v( + low_mask_unpacked, + low_vector, high_vector, + modifier, + in_x, + old); + + STORE(&out[done], result); + + done += sizeof(v); + } + + return done; +} + +static ALWAYS_INLINE CONST_FUNCTION v noop(const v new, const v old __attribute__((__unused__))) { + return new; +} + +#ifdef HOT +HOT_FUNCTION +#endif +FORCE_ALIGN_ARG_POINTER PROTO(reedsolomon_gal_mul) { + return reedsolomon_gal_mul_impl(low, high, in, out, len, noop); +} + +#ifdef HOT +HOT_FUNCTION +#endif +FORCE_ALIGN_ARG_POINTER PROTO(reedsolomon_gal_mul_xor) { + return reedsolomon_gal_mul_impl(low, high, in, out, len, xor_v); +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.h b/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.h new file mode 100644 index 000000000..4bd9ec0e9 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.h @@ -0,0 +1,54 @@ +/* reedsolomon.h - SIMD-optimized Galois-field multiplication routines + * + * Copyright (c) 2015, 2016 Nicolas Trangez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE + */ + +#include + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#define PROTO_RETURN size_t +#define PROTO_ARGS \ + const uint8_t low[16], \ + const uint8_t high[16], \ + const uint8_t *restrict const in, \ + uint8_t *restrict const out, \ + const size_t len +#define PROTO(name) \ + PROTO_RETURN \ + name (PROTO_ARGS) + +PROTO(reedsolomon_gal_mul); +PROTO(reedsolomon_gal_mul_xor); + +typedef enum { + REEDSOLOMON_CPU_GENERIC = 0, + REEDSOLOMON_CPU_SSE2 = 1, + REEDSOLOMON_CPU_SSSE3 = 2, + REEDSOLOMON_CPU_AVX = 3, + REEDSOLOMON_CPU_AVX2 = 4, + REEDSOLOMON_CPU_NEON = 5, + REEDSOLOMON_CPU_ALTIVEC = 6, +} reedsolomon_cpu_support; + +reedsolomon_cpu_support reedsolomon_determine_cpu_support(void); diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/core.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/core.rs new file mode 100644 index 000000000..57733f588 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/core.rs @@ -0,0 +1,927 @@ +extern crate alloc; + +use alloc::sync::Arc; +use alloc::vec; +use alloc::vec::Vec; +use core::num::NonZeroUsize; + +use smallvec::SmallVec; + +use crate::errors::Error; +use crate::errors::SBSError; + +use crate::matrix::Matrix; + +use lru::LruCache; + +#[cfg(feature = "std")] +use parking_lot::Mutex; +#[cfg(not(feature = "std"))] +use spin::Mutex; + +use super::Field; +use super::ReconstructShard; + +const DATA_DECODE_MATRIX_CACHE_CAPACITY: usize = 254; + +// /// Parameters for parallelism. +// #[derive(PartialEq, Debug, Clone, Copy)] +// pub struct ParallelParam { +// /// Number of bytes to split the slices into for computations +// /// which can be done in parallel. +// /// +// /// Default is 32768. +// pub bytes_per_encode: usize, +// } + +// impl ParallelParam { +// /// Create a new `ParallelParam` with the given split arity. +// pub fn new(bytes_per_encode: usize) -> ParallelParam { +// ParallelParam { bytes_per_encode } +// } +// } + +// impl Default for ParallelParam { +// fn default() -> Self { +// ParallelParam::new(32768) +// } +// } + +/// Bookkeeper for shard by shard encoding. +/// +/// This is useful for avoiding incorrect use of +/// `encode_single` and `encode_single_sep` +/// +/// # Use cases +/// +/// Shard by shard encoding is useful for streamed data encoding +/// where you do not have all the needed data shards immediately, +/// but you want to spread out the encoding workload rather than +/// doing the encoding after everything is ready. +/// +/// A concrete example would be network packets encoding, +/// where encoding packet by packet as you receive them may be more efficient +/// than waiting for N packets then encode them all at once. +/// +/// # Example +/// +/// ``` +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # use reed_solomon_erasure::*; +/// # fn main () { +/// use reed_solomon_erasure::galois_8::Field; +/// let r: ReedSolomon = ReedSolomon::new(3, 2).unwrap(); +/// +/// let mut sbs = ShardByShard::new(&r); +/// +/// let mut shards = shards!([0u8, 1, 2, 3, 4], +/// [5, 6, 7, 8, 9], +/// // say we don't have the 3rd data shard yet +/// // and we want to fill it in later +/// [0, 0, 0, 0, 0], +/// [0, 0, 0, 0, 0], +/// [0, 0, 0, 0, 0]); +/// +/// // encode 1st and 2nd data shard +/// sbs.encode(&mut shards).unwrap(); +/// sbs.encode(&mut shards).unwrap(); +/// +/// // fill in 3rd data shard +/// shards[2][0] = 10.into(); +/// shards[2][1] = 11.into(); +/// shards[2][2] = 12.into(); +/// shards[2][3] = 13.into(); +/// shards[2][4] = 14.into(); +/// +/// // now do the encoding +/// sbs.encode(&mut shards).unwrap(); +/// +/// assert!(r.verify(&shards).unwrap()); +/// # } +/// ``` +#[derive(PartialEq, Debug)] +pub struct ShardByShard<'a, F: 'a + Field> { + codec: &'a ReedSolomon, + cur_input: usize, +} + +impl<'a, F: 'a + Field> ShardByShard<'a, F> { + /// Creates a new instance of the bookkeeping struct. + pub fn new(codec: &'a ReedSolomon) -> ShardByShard<'a, F> { + ShardByShard { + codec, + cur_input: 0, + } + } + + /// Checks if the parity shards are ready to use. + pub fn parity_ready(&self) -> bool { + self.cur_input == self.codec.data_shard_count + } + + /// Resets the bookkeeping data. + /// + /// You should call this when you have added and encoded + /// all data shards, and have finished using the parity shards. + /// + /// Returns `SBSError::LeftoverShards` when there are shards encoded + /// but parity shards are not ready to use. + pub fn reset(&mut self) -> Result<(), SBSError> { + if self.cur_input > 0 && !self.parity_ready() { + return Err(SBSError::LeftoverShards); + } + + self.cur_input = 0; + + Ok(()) + } + + /// Resets the bookkeeping data without checking. + pub fn reset_force(&mut self) { + self.cur_input = 0; + } + + /// Returns the current input shard index. + pub fn cur_input_index(&self) -> usize { + self.cur_input + } + + fn return_ok_and_incre_cur_input(&mut self) -> Result<(), SBSError> { + self.cur_input += 1; + Ok(()) + } + + fn sbs_encode_checks + AsMut<[F::Elem]>>( + &mut self, + slices: &mut [U], + ) -> Result<(), SBSError> { + let internal_checks = |codec: &ReedSolomon, data: &mut [U]| { + check_piece_count!(all => codec, data); + check_slices!(multi => data); + + Ok(()) + }; + + if self.parity_ready() { + return Err(SBSError::TooManyCalls); + } + + match internal_checks(self.codec, slices) { + Ok(()) => Ok(()), + Err(e) => Err(SBSError::RSError(e)), + } + } + + fn sbs_encode_sep_checks, U: AsRef<[F::Elem]> + AsMut<[F::Elem]>>( + &mut self, + data: &[T], + parity: &mut [U], + ) -> Result<(), SBSError> { + let internal_checks = |codec: &ReedSolomon, data: &[T], parity: &mut [U]| { + check_piece_count!(data => codec, data); + check_piece_count!(parity => codec, parity); + check_slices!(multi => data, multi => parity); + + Ok(()) + }; + + if self.parity_ready() { + return Err(SBSError::TooManyCalls); + } + + match internal_checks(self.codec, data, parity) { + Ok(()) => Ok(()), + Err(e) => Err(SBSError::RSError(e)), + } + } + + /// Constructs the parity shards partially using the current input data shard. + /// + /// Returns `SBSError::TooManyCalls` when all input data shards + /// have already been filled in via `encode` + pub fn encode(&mut self, mut shards: T) -> Result<(), SBSError> + where + T: AsRef<[U]> + AsMut<[U]>, + U: AsRef<[F::Elem]> + AsMut<[F::Elem]>, + { + let shards = shards.as_mut(); + self.sbs_encode_checks(shards)?; + + self.codec.encode_single(self.cur_input, shards).unwrap(); + + self.return_ok_and_incre_cur_input() + } + + /// Constructs the parity shards partially using the current input data shard. + /// + /// Returns `SBSError::TooManyCalls` when all input data shards + /// have already been filled in via `encode` + pub fn encode_sep, U: AsRef<[F::Elem]> + AsMut<[F::Elem]>>( + &mut self, + data: &[T], + parity: &mut [U], + ) -> Result<(), SBSError> { + self.sbs_encode_sep_checks(data, parity)?; + + self.codec + .encode_single_sep(self.cur_input, data[self.cur_input].as_ref(), parity) + .unwrap(); + + self.return_ok_and_incre_cur_input() + } +} + +/// Reed-Solomon erasure code encoder/decoder. +/// +/// # Common error handling +/// +/// ## For `encode`, `encode_shards`, `verify`, `verify_shards`, `reconstruct`, `reconstruct_data`, `reconstruct_shards`, `reconstruct_data_shards` +/// +/// Return `Error::TooFewShards` or `Error::TooManyShards` +/// when the number of provided shards +/// does not match the codec's one. +/// +/// Return `Error::EmptyShard` when the first shard provided is +/// of zero length. +/// +/// Return `Error::IncorrectShardSize` when the provided shards +/// are of different lengths. +/// +/// ## For `reconstruct`, `reconstruct_data`, `reconstruct_shards`, `reconstruct_data_shards` +/// +/// Return `Error::TooFewShardsPresent` when there are not +/// enough shards for reconstruction. +/// +/// Return `Error::InvalidShardFlags` when the number of flags does not match +/// the total number of shards. +/// +/// # Variants of encoding methods +/// +/// ## `sep` +/// +/// Methods ending in `_sep` takes an immutable reference to data shards, +/// and a mutable reference to parity shards. +/// +/// They are useful as they do not need to borrow the data shards mutably, +/// and other work that only needs read-only access to data shards can be done +/// in parallel/concurrently during the encoding. +/// +/// Following is a table of all the `sep` variants +/// +/// | not `sep` | `sep` | +/// | --- | --- | +/// | `encode_single` | `encode_single_sep` | +/// | `encode` | `encode_sep` | +/// +/// The `sep` variants do similar checks on the provided data shards and +/// parity shards. +/// +/// Return `Error::TooFewDataShards`, `Error::TooManyDataShards`, +/// `Error::TooFewParityShards`, or `Error::TooManyParityShards` when applicable. +/// +/// ## `single` +/// +/// Methods containing `single` facilitate shard by shard encoding, where +/// the parity shards are partially constructed using one data shard at a time. +/// See `ShardByShard` struct for more details on how shard by shard encoding +/// can be useful. +/// +/// They are prone to **misuse**, and it is recommended to use the `ShardByShard` +/// bookkeeping struct instead for shard by shard encoding. +/// +/// The ones that are also `sep` are **ESPECIALLY** prone to **misuse**. +/// Only use them when you actually need the flexibility. +/// +/// Following is a table of all the shard by shard variants +/// +/// | all shards at once | shard by shard | +/// | --- | --- | +/// | `encode` | `encode_single` | +/// | `encode_sep` | `encode_single_sep` | +/// +/// The `single` variants do similar checks on the provided data shards and parity shards, +/// and also do index check on `i_data`. +/// +/// Return `Error::InvalidIndex` if `i_data >= data_shard_count`. +/// +/// # Encoding behaviour +/// ## For `encode` +/// +/// You do not need to clear the parity shards beforehand, as the methods +/// will overwrite them completely. +/// +/// ## For `encode_single`, `encode_single_sep` +/// +/// Calling them with `i_data` being `0` will overwrite the parity shards +/// completely. If you are using the methods correctly, then you do not need +/// to clear the parity shards beforehand. +/// +/// # Variants of verifying methods +/// +/// `verify` allocate sa buffer on the heap of the same size +/// as the parity shards, and encode the input once using the buffer to store +/// the computed parity shards, then check if the provided parity shards +/// match the computed ones. +/// +/// `verify_with_buffer`, allows you to provide +/// the buffer to avoid making heap allocation(s) for the buffer in every call. +/// +/// The `with_buffer` variants also guarantee that the buffer contains the correct +/// parity shards if the result is `Ok(_)` (i.e. it does not matter whether the +/// verification passed or not, as long as the result is not an error, the buffer +/// will contain the correct parity shards after the call). +/// +/// Following is a table of all the `with_buffer` variants +/// +/// | not `with_buffer` | `with_buffer` | +/// | --- | --- | +/// | `verify` | `verify_with_buffer` | +/// +/// The `with_buffer` variants also check the dimensions of the buffer and return +/// `Error::TooFewBufferShards`, `Error::TooManyBufferShards`, `Error::EmptyShard`, +/// or `Error::IncorrectShardSize` when applicable. +/// +#[derive(Debug)] +pub struct ReedSolomon { + data_shard_count: usize, + parity_shard_count: usize, + total_shard_count: usize, + matrix: Matrix, + data_decode_matrix_cache: Mutex, Arc>>>, +} + +impl Clone for ReedSolomon { + fn clone(&self) -> ReedSolomon { + ReedSolomon::new(self.data_shard_count, self.parity_shard_count) + .expect("basic checks already passed as precondition of existence of self") + } +} + +impl PartialEq for ReedSolomon { + fn eq(&self, rhs: &ReedSolomon) -> bool { + self.data_shard_count == rhs.data_shard_count + && self.parity_shard_count == rhs.parity_shard_count + } +} + +impl ReedSolomon { + // AUDIT + // + // Error detection responsibilities + // + // Terminologies and symbols: + // X =A, B, C=> Y: X delegates error checking responsibilities A, B, C to Y + // X:= A, B, C: X needs to handle responsibilities A, B, C + // + // Encode methods + // + // `encode_single`:= + // - check index `i_data` within range [0, data shard count) + // - check length of `slices` matches total shard count exactly + // - check consistency of length of individual slices + // `encode_single_sep`:= + // - check index `i_data` within range [0, data shard count) + // - check length of `parity` matches parity shard count exactly + // - check consistency of length of individual parity slices + // - check length of `single_data` matches length of first parity slice + // `encode`:= + // - check length of `slices` matches total shard count exactly + // - check consistency of length of individual slices + // `encode_sep`:= + // - check length of `data` matches data shard count exactly + // - check length of `parity` matches parity shard count exactly + // - check consistency of length of individual data slices + // - check consistency of length of individual parity slices + // - check length of first parity slice matches length of first data slice + // + // Verify methods + // + // `verify`:= + // - check length of `slices` matches total shard count exactly + // - check consistency of length of individual slices + // + // Generates buffer then passes control to verify_with_buffer + // + // `verify_with_buffer`:= + // - check length of `slices` matches total shard count exactly + // - check length of `buffer` matches parity shard count exactly + // - check consistency of length of individual slices + // - check consistency of length of individual slices in buffer + // - check length of first slice in buffer matches length of first slice + // + // Reconstruct methods + // + // `reconstruct` =ALL=> `reconstruct_internal` + // `reconstruct_data`=ALL=> `reconstruct_internal` + // `reconstruct_internal`:= + // - check length of `slices` matches total shard count exactly + // - check consistency of length of individual slices + // - check length of `slice_present` matches length of `slices` + + fn get_parity_rows(&self) -> SmallVec<[&[F::Elem]; 32]> { + let mut parity_rows = SmallVec::with_capacity(self.parity_shard_count); + let matrix = &self.matrix; + for i in self.data_shard_count..self.total_shard_count { + parity_rows.push(matrix.get_row(i)); + } + + parity_rows + } + + fn build_matrix(data_shards: usize, total_shards: usize) -> Matrix { + let vandermonde = Matrix::vandermonde(total_shards, data_shards); + + let top = vandermonde.sub_matrix(0, 0, data_shards, data_shards); + + vandermonde.multiply(&top.invert().unwrap()) + } + + /// Creates a new instance of Reed-Solomon erasure code encoder/decoder. + /// + /// Returns `Error::TooFewDataShards` if `data_shards == 0`. + /// + /// Returns `Error::TooFewParityShards` if `parity_shards == 0`. + /// + /// Returns `Error::TooManyShards` if `data_shards + parity_shards > F::ORDER`. + pub fn new(data_shards: usize, parity_shards: usize) -> Result, Error> { + if data_shards == 0 { + return Err(Error::TooFewDataShards); + } + if parity_shards == 0 { + return Err(Error::TooFewParityShards); + } + if data_shards + parity_shards > F::ORDER { + return Err(Error::TooManyShards); + } + + let total_shards = data_shards + parity_shards; + + let matrix = Self::build_matrix(data_shards, total_shards); + + Ok(ReedSolomon { + data_shard_count: data_shards, + parity_shard_count: parity_shards, + total_shard_count: total_shards, + matrix, + data_decode_matrix_cache: Mutex::new(LruCache::new( + NonZeroUsize::new(DATA_DECODE_MATRIX_CACHE_CAPACITY).unwrap(), + )), + }) + } + + pub fn data_shard_count(&self) -> usize { + self.data_shard_count + } + + pub fn parity_shard_count(&self) -> usize { + self.parity_shard_count + } + + pub fn total_shard_count(&self) -> usize { + self.total_shard_count + } + + fn code_some_slices, U: AsMut<[F::Elem]>>( + &self, + matrix_rows: &[&[F::Elem]], + inputs: &[T], + outputs: &mut [U], + ) { + for i_input in 0..self.data_shard_count { + self.code_single_slice(matrix_rows, i_input, inputs[i_input].as_ref(), outputs); + } + } + + fn code_single_slice>( + &self, + matrix_rows: &[&[F::Elem]], + i_input: usize, + input: &[F::Elem], + outputs: &mut [U], + ) { + outputs.iter_mut().enumerate().for_each(|(i_row, output)| { + let matrix_row_to_use = matrix_rows[i_row][i_input]; + let output = output.as_mut(); + + if i_input == 0 { + F::mul_slice(matrix_row_to_use, input, output); + } else { + F::mul_slice_add(matrix_row_to_use, input, output); + } + }) + } + + fn check_some_slices_with_buffer( + &self, + matrix_rows: &[&[F::Elem]], + inputs: &[T], + to_check: &[T], + buffer: &mut [U], + ) -> bool + where + T: AsRef<[F::Elem]>, + U: AsRef<[F::Elem]> + AsMut<[F::Elem]>, + { + self.code_some_slices(matrix_rows, inputs, buffer); + + let at_least_one_mismatch_present = buffer + .iter_mut() + .enumerate() + .map(|(i, expected_parity_shard)| { + expected_parity_shard.as_ref() == to_check[i].as_ref() + }) + .any(|x| !x); // find the first false (some slice is different from the expected one) + !at_least_one_mismatch_present + } + + /// Constructs the parity shards partially using only the data shard + /// indexed by `i_data`. + /// + /// The slots where the parity shards sit at will be overwritten. + /// + /// # Warning + /// + /// You must apply this method on the data shards in strict sequential order (0..data shard count), + /// otherwise the parity shards will be incorrect. + /// + /// It is recommended to use the `ShardByShard` bookkeeping struct instead of this method directly. + pub fn encode_single(&self, i_data: usize, mut shards: T) -> Result<(), Error> + where + T: AsRef<[U]> + AsMut<[U]>, + U: AsRef<[F::Elem]> + AsMut<[F::Elem]>, + { + let slices = shards.as_mut(); + + check_slice_index!(data => self, i_data); + check_piece_count!(all=> self, slices); + check_slices!(multi => slices); + + // Get the slice of output buffers. + let (mut_input, output) = slices.split_at_mut(self.data_shard_count); + + let input = mut_input[i_data].as_ref(); + + self.encode_single_sep(i_data, input, output) + } + + /// Constructs the parity shards partially using only the data shard provided. + /// + /// The data shard must match the index `i_data`. + /// + /// The slots where the parity shards sit at will be overwritten. + /// + /// # Warning + /// + /// You must apply this method on the data shards in strict sequential order (0..data shard count), + /// otherwise the parity shards will be incorrect. + /// + /// It is recommended to use the `ShardByShard` bookkeeping struct instead of this method directly. + pub fn encode_single_sep + AsMut<[F::Elem]>>( + &self, + i_data: usize, + single_data: &[F::Elem], + parity: &mut [U], + ) -> Result<(), Error> { + check_slice_index!(data => self, i_data); + check_piece_count!(parity => self, parity); + check_slices!(multi => parity, single => single_data); + + let parity_rows = self.get_parity_rows(); + + // Do the coding. + self.code_single_slice(&parity_rows, i_data, single_data, parity); + + Ok(()) + } + + /// Constructs the parity shards. + /// + /// The slots where the parity shards sit at will be overwritten. + pub fn encode(&self, mut shards: T) -> Result<(), Error> + where + T: AsRef<[U]> + AsMut<[U]>, + U: AsRef<[F::Elem]> + AsMut<[F::Elem]>, + { + let slices: &mut [U] = shards.as_mut(); + + check_piece_count!(all => self, slices); + check_slices!(multi => slices); + + // Get the slice of output buffers. + let (input, output) = slices.split_at_mut(self.data_shard_count); + + self.encode_sep(&*input, output) + } + + /// Constructs the parity shards using a read-only view into the + /// data shards. + /// + /// The slots where the parity shards sit at will be overwritten. + pub fn encode_sep, U: AsRef<[F::Elem]> + AsMut<[F::Elem]>>( + &self, + data: &[T], + parity: &mut [U], + ) -> Result<(), Error> { + check_piece_count!(data => self, data); + check_piece_count!(parity => self, parity); + check_slices!(multi => data, multi => parity); + + let parity_rows = self.get_parity_rows(); + + // Do the coding. + self.code_some_slices(&parity_rows, data, parity); + + Ok(()) + } + + /// Checks if the parity shards are correct. + /// + /// This is a wrapper of `verify_with_buffer`. + pub fn verify>(&self, slices: &[T]) -> Result { + check_piece_count!(all => self, slices); + check_slices!(multi => slices); + + let slice_len = slices[0].as_ref().len(); + + let mut buffer: SmallVec<[Vec; 32]> = + SmallVec::with_capacity(self.parity_shard_count); + + for _ in 0..self.parity_shard_count { + buffer.push(vec![F::zero(); slice_len]); + } + + self.verify_with_buffer(slices, &mut buffer) + } + + /// Checks if the parity shards are correct. + pub fn verify_with_buffer(&self, slices: &[T], buffer: &mut [U]) -> Result + where + T: AsRef<[F::Elem]>, + U: AsRef<[F::Elem]> + AsMut<[F::Elem]>, + { + check_piece_count!(all => self, slices); + check_piece_count!(parity_buf => self, buffer); + check_slices!(multi => slices, multi => buffer); + + let data = &slices[0..self.data_shard_count]; + let to_check = &slices[self.data_shard_count..]; + + let parity_rows = self.get_parity_rows(); + + Ok(self.check_some_slices_with_buffer(&parity_rows, data, to_check, buffer)) + } + + /// Reconstructs all shards. + /// + /// The shards marked not present are only overwritten when no error + /// is detected. All provided shards must have the same length. + /// + /// This means if the method returns an `Error`, then nothing is touched. + /// + /// `reconstruct`, `reconstruct_data`, `reconstruct_shards`, + /// `reconstruct_data_shards` share the same core code base. + pub fn reconstruct>(&self, slices: &mut [T]) -> Result<(), Error> { + self.reconstruct_internal(slices, false) + } + + /// Reconstructs only the data shards. + /// + /// The shards marked not present are only overwritten when no error + /// is detected. All provided shards must have the same length. + /// + /// This means if the method returns an `Error`, then nothing is touched. + /// + /// `reconstruct`, `reconstruct_data`, `reconstruct_shards`, + /// `reconstruct_data_shards` share the same core code base. + pub fn reconstruct_data>(&self, slices: &mut [T]) -> Result<(), Error> { + self.reconstruct_internal(slices, true) + } + + fn get_data_decode_matrix( + &self, + valid_indices: &[usize], + invalid_indices: &[usize], + ) -> Arc> { + { + let mut cache = self.data_decode_matrix_cache.lock(); + if let Some(entry) = cache.get(invalid_indices) { + return entry.clone(); + } + } + // Pull out the rows of the matrix that correspond to the shards that + // we have and build a square matrix. This matrix could be used to + // generate the shards that we have from the original data. + let mut sub_matrix = Matrix::new(self.data_shard_count, self.data_shard_count); + for (sub_matrix_row, &valid_index) in valid_indices.iter().enumerate() { + for c in 0..self.data_shard_count { + sub_matrix.set(sub_matrix_row, c, self.matrix.get(valid_index, c)); + } + } + // Invert the matrix, so we can go from the encoded shards back to the + // original data. Then pull out the row that generates the shard that + // we want to decode. Note that since this matrix maps back to the + // original data, it can be used to create a data shard, but not a + // parity shard. + let data_decode_matrix = Arc::new(sub_matrix.invert().unwrap()); + // Cache the inverted matrix for future use keyed on the indices of the + // invalid rows. + { + let data_decode_matrix = data_decode_matrix.clone(); + let mut cache = self.data_decode_matrix_cache.lock(); + cache.put(Vec::from(invalid_indices), data_decode_matrix); + } + data_decode_matrix + } + + fn reconstruct_internal>( + &self, + shards: &mut [T], + data_only: bool, + ) -> Result<(), Error> { + check_piece_count!(all => self, shards); + + let data_shard_count = self.data_shard_count; + + // Quick check: are all of the shards present? If so, there's + // nothing to do. + let mut number_present = 0; + let mut shard_len = None; + + for shard in shards.iter_mut() { + if let Some(len) = shard.len() { + if len == 0 { + return Err(Error::EmptyShard); + } + number_present += 1; + if let Some(old_len) = shard_len { + if len != old_len { + // mismatch between shards. + return Err(Error::IncorrectShardSize); + } + } + shard_len = Some(len); + } + } + + if number_present == self.total_shard_count { + // Cool. All of the shards are there. We don't + // need to do anything. + return Ok(()); + } + + // More complete sanity check + if number_present < data_shard_count { + return Err(Error::TooFewShardsPresent); + } + + let shard_len = shard_len.expect("at least one shard present; qed"); + + // Pull out an array holding just the shards that + // correspond to the rows of the submatrix. These shards + // will be the input to the decoding process that re-creates + // the missing data shards. + // + // Also, create an array of indices of the valid rows we do have + // and the invalid rows we don't have. + // + // The valid indices are used to construct the data decode matrix, + // the invalid indices are used to key the data decode matrix + // in the data decode matrix cache. + // + // We only need exactly N valid indices, where N = `data_shard_count`, + // as the data decode matrix is a N x N matrix, thus only needs + // N valid indices for determining the N rows to pick from + // `self.matrix`. + let mut sub_shards: SmallVec<[&[F::Elem]; 32]> = SmallVec::with_capacity(data_shard_count); + let mut missing_data_slices: SmallVec<[&mut [F::Elem]; 32]> = + SmallVec::with_capacity(self.parity_shard_count); + let mut missing_parity_slices: SmallVec<[&mut [F::Elem]; 32]> = + SmallVec::with_capacity(self.parity_shard_count); + let mut valid_indices: SmallVec<[usize; 32]> = SmallVec::with_capacity(data_shard_count); + let mut invalid_indices: SmallVec<[usize; 32]> = SmallVec::with_capacity(data_shard_count); + + // Separate the shards into groups + for (matrix_row, shard) in shards.iter_mut().enumerate() { + // get or initialize the shard so we can reconstruct in-place, + // but if we are only reconstructing data shard, + // do not initialize if the shard is not a data shard + let shard_data = if matrix_row >= data_shard_count && data_only { + shard.get().ok_or(None) + } else { + shard.get_or_initialize(shard_len).map_err(Some) + }; + + match shard_data { + Ok(shard) => { + if sub_shards.len() < data_shard_count { + sub_shards.push(shard); + valid_indices.push(matrix_row); + } else { + // Already have enough shards in `sub_shards` + // as we only need N shards, where N = `data_shard_count`, + // for the data decode matrix + // + // So nothing to do here + } + } + Err(None) => { + // the shard data is not meant to be initialized here, + // but we should still note it missing. + invalid_indices.push(matrix_row); + } + Err(Some(x)) => { + // initialized missing shard data. + let shard = x?; + if matrix_row < data_shard_count { + missing_data_slices.push(shard); + } else { + missing_parity_slices.push(shard); + } + + invalid_indices.push(matrix_row); + } + } + } + + let data_decode_matrix = self.get_data_decode_matrix(&valid_indices, &invalid_indices); + + // Re-create any data shards that were missing. + // + // The input to the coding is all of the shards we actually + // have, and the output is the missing data shards. The computation + // is done using the special decode matrix we just built. + let mut matrix_rows: SmallVec<[&[F::Elem]; 32]> = + SmallVec::with_capacity(self.parity_shard_count); + + for i_slice in invalid_indices + .iter() + .cloned() + .take_while(|i| i < &data_shard_count) + { + matrix_rows.push(data_decode_matrix.get_row(i_slice)); + } + + self.code_some_slices(&matrix_rows, &sub_shards, &mut missing_data_slices); + + if data_only { + Ok(()) + } else { + // Now that we have all of the data shards intact, we can + // compute any of the parity that is missing. + // + // The input to the coding is ALL of the data shards, including + // any that we just calculated. The output is whichever of the + // parity shards were missing. + let mut matrix_rows: SmallVec<[&[F::Elem]; 32]> = + SmallVec::with_capacity(self.parity_shard_count); + let parity_rows = self.get_parity_rows(); + + for i_slice in invalid_indices + .iter() + .cloned() + .skip_while(|i| i < &data_shard_count) + { + matrix_rows.push(parity_rows[i_slice - data_shard_count]); + } + { + // Gather up all the data shards. + // old data shards are in `sub_shards`, + // new ones are in `missing_data_slices`. + let mut i_old_data_slice = 0; + let mut i_new_data_slice = 0; + + let mut all_data_slices: SmallVec<[&[F::Elem]; 32]> = + SmallVec::with_capacity(data_shard_count); + + let mut next_maybe_good = 0; + let mut push_good_up_to = move |data_slices: &mut SmallVec<_>, up_to| { + // if next_maybe_good == up_to, this loop is a no-op. + for _ in next_maybe_good..up_to { + // push all good indices we just skipped. + data_slices.push(sub_shards[i_old_data_slice]); + i_old_data_slice += 1; + } + + next_maybe_good = up_to + 1; + }; + + for i_slice in invalid_indices + .iter() + .cloned() + .take_while(|i| i < &data_shard_count) + { + push_good_up_to(&mut all_data_slices, i_slice); + all_data_slices.push(missing_data_slices[i_new_data_slice]); + i_new_data_slice += 1; + } + push_good_up_to(&mut all_data_slices, data_shard_count); + + // Now do the actual computation for the missing + // parity shards + self.code_some_slices(&matrix_rows, &all_data_slices, &mut missing_parity_slices); + } + + Ok(()) + } + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/errors.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/errors.rs new file mode 100644 index 000000000..761343685 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/errors.rs @@ -0,0 +1,158 @@ +use core::fmt::Formatter; + +#[derive(PartialEq, Debug, Clone, Copy)] +pub enum Error { + TooFewShards, + TooManyShards, + TooFewDataShards, + TooManyDataShards, + TooFewParityShards, + TooManyParityShards, + TooFewBufferShards, + TooManyBufferShards, + IncorrectShardSize, + TooFewShardsPresent, + EmptyShard, + InvalidShardFlags, + InvalidIndex, +} + +impl Error { + fn to_string(&self) -> &str { + match *self { + Error::TooFewShards=> "The number of provided shards is smaller than the one in codec", + Error::TooManyShards => "The number of provided shards is greater than the one in codec", + Error::TooFewDataShards => "The number of provided data shards is smaller than the one in codec", + Error::TooManyDataShards => "The number of provided data shards is greater than the one in codec", + Error::TooFewParityShards => "The number of provided parity shards is smaller than the one in codec", + Error::TooManyParityShards => "The number of provided parity shards is greater than the one in codec", + Error::TooFewBufferShards => "The number of provided buffer shards is smaller than the number of parity shards in codec", + Error::TooManyBufferShards => "The number of provided buffer shards is greater than the number of parity shards in codec", + Error::IncorrectShardSize => "At least one of the provided shards is not of the correct size", + Error::TooFewShardsPresent => "The number of shards present is smaller than number of parity shards, cannot reconstruct missing shards", + Error::EmptyShard => "The first shard provided is of zero length", + Error::InvalidShardFlags => "The number of flags does not match the total number of shards", + Error::InvalidIndex => "The data shard index provided is greater or equal to the number of data shards in codec", + } + } +} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut Formatter) -> Result<(), core::fmt::Error> { + write!(f, "{}", self.to_string()) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error { + fn description(&self) -> &str { + self.to_string() + } +} + +#[derive(PartialEq, Debug, Clone, Copy)] +pub enum SBSError { + TooManyCalls, + LeftoverShards, + RSError(Error), +} + +impl SBSError { + fn to_string(&self) -> &str { + match *self { + SBSError::TooManyCalls => "Too many calls", + SBSError::LeftoverShards => "Leftover shards", + SBSError::RSError(ref e) => e.to_string(), + } + } +} + +impl core::fmt::Display for SBSError { + fn fmt(&self, f: &mut Formatter) -> Result<(), core::fmt::Error> { + write!(f, "{}", self.to_string()) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SBSError { + fn description(&self) -> &str { + self.to_string() + } +} + +#[cfg(test)] +mod tests { + use crate::errors::Error; + use crate::errors::SBSError; + + #[test] + fn test_error_to_string_is_okay() { + assert_eq!( + Error::TooFewShards.to_string(), + "The number of provided shards is smaller than the one in codec" + ); + assert_eq!( + Error::TooManyShards.to_string(), + "The number of provided shards is greater than the one in codec" + ); + assert_eq!( + Error::TooFewDataShards.to_string(), + "The number of provided data shards is smaller than the one in codec" + ); + assert_eq!( + Error::TooManyDataShards.to_string(), + "The number of provided data shards is greater than the one in codec" + ); + assert_eq!( + Error::TooFewParityShards.to_string(), + "The number of provided parity shards is smaller than the one in codec" + ); + assert_eq!( + Error::TooManyParityShards.to_string(), + "The number of provided parity shards is greater than the one in codec" + ); + assert_eq!( + Error::TooFewBufferShards.to_string(), + "The number of provided buffer shards is smaller than the number of parity shards in codec" + ); + assert_eq!( + Error::TooManyBufferShards.to_string(), + "The number of provided buffer shards is greater than the number of parity shards in codec" + ); + assert_eq!( + Error::IncorrectShardSize.to_string(), + "At least one of the provided shards is not of the correct size" + ); + assert_eq!(Error::TooFewShardsPresent.to_string(), "The number of shards present is smaller than number of parity shards, cannot reconstruct missing shards"); + assert_eq!( + Error::EmptyShard.to_string(), + "The first shard provided is of zero length" + ); + assert_eq!( + Error::InvalidShardFlags.to_string(), + "The number of flags does not match the total number of shards" + ); + assert_eq!( + Error::InvalidIndex.to_string(), + "The data shard index provided is greater or equal to the number of data shards in codec" + ); + } + + #[test] + fn test_sbserror_to_string_is_okay() { + assert_eq!(SBSError::TooManyCalls.to_string(), "Too many calls"); + assert_eq!(SBSError::LeftoverShards.to_string(), "Leftover shards"); + } + + #[cfg(feature = "std")] + #[test] + fn test_error_display_does_not_panic() { + println!("{}", Error::TooFewShards); + } + + #[cfg(feature = "std")] + #[test] + fn test_sbserror_display_does_not_panic() { + println!("{}", SBSError::TooManyCalls); + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/galois_16.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/galois_16.rs new file mode 100644 index 000000000..500ac8d2a --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/galois_16.rs @@ -0,0 +1,412 @@ +//! GF(2^16) implementation. +//! +//! More accurately, this is a `GF((2^8)^2)` implementation which builds an extension +//! field of `GF(2^8)`, as defined in the `galois_8` module. + +use crate::galois_8; +use core::ops::{Add, Div, Mul, Sub}; + +// the irreducible polynomial used as a modulus for the field. +// print R.irreducible_element(2,algorithm="first_lexicographic" ) +// x^2 + a*x + a^7 +// +// hopefully it is a fast polynomial +const EXT_POLY: [u8; 3] = [1, 2, 128]; + +/// The field GF(2^16). +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +pub struct Field; + +impl crate::Field for Field { + const ORDER: usize = 65536; + + type Elem = [u8; 2]; + + fn add(a: [u8; 2], b: [u8; 2]) -> [u8; 2] { + (Element(a) + Element(b)).0 + } + + fn mul(a: [u8; 2], b: [u8; 2]) -> [u8; 2] { + (Element(a) * Element(b)).0 + } + + fn div(a: [u8; 2], b: [u8; 2]) -> [u8; 2] { + (Element(a) / Element(b)).0 + } + + fn exp(elem: [u8; 2], n: usize) -> [u8; 2] { + Element(elem).exp(n).0 + } + + fn zero() -> [u8; 2] { + [0; 2] + } + + fn one() -> [u8; 2] { + [0, 1] + } + + fn nth_internal(n: usize) -> [u8; 2] { + [(n >> 8) as u8, n as u8] + } +} + +/// Type alias of ReedSolomon over GF(2^8). +pub type ReedSolomon = crate::ReedSolomon; + +/// Type alias of ShardByShard over GF(2^8). +pub type ShardByShard<'a> = crate::ShardByShard<'a, Field>; + +/// An element of `GF(2^16)`. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +struct Element(pub [u8; 2]); + +impl Element { + // Create the zero element. + fn zero() -> Self { + Element([0, 0]) + } + + // A constant element evaluating to `n`. + fn constant(n: u8) -> Element { + Element([0, n]) + } + + // Whether this is the zero element. + fn is_zero(&self) -> bool { + self.0 == [0; 2] + } + + fn exp(mut self, n: usize) -> Element { + if n == 0 { + Element::constant(1) + } else if self == Element::zero() { + Element::zero() + } else { + let x = self; + for _ in 1..n { + self = self * x; + } + + self + } + } + + // reduces from some polynomial with degree <= 2. + #[inline] + fn reduce_from(mut x: [u8; 3]) -> Self { + if x[0] != 0 { + // divide x by EXT_POLY and use remainder. + // i = 0 here. + // c*x^(i+j) = a*x^i*b*x^j + x[1] ^= galois_8::mul(EXT_POLY[1], x[0]); + x[2] ^= galois_8::mul(EXT_POLY[2], x[0]); + } + + Element([x[1], x[2]]) + } + + fn degree(&self) -> usize { + if self.0[0] != 0 { + 1 + } else { + 0 + } + } +} + +impl From<[u8; 2]> for Element { + fn from(c: [u8; 2]) -> Self { + Element(c) + } +} + +impl Default for Element { + fn default() -> Self { + Element::zero() + } +} + +impl Add for Element { + type Output = Element; + + fn add(self, other: Self) -> Element { + Element([self.0[0] ^ other.0[0], self.0[1] ^ other.0[1]]) + } +} + +impl Sub for Element { + type Output = Element; + + fn sub(self, other: Self) -> Element { + self.add(other) + } +} + +impl Mul for Element { + type Output = Element; + + fn mul(self, rhs: Self) -> Element { + // FOIL; our elements are linear at most, with two coefficients + let out: [u8; 3] = [ + galois_8::mul(self.0[0], rhs.0[0]), + galois_8::add( + galois_8::mul(self.0[1], rhs.0[0]), + galois_8::mul(self.0[0], rhs.0[1]), + ), + galois_8::mul(self.0[1], rhs.0[1]), + ]; + + Element::reduce_from(out) + } +} + +impl Mul for Element { + type Output = Element; + + fn mul(self, rhs: u8) -> Element { + Element([galois_8::mul(rhs, self.0[0]), galois_8::mul(rhs, self.0[1])]) + } +} + +impl Div for Element { + type Output = Element; + + fn div(self, rhs: Self) -> Element { + self * rhs.inverse() + } +} + +// helpers for division. + +#[derive(Debug)] +enum EgcdRhs { + Element(Element), + ExtPoly, +} + +impl Element { + // compute extended euclidean algorithm against an element of self, + // where the GCD is known to be constant. + fn const_egcd(self, rhs: EgcdRhs) -> (u8, Element, Element) { + if self.is_zero() { + let rhs = match rhs { + EgcdRhs::Element(elem) => elem, + EgcdRhs::ExtPoly => panic!("const_egcd invoked with divisible"), + }; + (rhs.0[1], Element::constant(0), Element::constant(1)) + } else { + let (cur_quotient, cur_remainder) = match rhs { + EgcdRhs::Element(rhs) => rhs.polynom_div(self), + EgcdRhs::ExtPoly => Element::div_ext_by(self), + }; + + // GCD is constant because EXT_POLY is irreducible + let (g, x, y) = cur_remainder.const_egcd(EgcdRhs::Element(self)); + (g, y + (cur_quotient * x), x) + } + } + + // divide EXT_POLY by self. + fn div_ext_by(rhs: Self) -> (Element, Element) { + if rhs.degree() == 0 { + // dividing by constant is the same as multiplying by another constant. + // and all constant multiples of EXT_POLY are in the equivalence class + // of 0. + return (Element::zero(), Element::zero()); + } + + // divisor is ensured linear here. + // now ensure divisor is monic. + let leading_mul_inv = galois_8::div(1, rhs.0[0]); + + let monictized = rhs * leading_mul_inv; + let mut poly = EXT_POLY; + + for i in 0..2 { + let coef = poly[i]; + for j in 1..2 { + if rhs.0[j] != 0 { + poly[i + j] ^= galois_8::mul(monictized.0[j], coef); + } + } + } + + let remainder = Element::constant(poly[2]); + let quotient = Element([poly[0], poly[1]]) * leading_mul_inv; + + (quotient, remainder) + } + + fn polynom_div(self, rhs: Self) -> (Element, Element) { + let divisor_degree = rhs.degree(); + if rhs.is_zero() { + panic!("divide by 0"); + } else if self.degree() < divisor_degree { + // If divisor's degree (len-1) is bigger, all dividend is a remainder + (Element::zero(), self) + } else if divisor_degree == 0 { + // divide by constant. + let invert = galois_8::div(1, rhs.0[1]); + let quotient = Element([ + galois_8::mul(invert, self.0[0]), + galois_8::mul(invert, self.0[1]), + ]); + + (quotient, Element::zero()) + } else { + // self degree is at least divisor degree, divisor degree not 0. + // therefore both are 1. + debug_assert_eq!(self.degree(), divisor_degree); + debug_assert_eq!(self.degree(), 1); + + // ensure rhs is constant. + let leading_mul_inv = galois_8::div(1, rhs.0[0]); + let monic = Element([ + galois_8::mul(leading_mul_inv, rhs.0[0]), + galois_8::mul(leading_mul_inv, rhs.0[1]), + ]); + + let leading_coeff = self.0[0]; + let mut remainder = self.0[1]; + + if monic.0[1] != 0 { + remainder ^= galois_8::mul(monic.0[1], self.0[0]); + } + + ( + Element::constant(galois_8::mul(leading_mul_inv, leading_coeff)), + Element::constant(remainder), + ) + } + } + + /// Convert the inverse of this field element. Panics if zero. + fn inverse(self) -> Element { + if self.is_zero() { + panic!("Cannot invert 0"); + } + + // first step of extended euclidean algorithm. + // done here because EXT_POLY is outside the scope of `Element`. + let (gcd, y) = { + // self / EXT_POLY = (0, self) + let remainder = self; + + // GCD is constant because EXT_POLY is irreducible + let (g, x, _) = remainder.const_egcd(EgcdRhs::ExtPoly); + + (g, x) + }; + + // we still need to normalize it by dividing by the gcd + if gcd != 0 { + // EXT_POLY is irreducible so the GCD will always be constant. + // EXT_POLY*x + self*y = gcd + // self*y = gcd - EXT_POLY*x + // + // EXT_POLY*x is representative of the equivalence class of 0. + let normalizer = galois_8::div(1, gcd); + y * normalizer + } else { + // self is equivalent to zero. + panic!("Cannot invert 0"); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use quickcheck::Arbitrary; + + impl Arbitrary for Element { + fn arbitrary(gen: &mut G) -> Self { + let a = u8::arbitrary(gen); + let b = u8::arbitrary(gen); + + Element([a, b]) + } + } + + quickcheck! { + fn qc_add_associativity(a: Element, b: Element, c: Element) -> bool { + a + (b + c) == (a + b) + c + } + + fn qc_mul_associativity(a: Element, b: Element, c: Element) -> bool { + a * (b * c) == (a * b) * c + } + + fn qc_additive_identity(a: Element) -> bool { + let zero = Element::zero(); + a - (zero - a) == zero + } + + fn qc_multiplicative_identity(a: Element) -> bool { + a.is_zero() || { + let one = Element([0, 1]); + (one / a) * a == one + } + } + + fn qc_add_commutativity(a: Element, b: Element) -> bool { + a + b == b + a + } + + fn qc_mul_commutativity(a: Element, b: Element) -> bool { + a * b == b * a + } + + fn qc_add_distributivity(a: Element, b: Element, c: Element) -> bool { + a * (b + c) == (a * b) + (a * c) + } + + fn qc_inverse(a: Element) -> bool { + a.is_zero() || { + let inv = a.inverse(); + a * inv == Element::constant(1) + } + } + + fn qc_exponent_1(a: Element, n: u8) -> bool { + a.is_zero() || n == 0 || { + let mut b = a.exp(n as usize); + for _ in 1..n { + b = b / a; + } + + a == b + } + } + + fn qc_exponent_2(a: Element, n: u8) -> bool { + a.is_zero() || { + let mut res = true; + let mut b = Element::constant(1); + + for i in 0..n { + res = res && b == a.exp(i as usize); + b = b * a; + } + + res + } + } + + fn qc_exp_zero_is_one(a: Element) -> bool { + a.exp(0) == Element::constant(1) + } + } + + #[test] + #[should_panic] + fn test_div_b_is_0() { + let _ = Element([1, 0]) / Element::zero(); + } + + #[test] + fn zero_to_zero_is_one() { + assert_eq!(Element::zero().exp(0), Element::constant(1)) + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/galois_8.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/galois_8.rs new file mode 100644 index 000000000..01adc09d9 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/galois_8.rs @@ -0,0 +1,621 @@ +//! Implementation of GF(2^8): the finite field with 2^8 elements. + +include!(concat!(env!("OUT_DIR"), "/table.rs")); + +/// The field GF(2^8). +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +pub struct Field; + +impl crate::Field for Field { + const ORDER: usize = 256; + type Elem = u8; + + fn add(a: u8, b: u8) -> u8 { + add(a, b) + } + + fn mul(a: u8, b: u8) -> u8 { + mul(a, b) + } + + fn div(a: u8, b: u8) -> u8 { + div(a, b) + } + + fn exp(elem: u8, n: usize) -> u8 { + exp(elem, n) + } + + fn zero() -> u8 { + 0 + } + + fn one() -> u8 { + 1 + } + + fn nth_internal(n: usize) -> u8 { + n as u8 + } + + fn mul_slice(c: u8, input: &[u8], out: &mut [u8]) { + mul_slice(c, input, out) + } + + fn mul_slice_add(c: u8, input: &[u8], out: &mut [u8]) { + mul_slice_xor(c, input, out) + } +} + +/// Type alias of ReedSolomon over GF(2^8). +pub type ReedSolomon = crate::ReedSolomon; + +/// Type alias of ShardByShard over GF(2^8). +pub type ShardByShard<'a> = crate::ShardByShard<'a, Field>; + +/// Add two elements. +pub fn add(a: u8, b: u8) -> u8 { + a ^ b +} + +/// Subtract `b` from `a`. +#[cfg(test)] +pub fn sub(a: u8, b: u8) -> u8 { + a ^ b +} + +/// Multiply two elements. +pub fn mul(a: u8, b: u8) -> u8 { + MUL_TABLE[a as usize][b as usize] +} + +/// Divide one element by another. `b`, the divisor, may not be 0. +pub fn div(a: u8, b: u8) -> u8 { + if a == 0 { + 0 + } else if b == 0 { + panic!("Divisor is 0") + } else { + let log_a = LOG_TABLE[a as usize]; + let log_b = LOG_TABLE[b as usize]; + let mut log_result = log_a as isize - log_b as isize; + if log_result < 0 { + log_result += 255; + } + EXP_TABLE[log_result as usize] + } +} + +/// Compute a^n. +pub fn exp(a: u8, n: usize) -> u8 { + if n == 0 { + 1 + } else if a == 0 { + 0 + } else { + let log_a = LOG_TABLE[a as usize]; + let mut log_result = log_a as usize * n; + while 255 <= log_result { + log_result -= 255; + } + EXP_TABLE[log_result] + } +} + +const PURE_RUST_UNROLL: isize = 4; + +macro_rules! return_if_empty { + ( + $len:expr + ) => { + if $len == 0 { + return; + } + }; +} + +#[cfg(not(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +)))] +pub fn mul_slice(c: u8, input: &[u8], out: &mut [u8]) { + mul_slice_pure_rust(c, input, out); +} + +#[cfg(not(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +)))] +pub fn mul_slice_xor(c: u8, input: &[u8], out: &mut [u8]) { + mul_slice_xor_pure_rust(c, input, out); +} + +fn mul_slice_pure_rust(c: u8, input: &[u8], out: &mut [u8]) { + let mt = &MUL_TABLE[c as usize]; + let mt_ptr: *const u8 = &mt[0]; + + assert_eq!(input.len(), out.len()); + + let len: isize = input.len() as isize; + return_if_empty!(len); + + let mut input_ptr: *const u8 = &input[0]; + let mut out_ptr: *mut u8 = &mut out[0]; + + let mut n: isize = 0; + unsafe { + assert_eq!(4, PURE_RUST_UNROLL); + if len > PURE_RUST_UNROLL { + let len_minus_unroll = len - PURE_RUST_UNROLL; + while n < len_minus_unroll { + *out_ptr = *mt_ptr.offset(*input_ptr as isize); + *out_ptr.offset(1) = *mt_ptr.offset(*input_ptr.offset(1) as isize); + *out_ptr.offset(2) = *mt_ptr.offset(*input_ptr.offset(2) as isize); + *out_ptr.offset(3) = *mt_ptr.offset(*input_ptr.offset(3) as isize); + + input_ptr = input_ptr.offset(PURE_RUST_UNROLL); + out_ptr = out_ptr.offset(PURE_RUST_UNROLL); + n += PURE_RUST_UNROLL; + } + } + while n < len { + *out_ptr = *mt_ptr.offset(*input_ptr as isize); + + input_ptr = input_ptr.offset(1); + out_ptr = out_ptr.offset(1); + n += 1; + } + } + /* for n in 0..input.len() { + * out[n] = mt[input[n] as usize] + * } + */ +} + +fn mul_slice_xor_pure_rust(c: u8, input: &[u8], out: &mut [u8]) { + let mt = &MUL_TABLE[c as usize]; + let mt_ptr: *const u8 = &mt[0]; + + assert_eq!(input.len(), out.len()); + + let len: isize = input.len() as isize; + return_if_empty!(len); + + let mut input_ptr: *const u8 = &input[0]; + let mut out_ptr: *mut u8 = &mut out[0]; + + let mut n: isize = 0; + unsafe { + assert_eq!(4, PURE_RUST_UNROLL); + if len > PURE_RUST_UNROLL { + let len_minus_unroll = len - PURE_RUST_UNROLL; + while n < len_minus_unroll { + *out_ptr ^= *mt_ptr.offset(*input_ptr as isize); + *out_ptr.offset(1) ^= *mt_ptr.offset(*input_ptr.offset(1) as isize); + *out_ptr.offset(2) ^= *mt_ptr.offset(*input_ptr.offset(2) as isize); + *out_ptr.offset(3) ^= *mt_ptr.offset(*input_ptr.offset(3) as isize); + + input_ptr = input_ptr.offset(PURE_RUST_UNROLL); + out_ptr = out_ptr.offset(PURE_RUST_UNROLL); + n += PURE_RUST_UNROLL; + } + } + while n < len { + *out_ptr ^= *mt_ptr.offset(*input_ptr as isize); + + input_ptr = input_ptr.offset(1); + out_ptr = out_ptr.offset(1); + n += 1; + } + } + /* for n in 0..input.len() { + * out[n] ^= mt[input[n] as usize]; + * } + */ +} + +#[cfg(test)] +fn slice_xor(input: &[u8], out: &mut [u8]) { + assert_eq!(input.len(), out.len()); + + let len: isize = input.len() as isize; + return_if_empty!(len); + + let mut input_ptr: *const u8 = &input[0]; + let mut out_ptr: *mut u8 = &mut out[0]; + + let mut n: isize = 0; + unsafe { + assert_eq!(4, PURE_RUST_UNROLL); + if len > PURE_RUST_UNROLL { + let len_minus_unroll = len - PURE_RUST_UNROLL; + while n < len_minus_unroll { + *out_ptr ^= *input_ptr; + *out_ptr.offset(1) ^= *input_ptr.offset(1); + *out_ptr.offset(2) ^= *input_ptr.offset(2); + *out_ptr.offset(3) ^= *input_ptr.offset(3); + + input_ptr = input_ptr.offset(PURE_RUST_UNROLL); + out_ptr = out_ptr.offset(PURE_RUST_UNROLL); + n += PURE_RUST_UNROLL; + } + } + while n < len { + *out_ptr ^= *input_ptr; + + input_ptr = input_ptr.offset(1); + out_ptr = out_ptr.offset(1); + n += 1; + } + } + /* for n in 0..input.len() { + * out[n] ^= input[n] + * } + */ +} + +#[cfg(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +))] +extern "C" { + fn reedsolomon_gal_mul( + low: *const u8, + high: *const u8, + input: *const u8, + out: *mut u8, + len: libc::size_t, + ) -> libc::size_t; + + fn reedsolomon_gal_mul_xor( + low: *const u8, + high: *const u8, + input: *const u8, + out: *mut u8, + len: libc::size_t, + ) -> libc::size_t; +} + +#[cfg(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +))] +pub fn mul_slice(c: u8, input: &[u8], out: &mut [u8]) { + let low: *const u8 = &MUL_TABLE_LOW[c as usize][0]; + let high: *const u8 = &MUL_TABLE_HIGH[c as usize][0]; + + assert_eq!(input.len(), out.len()); + + let input_ptr: *const u8 = &input[0]; + let out_ptr: *mut u8 = &mut out[0]; + let size: libc::size_t = input.len(); + + let bytes_done: usize = + unsafe { reedsolomon_gal_mul(low, high, input_ptr, out_ptr, size) as usize }; + + mul_slice_pure_rust(c, &input[bytes_done..], &mut out[bytes_done..]); +} + +#[cfg(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +))] +pub fn mul_slice_xor(c: u8, input: &[u8], out: &mut [u8]) { + let low: *const u8 = &MUL_TABLE_LOW[c as usize][0]; + let high: *const u8 = &MUL_TABLE_HIGH[c as usize][0]; + + assert_eq!(input.len(), out.len()); + + let input_ptr: *const u8 = &input[0]; + let out_ptr: *mut u8 = &mut out[0]; + let size: libc::size_t = input.len(); + + let bytes_done: usize = + unsafe { reedsolomon_gal_mul_xor(low, high, input_ptr, out_ptr, size) as usize }; + + mul_slice_xor_pure_rust(c, &input[bytes_done..], &mut out[bytes_done..]); +} + +#[cfg(test)] +mod tests { + extern crate alloc; + + use alloc::vec; + + use super::*; + use crate::tests::fill_random; + use rand; + + static BACKBLAZE_LOG_TABLE: [u8; 256] = [ + //-1, 0, 1, 25, 2, 50, 26, 198, + // first value is changed from -1 to 0 + 0, 0, 1, 25, 2, 50, 26, 198, 3, 223, 51, 238, 27, 104, 199, 75, 4, 100, 224, 14, 52, 141, + 239, 129, 28, 193, 105, 248, 200, 8, 76, 113, 5, 138, 101, 47, 225, 36, 15, 33, 53, 147, + 142, 218, 240, 18, 130, 69, 29, 181, 194, 125, 106, 39, 249, 185, 201, 154, 9, 120, 77, + 228, 114, 166, 6, 191, 139, 98, 102, 221, 48, 253, 226, 152, 37, 179, 16, 145, 34, 136, 54, + 208, 148, 206, 143, 150, 219, 189, 241, 210, 19, 92, 131, 56, 70, 64, 30, 66, 182, 163, + 195, 72, 126, 110, 107, 58, 40, 84, 250, 133, 186, 61, 202, 94, 155, 159, 10, 21, 121, 43, + 78, 212, 229, 172, 115, 243, 167, 87, 7, 112, 192, 247, 140, 128, 99, 13, 103, 74, 222, + 237, 49, 197, 254, 24, 227, 165, 153, 119, 38, 184, 180, 124, 17, 68, 146, 217, 35, 32, + 137, 46, 55, 63, 209, 91, 149, 188, 207, 205, 144, 135, 151, 178, 220, 252, 190, 97, 242, + 86, 211, 171, 20, 42, 93, 158, 132, 60, 57, 83, 71, 109, 65, 162, 31, 45, 67, 216, 183, + 123, 164, 118, 196, 23, 73, 236, 127, 12, 111, 246, 108, 161, 59, 82, 41, 157, 85, 170, + 251, 96, 134, 177, 187, 204, 62, 90, 203, 89, 95, 176, 156, 169, 160, 81, 11, 245, 22, 235, + 122, 117, 44, 215, 79, 174, 213, 233, 230, 231, 173, 232, 116, 214, 244, 234, 168, 80, 88, + 175, + ]; + + #[test] + fn log_table_same_as_backblaze() { + for i in 0..256 { + assert_eq!(LOG_TABLE[i], BACKBLAZE_LOG_TABLE[i]); + } + } + + #[test] + fn test_associativity() { + for a in 0..256 { + let a = a as u8; + for b in 0..256 { + let b = b as u8; + for c in 0..256 { + let c = c as u8; + let x = add(a, add(b, c)); + let y = add(add(a, b), c); + assert_eq!(x, y); + let x = mul(a, mul(b, c)); + let y = mul(mul(a, b), c); + assert_eq!(x, y); + } + } + } + } + + quickcheck! { + fn qc_add_associativity(a: u8, b: u8, c: u8) -> bool { + add(a, add(b, c)) == add(add(a, b), c) + } + + fn qc_mul_associativity(a: u8, b: u8, c: u8) -> bool { + mul(a, mul(b, c)) == mul(mul(a, b), c) + } + } + + #[test] + fn test_identity() { + for a in 0..256 { + let a = a as u8; + let b = sub(0, a); + let c = sub(a, b); + assert_eq!(c, 0); + if a != 0 { + let b = div(1, a); + let c = mul(a, b); + assert_eq!(c, 1); + } + } + } + + quickcheck! { + fn qc_additive_identity(a: u8) -> bool { + sub(a, sub(0, a)) == 0 + } + + fn qc_multiplicative_identity(a: u8) -> bool { + if a == 0 { true } + else { mul(a, div(1, a)) == 1 } + } + } + + #[test] + fn test_commutativity() { + for a in 0..256 { + let a = a as u8; + for b in 0..256 { + let b = b as u8; + let x = add(a, b); + let y = add(b, a); + assert_eq!(x, y); + let x = mul(a, b); + let y = mul(b, a); + assert_eq!(x, y); + } + } + } + + quickcheck! { + fn qc_add_commutativity(a: u8, b: u8) -> bool { + add(a, b) == add(b, a) + } + + fn qc_mul_commutativity(a: u8, b: u8) -> bool { + mul(a, b) == mul(b, a) + } + } + + #[test] + fn test_distributivity() { + for a in 0..256 { + let a = a as u8; + for b in 0..256 { + let b = b as u8; + for c in 0..256 { + let c = c as u8; + let x = mul(a, add(b, c)); + let y = add(mul(a, b), mul(a, c)); + assert_eq!(x, y); + } + } + } + } + + quickcheck! { + fn qc_add_distributivity(a: u8, b: u8, c: u8) -> bool { + mul(a, add(b, c)) == add(mul(a, b), mul(a, c)) + } + } + + #[test] + fn test_exp() { + for a in 0..256 { + let a = a as u8; + let mut power = 1u8; + for j in 0..256 { + let x = exp(a, j); + assert_eq!(x, power); + power = mul(power, a); + } + } + } + + #[test] + fn test_galois() { + assert_eq!(mul(3, 4), 12); + assert_eq!(mul(7, 7), 21); + assert_eq!(mul(23, 45), 41); + + let input = [ + 0, 1, 2, 3, 4, 5, 6, 10, 50, 100, 150, 174, 201, 255, 99, 32, 67, 85, 200, 199, 198, + 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186, 185, + ]; + let mut output1 = vec![0; input.len()]; + let mut output2 = vec![0; input.len()]; + mul_slice(25, &input, &mut output1); + let expect = [ + 0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0xfa, 0xb8, 0x6d, 0xc7, 0x85, 0xc3, 0x1f, + 0x22, 0x7, 0x25, 0xfe, 0xda, 0x5d, 0x44, 0x6f, 0x76, 0x39, 0x20, 0xb, 0x12, 0x11, 0x8, + 0x23, 0x3a, 0x75, 0x6c, 0x47, + ]; + for i in 0..input.len() { + assert_eq!(expect[i], output1[i]); + } + mul_slice(25, &input, &mut output2); + for i in 0..input.len() { + assert_eq!(expect[i], output2[i]); + } + + let expect_xor = [ + 0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0x2f, 0x79, 0xf2, 0x7, 0x51, 0xd4, 0x19, 0x31, + 0xc9, 0xf8, 0xfc, 0xf9, 0x4f, 0x62, 0x15, 0x38, 0xfb, 0xd6, 0xa1, 0x8c, 0x96, 0xbb, + 0xcc, 0xe1, 0x22, 0xf, 0x78, + ]; + mul_slice_xor(52, &input, &mut output1); + for i in 0..input.len() { + assert_eq!(expect_xor[i], output1[i]); + } + mul_slice_xor(52, &input, &mut output2); + for i in 0..input.len() { + assert_eq!(expect_xor[i], output2[i]); + } + + let expect = [ + 0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x9e, 0x3, 0x6, 0xe8, 0x75, 0xbd, 0x40, 0x36, + 0xa3, 0x95, 0xcb, 0xc, 0xdd, 0x6c, 0xa2, 0x13, 0x23, 0x92, 0x5c, 0xed, 0x1b, 0xaa, + 0x64, 0xd5, 0xe5, 0x54, 0x9a, + ]; + mul_slice(177, &input, &mut output1); + for i in 0..input.len() { + assert_eq!(expect[i], output1[i]); + } + mul_slice(177, &input, &mut output2); + for i in 0..input.len() { + assert_eq!(expect[i], output2[i]); + } + + let expect_xor = [ + 0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0xfb, 0xec, 0xc5, 0xd0, 0xc7, 0x53, 0x88, + 0xa3, 0xa5, 0x6, 0x78, 0x97, 0x9f, 0x5b, 0xa, 0xce, 0xa8, 0x6c, 0x3d, 0xf9, 0xdf, 0x1b, + 0x4a, 0x8e, 0xe8, 0x2c, 0x7d, + ]; + mul_slice_xor(117, &input, &mut output1); + for i in 0..input.len() { + assert_eq!(expect_xor[i], output1[i]); + } + mul_slice_xor(117, &input, &mut output2); + for i in 0..input.len() { + assert_eq!(expect_xor[i], output2[i]); + } + + assert_eq!(exp(2, 2), 4); + assert_eq!(exp(5, 20), 235); + assert_eq!(exp(13, 7), 43); + } + + #[test] + fn test_slice_add() { + let length_list = [16, 32, 34]; + for len in length_list.iter() { + let mut input = vec![0; *len]; + fill_random(&mut input); + let mut output = vec![0; *len]; + fill_random(&mut output); + let mut expect = vec![0; *len]; + for i in 0..expect.len() { + expect[i] = input[i] ^ output[i]; + } + slice_xor(&input, &mut output); + for i in 0..expect.len() { + assert_eq!(expect[i], output[i]); + } + fill_random(&mut output); + for i in 0..expect.len() { + expect[i] = input[i] ^ output[i]; + } + slice_xor(&input, &mut output); + for i in 0..expect.len() { + assert_eq!(expect[i], output[i]); + } + } + } + + #[test] + fn test_div_a_is_0() { + assert_eq!(0, div(0, 100)); + } + + #[test] + #[should_panic] + fn test_div_b_is_0() { + div(1, 0); + } + + #[test] + fn test_same_as_maybe_ffi() { + let len = 10_003; + for _ in 0..100 { + let c = rand::random::(); + let mut input = vec![0; len]; + fill_random(&mut input); + { + let mut output = vec![0; len]; + fill_random(&mut output); + let mut output_copy = output.clone(); + + mul_slice(c, &input, &mut output); + mul_slice(c, &input, &mut output_copy); + + assert_eq!(output, output_copy); + } + { + let mut output = vec![0; len]; + fill_random(&mut output); + let mut output_copy = output.clone(); + + mul_slice_xor(c, &input, &mut output); + mul_slice_xor(c, &input, &mut output_copy); + + assert_eq!(output, output_copy); + } + } + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/lib.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/lib.rs new file mode 100644 index 000000000..0ba04ae0e --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/lib.rs @@ -0,0 +1,200 @@ +//! This crate provides an encoder/decoder for Reed-Solomon erasure code. +//! +//! Please note that erasure coding means errors are not directly detected or corrected, +//! but missing data pieces (shards) can be reconstructed given that +//! the configuration provides high enough redundancy. +//! +//! You will have to implement error detection separately (e.g. via checksums) +//! and simply leave out the corrupted shards when attempting to reconstruct +//! the missing data. +#![allow(dead_code)] +#![cfg_attr(not(feature = "std"), no_std)] + +#[cfg(test)] +#[macro_use] +extern crate quickcheck; + +#[cfg(test)] +extern crate rand; + +extern crate smallvec; + +#[cfg(feature = "simd-accel")] +extern crate libc; + +use ::core::iter; +use ::core::iter::FromIterator; + +#[macro_use] +mod macros; + +mod core; +mod errors; +mod matrix; + +#[cfg(test)] +mod tests; + +pub mod galois_16; +pub mod galois_8; + +pub use crate::errors::Error; +pub use crate::errors::SBSError; + +pub use crate::core::ReedSolomon; +pub use crate::core::ShardByShard; + +// TODO: Can be simplified once https://github.com/rust-lang/rfcs/issues/2505 is resolved +#[cfg(not(feature = "std"))] +use libm::log2f as log2; +#[cfg(feature = "std")] +fn log2(n: f32) -> f32 { + n.log2() +} + +/// A finite field to perform encoding over. +pub trait Field: Sized { + /// The order of the field. This is a limit on the number of shards + /// in an encoding. + const ORDER: usize; + + /// The representational type of the field. + type Elem: Default + Clone + Copy + PartialEq + ::core::fmt::Debug; + + /// Add two elements together. + fn add(a: Self::Elem, b: Self::Elem) -> Self::Elem; + + /// Multiply two elements together. + fn mul(a: Self::Elem, b: Self::Elem) -> Self::Elem; + + /// Divide a by b. Panics is b is zero. + fn div(a: Self::Elem, b: Self::Elem) -> Self::Elem; + + /// Raise `a` to the n'th power. + fn exp(a: Self::Elem, n: usize) -> Self::Elem; + + /// The "zero" element or additive identity. + fn zero() -> Self::Elem; + + /// The "one" element or multiplicative identity. + fn one() -> Self::Elem; + + fn nth_internal(n: usize) -> Self::Elem; + + /// Yield the nth element of the field. Panics if n >= ORDER. + /// Assignment is arbitrary but must be unique to `n`. + fn nth(n: usize) -> Self::Elem { + if n >= Self::ORDER { + let pow = log2(Self::ORDER as f32) as usize; + panic!("{} out of bounds for GF(2^{}) member", n, pow) + } + + Self::nth_internal(n) + } + + /// Multiply a slice of elements by another. Writes into the output slice. + /// + /// # Panics + /// Panics if the output slice does not have equal length to the input. + fn mul_slice(elem: Self::Elem, input: &[Self::Elem], out: &mut [Self::Elem]) { + assert_eq!(input.len(), out.len()); + + for (i, o) in input.iter().zip(out) { + *o = Self::mul(elem.clone(), i.clone()) + } + } + + /// Multiply a slice of elements by another, adding each result to the corresponding value in + /// `out`. + /// + /// # Panics + /// Panics if the output slice does not have equal length to the input. + fn mul_slice_add(elem: Self::Elem, input: &[Self::Elem], out: &mut [Self::Elem]) { + assert_eq!(input.len(), out.len()); + + for (i, o) in input.iter().zip(out) { + *o = Self::add(o.clone(), Self::mul(elem.clone(), i.clone())) + } + } +} + +/// Something which might hold a shard. +/// +/// This trait is used in reconstruction, where some of the shards +/// may be unknown. +pub trait ReconstructShard { + /// The size of the shard data; `None` if empty. + fn len(&self) -> Option; + + /// Get a mutable reference to the shard data, returning `None` if uninitialized. + fn get(&mut self) -> Option<&mut [F::Elem]>; + + /// Get a mutable reference to the shard data, initializing it to the + /// given length if it was `None`. Returns an error if initialization fails. + fn get_or_initialize( + &mut self, + len: usize, + ) -> Result<&mut [F::Elem], Result<&mut [F::Elem], Error>>; +} + +impl + AsMut<[F::Elem]> + FromIterator> ReconstructShard + for Option +{ + fn len(&self) -> Option { + self.as_ref().map(|x| x.as_ref().len()) + } + + fn get(&mut self) -> Option<&mut [F::Elem]> { + self.as_mut().map(|x| x.as_mut()) + } + + fn get_or_initialize( + &mut self, + len: usize, + ) -> Result<&mut [F::Elem], Result<&mut [F::Elem], Error>> { + let is_some = self.is_some(); + let x = self + .get_or_insert_with(|| iter::repeat(F::zero()).take(len).collect()) + .as_mut(); + + if is_some { + Ok(x) + } else { + Err(Ok(x)) + } + } +} + +impl + AsMut<[F::Elem]>> ReconstructShard for (T, bool) { + fn len(&self) -> Option { + if !self.1 { + None + } else { + Some(self.0.as_ref().len()) + } + } + + fn get(&mut self) -> Option<&mut [F::Elem]> { + if !self.1 { + None + } else { + Some(self.0.as_mut()) + } + } + + fn get_or_initialize( + &mut self, + len: usize, + ) -> Result<&mut [F::Elem], Result<&mut [F::Elem], Error>> { + let x = self.0.as_mut(); + if x.len() == len { + if self.1 { + Ok(x) + } else { + Err(Ok(x)) + } + } else { + Err(Err(Error::IncorrectShardSize)) + } + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/macros.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/macros.rs new file mode 100644 index 000000000..340b27430 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/macros.rs @@ -0,0 +1,245 @@ +/// Constructs vector of shards. +/// +/// # Example +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # use reed_solomon_erasure::*; +/// # fn main () { +/// let shards: Vec> = shards!([1, 2, 3], +/// [4, 5, 6]); +/// # } +/// ``` +#[macro_export] +macro_rules! shards { + ( + $( [ $( $x:expr ),* ] ),* + ) => {{ + vec![ $( vec![ $( $x ),* ] ),* ] + }} +} + +/// Makes it easier to work with 2D slices, arrays, etc. +/// +/// # Examples +/// ## Byte arrays on stack to `Vec<&[u8]>` +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # fn main () { +/// let array: [[u8; 3]; 2] = [[1, 2, 3], +/// [4, 5, 6]]; +/// +/// let refs: Vec<&[u8]> = +/// convert_2D_slices!(array =>to_vec &[u8]); +/// # } +/// ``` +/// ## Byte arrays on stack to `Vec<&mut [u8]>` (borrow mutably) +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # fn main () { +/// let mut array: [[u8; 3]; 2] = [[1, 2, 3], +/// [4, 5, 6]]; +/// +/// let refs: Vec<&mut [u8]> = +/// convert_2D_slices!(array =>to_mut_vec &mut [u8]); +/// # } +/// ``` +/// ## Byte arrays on stack to `SmallVec<[&mut [u8]; 32]>` (borrow mutably) +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # extern crate smallvec; +/// # use smallvec::SmallVec; +/// # fn main () { +/// let mut array: [[u8; 3]; 2] = [[1, 2, 3], +/// [4, 5, 6]]; +/// +/// let refs: SmallVec<[&mut [u8]; 32]> = +/// convert_2D_slices!(array =>to_mut SmallVec<[&mut [u8]; 32]>, +/// SmallVec::with_capacity); +/// # } +/// ``` +/// ## Shard array to `SmallVec<[&mut [u8]; 32]>` (borrow mutably) +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # extern crate smallvec; +/// # use smallvec::SmallVec; +/// # fn main () { +/// let mut shards = shards!([1, 2, 3], +/// [4, 5, 6]); +/// +/// let refs: SmallVec<[&mut [u8]; 32]> = +/// convert_2D_slices!(shards =>to_mut SmallVec<[&mut [u8]; 32]>, +/// SmallVec::with_capacity); +/// # } +/// ``` +/// ## Shard array to `Vec<&mut [u8]>` (borrow mutably) into `SmallVec<[&mut [u8]; 32]>` (move) +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # extern crate smallvec; +/// # use smallvec::SmallVec; +/// # fn main () { +/// let mut shards = shards!([1, 2, 3], +/// [4, 5, 6]); +/// +/// let refs1 = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); +/// +/// let refs2: SmallVec<[&mut [u8]; 32]> = +/// convert_2D_slices!(refs1 =>into SmallVec<[&mut [u8]; 32]>, +/// SmallVec::with_capacity); +/// # } +/// ``` +#[macro_export] +macro_rules! convert_2D_slices { + ( + $slice:expr =>into_vec $dst_type:ty + ) => { + convert_2D_slices!($slice =>into Vec<$dst_type>, + Vec::with_capacity) + }; + ( + $slice:expr =>to_vec $dst_type:ty + ) => { + convert_2D_slices!($slice =>to Vec<$dst_type>, + Vec::with_capacity) + }; + ( + $slice:expr =>to_mut_vec $dst_type:ty + ) => { + convert_2D_slices!($slice =>to_mut Vec<$dst_type>, + Vec::with_capacity) + }; + ( + $slice:expr =>into $dst_type:ty, $with_capacity:path + ) => {{ + let mut result: $dst_type = + $with_capacity($slice.len()); + for i in $slice.into_iter() { + result.push(i); + } + result + }}; + ( + $slice:expr =>to $dst_type:ty, $with_capacity:path + ) => {{ + let mut result: $dst_type = + $with_capacity($slice.len()); + for i in $slice.iter() { + result.push(i); + } + result + }}; + ( + $slice:expr =>to_mut $dst_type:ty, $with_capacity:path + ) => {{ + let mut result: $dst_type = + $with_capacity($slice.len()); + for i in $slice.iter_mut() { + result.push(i); + } + result + }} +} + +macro_rules! check_slices { + ( + multi => $slices:expr + ) => {{ + let size = $slices[0].as_ref().len(); + if size == 0 { + return Err(Error::EmptyShard); + } + for slice in $slices.iter() { + if slice.as_ref().len() != size { + return Err(Error::IncorrectShardSize); + } + } + }}; + ( + single => $slice_left:expr, single => $slice_right:expr + ) => {{ + if $slice_left.as_ref().len() != $slice_right.as_ref().len() { + return Err(Error::IncorrectShardSize); + } + }}; + ( + multi => $slices:expr, single => $single:expr + ) => {{ + check_slices!(multi => $slices); + + check_slices!(single => $slices[0], single => $single); + }}; + ( + multi => $slices_left:expr, multi => $slices_right:expr + ) => {{ + check_slices!(multi => $slices_left); + check_slices!(multi => $slices_right); + + check_slices!(single => $slices_left[0], single => $slices_right[0]); + }} +} + +macro_rules! check_slice_index { + ( + all => $codec:expr, $index:expr + ) => {{ + if $index >= $codec.total_shard_count { + return Err(Error::InvalidIndex); + } + }}; + ( + data => $codec:expr, $index:expr + ) => {{ + if $index >= $codec.data_shard_count { + return Err(Error::InvalidIndex); + } + }}; + ( + parity => $codec:expr, $index:expr + ) => {{ + if $index >= $codec.parity_shard_count { + return Err(Error::InvalidIndex); + } + }}; +} + +macro_rules! check_piece_count { + ( + all => $codec:expr, $pieces:expr + ) => {{ + if $pieces.as_ref().len() < $codec.total_shard_count { + return Err(Error::TooFewShards); + } + if $pieces.as_ref().len() > $codec.total_shard_count { + return Err(Error::TooManyShards); + } + }}; + ( + data => $codec:expr, $pieces:expr + ) => {{ + if $pieces.as_ref().len() < $codec.data_shard_count { + return Err(Error::TooFewDataShards); + } + if $pieces.as_ref().len() > $codec.data_shard_count { + return Err(Error::TooManyDataShards); + } + }}; + ( + parity => $codec:expr, $pieces:expr + ) => {{ + if $pieces.as_ref().len() < $codec.parity_shard_count { + return Err(Error::TooFewParityShards); + } + if $pieces.as_ref().len() > $codec.parity_shard_count { + return Err(Error::TooManyParityShards); + } + }}; + ( + parity_buf => $codec:expr, $pieces:expr + ) => {{ + if $pieces.as_ref().len() < $codec.parity_shard_count { + return Err(Error::TooFewBufferShards); + } + if $pieces.as_ref().len() > $codec.parity_shard_count { + return Err(Error::TooManyBufferShards); + } + }}; +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/matrix.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/matrix.rs new file mode 100644 index 000000000..508d43046 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/matrix.rs @@ -0,0 +1,425 @@ +#![allow(dead_code)] +extern crate alloc; + +use alloc::vec; +use alloc::vec::Vec; + +use crate::Field; +use smallvec::SmallVec; + +#[derive(Debug)] +pub enum Error { + SingularMatrix, +} + +macro_rules! acc { + ( + $m:ident, $r:expr, $c:expr + ) => { + $m.data[$r * $m.col_count + $c] + }; +} + +pub fn flatten(m: Vec>) -> Vec { + let mut result: Vec = Vec::with_capacity(m.len() * m[0].len()); + for row in m { + for v in row { + result.push(v); + } + } + result +} + +#[derive(PartialEq, Debug, Clone)] +pub struct Matrix { + row_count: usize, + col_count: usize, + data: SmallVec<[F::Elem; 1024]>, // store in flattened structure + // the smallvec can hold a matrix of size up to 32x32 in stack +} + +fn calc_matrix_row_start_end(col_count: usize, row: usize) -> (usize, usize) { + let start = row * col_count; + let end = start + col_count; + + (start, end) +} + +impl Matrix { + fn calc_row_start_end(&self, row: usize) -> (usize, usize) { + calc_matrix_row_start_end(self.col_count, row) + } + + pub fn new(rows: usize, cols: usize) -> Matrix { + let data = SmallVec::from_vec(vec![F::zero(); rows * cols]); + + Matrix { + row_count: rows, + col_count: cols, + data, + } + } + + pub fn new_with_data(init_data: Vec>) -> Matrix { + let rows = init_data.len(); + let cols = init_data[0].len(); + + for r in init_data.iter() { + if r.len() != cols { + panic!("Inconsistent row sizes") + } + } + + let data = SmallVec::from_vec(flatten(init_data)); + + Matrix { + row_count: rows, + col_count: cols, + data, + } + } + + #[cfg(test)] + pub fn make_random(size: usize) -> Matrix + where + rand::distributions::Standard: rand::distributions::Distribution, + { + let mut vec: Vec> = vec![vec![Default::default(); size]; size]; + for v in vec.iter_mut() { + crate::tests::fill_random(v); + } + + Matrix::new_with_data(vec) + } + + pub fn identity(size: usize) -> Matrix { + let mut result = Self::new(size, size); + for i in 0..size { + acc!(result, i, i) = F::one(); + } + result + } + + pub fn col_count(&self) -> usize { + self.col_count + } + + pub fn row_count(&self) -> usize { + self.row_count + } + + pub fn get(&self, r: usize, c: usize) -> F::Elem { + acc!(self, r, c).clone() + } + + pub fn set(&mut self, r: usize, c: usize, val: F::Elem) { + acc!(self, r, c) = val; + } + + pub fn multiply(&self, rhs: &Matrix) -> Matrix { + if self.col_count != rhs.row_count { + panic!( + "Colomn count on left is different from row count on right, lhs: {}, rhs: {}", + self.col_count, rhs.row_count + ) + } + let mut result = Self::new(self.row_count, rhs.col_count); + for r in 0..self.row_count { + for c in 0..rhs.col_count { + let mut val = F::zero(); + for i in 0..self.col_count { + let mul = F::mul(acc!(self, r, i).clone(), acc!(rhs, i, c).clone()); + + val = F::add(val, mul); + } + acc!(result, r, c) = val; + } + } + result + } + + pub fn augment(&self, rhs: &Matrix) -> Matrix { + if self.row_count != rhs.row_count { + panic!( + "Matrices do not have the same row count, lhs: {}, rhs: {}", + self.row_count, rhs.row_count + ) + } + let mut result = Self::new(self.row_count, self.col_count + rhs.col_count); + for r in 0..self.row_count { + for c in 0..self.col_count { + acc!(result, r, c) = acc!(self, r, c).clone(); + } + let self_column_count = self.col_count; + for c in 0..rhs.col_count { + acc!(result, r, self_column_count + c) = acc!(rhs, r, c).clone(); + } + } + + result + } + + pub fn sub_matrix(&self, rmin: usize, cmin: usize, rmax: usize, cmax: usize) -> Matrix { + let mut result = Self::new(rmax - rmin, cmax - cmin); + for r in rmin..rmax { + for c in cmin..cmax { + acc!(result, r - rmin, c - cmin) = acc!(self, r, c).clone(); + } + } + result + } + + pub fn get_row(&self, row: usize) -> &[F::Elem] { + let (start, end) = self.calc_row_start_end(row); + + &self.data[start..end] + } + + pub fn swap_rows(&mut self, r1: usize, r2: usize) { + let (r1_s, _) = self.calc_row_start_end(r1); + let (r2_s, _) = self.calc_row_start_end(r2); + + if r1 == r2 { + return; + } else { + for i in 0..self.col_count { + self.data.swap(r1_s + i, r2_s + i); + } + } + } + + pub fn is_square(&self) -> bool { + self.row_count == self.col_count + } + + pub fn gaussian_elim(&mut self) -> Result<(), Error> { + for r in 0..self.row_count { + if acc!(self, r, r) == F::zero() { + for r_below in r + 1..self.row_count { + if acc!(self, r_below, r) != F::zero() { + self.swap_rows(r, r_below); + break; + } + } + } + // If we couldn't find one, the matrix is singular. + if acc!(self, r, r) == F::zero() { + return Err(Error::SingularMatrix); + } + // Scale to 1. + if acc!(self, r, r) != F::one() { + let scale = F::div(F::one(), acc!(self, r, r).clone()); + for c in 0..self.col_count { + acc!(self, r, c) = F::mul(scale, acc!(self, r, c).clone()); + } + } + // Make everything below the 1 be a 0 by subtracting + // a multiple of it. (Subtraction and addition are + // both exclusive or in the Galois field.) + for r_below in r + 1..self.row_count { + if acc!(self, r_below, r) != F::zero() { + let scale = acc!(self, r_below, r).clone(); + for c in 0..self.col_count { + acc!(self, r_below, c) = F::add( + acc!(self, r_below, c).clone(), + F::mul(scale, acc!(self, r, c).clone()), + ); + } + } + } + } + + // Now clear the part above the main diagonal. + for d in 0..self.row_count { + for r_above in 0..d { + if acc!(self, r_above, d) != F::zero() { + let scale = acc!(self, r_above, d).clone(); + for c in 0..self.col_count { + acc!(self, r_above, c) = F::add( + acc!(self, r_above, c).clone(), + F::mul(scale, acc!(self, d, c).clone()), + ); + } + } + } + } + Ok(()) + } + + pub fn invert(&self) -> Result, Error> { + if !self.is_square() { + panic!("Trying to invert a non-square matrix") + } + + let row_count = self.row_count; + let col_count = self.col_count; + + let mut work = self.augment(&Self::identity(row_count)); + work.gaussian_elim()?; + + Ok(work.sub_matrix(0, row_count, col_count, col_count * 2)) + } + + pub fn vandermonde(rows: usize, cols: usize) -> Matrix { + let mut result = Self::new(rows, cols); + + for r in 0..rows { + // doesn't matter what `r_a` is as long as it's unique. + // then the vandermonde matrix is invertible. + let r_a = F::nth(r); + for c in 0..cols { + acc!(result, r, c) = F::exp(r_a, c); + } + } + + result + } +} + +#[cfg(test)] +mod tests { + extern crate alloc; + + use alloc::vec; + + use super::Matrix; + use crate::galois_8; + + macro_rules! matrix { + ( + $( + [ $( $x:expr ),+ ] + ),* + ) => ( + Matrix::::new_with_data(vec![ $( vec![$( $x ),*] ),* ]) + ); + ($rows:expr, $cols:expr) => (Matrix::new($rows, $cols)); + } + + #[test] + fn test_matrix_col_count() { + let m1 = matrix!([1, 0, 0]); + let m2 = matrix!([0, 0, 0], [0, 0, 0]); + let m3: Matrix = Matrix::new(1, 4); + + assert_eq!(3, m1.col_count()); + assert_eq!(3, m2.col_count()); + assert_eq!(4, m3.col_count()); + } + + #[test] + fn test_matrix_row_count() { + let m1 = matrix!([1, 0, 0]); + let m2 = matrix!([0, 0, 0], [0, 0, 0]); + let m3: Matrix = Matrix::new(1, 4); + + assert_eq!(1, m1.row_count()); + assert_eq!(2, m2.row_count()); + assert_eq!(1, m3.row_count()); + } + + #[test] + fn test_matrix_swap_rows() { + { + let mut m1 = matrix!([1, 2, 3], [4, 5, 6], [7, 8, 9]); + let expect = matrix!([7, 8, 9], [4, 5, 6], [1, 2, 3]); + m1.swap_rows(0, 2); + assert_eq!(expect, m1); + } + { + let mut m1 = matrix!([1, 2, 3], [4, 5, 6], [7, 8, 9]); + let expect = m1.clone(); + m1.swap_rows(0, 0); + assert_eq!(expect, m1); + m1.swap_rows(1, 1); + assert_eq!(expect, m1); + m1.swap_rows(2, 2); + assert_eq!(expect, m1); + } + } + + #[test] + #[should_panic] + fn test_inconsistent_row_sizes() { + matrix!([1, 0, 0], [0, 1], [0, 0, 1]); + } + + #[test] + #[should_panic] + fn test_incompatible_multiply() { + let m1 = matrix!([0, 1], [0, 1], [0, 1]); + let m2 = matrix!([0, 1, 2]); + + m1.multiply(&m2); + } + + #[test] + #[should_panic] + fn test_incompatible_augment() { + let m1 = matrix!([0, 1]); + let m2 = matrix!([0, 1], [2, 3]); + + m1.augment(&m2); + } + + #[test] + fn test_matrix_identity() { + let m1 = Matrix::identity(3); + let m2 = matrix!([1, 0, 0], [0, 1, 0], [0, 0, 1]); + assert_eq!(m1, m2); + } + + #[test] + fn test_matrix_multiply() { + let m1 = matrix!([1, 2], [3, 4]); + let m2 = matrix!([5, 6], [7, 8]); + let actual = m1.multiply(&m2); + let expect = matrix!([11, 22], [19, 42]); + assert_eq!(actual, expect); + } + + #[test] + fn test_matrix_inverse_pass_cases() { + { + // Test case validating inverse of the input Matrix. + let m = matrix!([56, 23, 98], [3, 100, 200], [45, 201, 123]) + .invert() + .unwrap(); + let expect = matrix!([175, 133, 33], [130, 13, 245], [112, 35, 126]); + assert_eq!(m, expect); + } + { + // Test case validating inverse of the input Matrix. + let m = matrix!( + [1, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + [7, 7, 6, 6, 1] + ) + .invert() + .unwrap(); + let expect = matrix!( + [1, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [123, 123, 1, 122, 122], + [0, 0, 1, 0, 0], + [0, 0, 0, 1, 0] + ); + assert_eq!(m, expect); + } + } + + #[test] + #[should_panic] + fn test_matrix_inverse_non_square() { + // Test case with a non-square matrix. + matrix!([56, 23], [3, 100], [45, 201]).invert().unwrap(); + } + + #[test] + #[should_panic] + fn test_matrix_inverse_singular() { + matrix!([4, 2], [12, 6]).invert().unwrap(); + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/tests/galois_16.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/tests/galois_16.rs new file mode 100644 index 000000000..872472e85 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/tests/galois_16.rs @@ -0,0 +1,489 @@ +extern crate alloc; + +use alloc::vec; +use alloc::vec::Vec; + +use super::{fill_random, option_shards_into_shards, shards_into_option_shards}; +use crate::galois_16::ReedSolomon; + +macro_rules! make_random_shards { + ($per_shard:expr, $size:expr) => {{ + let mut shards = Vec::with_capacity(20); + for _ in 0..$size { + shards.push(vec![[0; 2]; $per_shard]); + } + + for s in shards.iter_mut() { + fill_random(s); + } + + shards + }}; +} + +#[test] +fn correct_field_order_restriction() { + const ORDER: usize = 1 << 16; + + assert!(ReedSolomon::new(ORDER, 1).is_err()); + assert!(ReedSolomon::new(1, ORDER).is_err()); + + // way too slow, because it needs to build a 65536*65536 vandermonde matrix + // assert!(ReedSolomon::new(ORDER - 1, 1).is_ok()); + assert!(ReedSolomon::new(1, ORDER - 1).is_ok()); +} + +quickcheck! { + fn qc_encode_verify_reconstruct_verify(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [[u8; 2]]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + let mut slice_present = vec![true; data + parity]; + for &p in corrupt_pos_s.iter() { + slice_present[p] = false; + } + + // reconstruct + { + let mut refs: Vec<_> = shards.iter_mut() + .map(|i| &mut i[..]) + .zip(slice_present.iter().cloned()) + .collect(); + + r.reconstruct(&mut refs[..]).unwrap(); + } + + ({ + let refs = + convert_2D_slices!(expect =>to_vec &[[u8; 2]]); + + r.verify(&refs).unwrap() + }) + && + expect == shards + && + ({ + let refs = + convert_2D_slices!(shards =>to_vec &[[u8; 2]]); + + r.verify(&refs).unwrap() + }) + } + + fn qc_encode_verify_reconstruct_verify_shards(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + r.encode(&mut expect).unwrap(); + + let expect = expect; + + let mut shards = shards_into_option_shards(expect.clone()); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + shards[p] = None; + } + + // reconstruct + r.reconstruct(&mut shards).unwrap(); + + let shards = option_shards_into_shards(shards); + + r.verify(&expect).unwrap() + && expect == shards + && r.verify(&shards).unwrap() + } + + fn qc_verify(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [[u8; 2]]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + + ({ + let refs = + convert_2D_slices!(expect =>to_vec &[[u8; 2]]); + + r.verify(&refs).unwrap() + }) + && + ((corrupt > 0 && expect != shards) + || (corrupt == 0 && expect == shards)) + && + ({ + let refs = + convert_2D_slices!(shards =>to_vec &[[u8; 2]]); + + (corrupt > 0 && !r.verify(&refs).unwrap()) + || (corrupt == 0 && r.verify(&refs).unwrap()) + }) + } + + fn qc_verify_shards(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + r.encode(&mut expect).unwrap(); + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + + r.verify(&expect).unwrap() + && + ((corrupt > 0 && expect != shards) + || (corrupt == 0 && expect == shards)) + && + ((corrupt > 0 && !r.verify(&shards).unwrap()) + || (corrupt == 0 && r.verify(&shards).unwrap())) + } + + fn qc_encode_sep_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [[u8; 2]]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let (data, parity) = shards.split_at_mut(data); + + let data_refs = + convert_2D_slices!(data =>to_mut_vec &[[u8; 2]]); + + let mut parity_refs = + convert_2D_slices!(parity =>to_mut_vec &mut [[u8; 2]]); + + r.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_sep_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + { + let (data, parity) = shards.split_at_mut(data); + + r.encode_sep(data, parity).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [[u8; 2]]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let mut refs = + convert_2D_slices!(shards =>to_mut_vec &mut [[u8; 2]]); + + for i in 0..data { + r.encode_single(i, &mut refs).unwrap(); + } + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + for i in 0..data { + r.encode_single(i, &mut shards).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_sep_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [[u8; 2]]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let (data_shards, parity_shards) = shards.split_at_mut(data); + + let data_refs = + convert_2D_slices!(data_shards =>to_mut_vec &[[u8; 2]]); + + let mut parity_refs = + convert_2D_slices!(parity_shards =>to_mut_vec &mut [[u8; 2]]); + + for i in 0..data { + r.encode_single_sep(i, data_refs[i], &mut parity_refs).unwrap(); + } + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_sep_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + { + let (data_shards, parity_shards) = shards.split_at_mut(data); + + for i in 0..data { + r.encode_single_sep(i, &data_shards[i], parity_shards).unwrap(); + } + } + + let shards = shards; + + expect == shards + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/tests/mod.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/tests/mod.rs new file mode 100644 index 000000000..488443b25 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/tests/mod.rs @@ -0,0 +1,2619 @@ +#![allow(dead_code)] + +extern crate alloc; + +use alloc::vec; +use alloc::vec::Vec; + +use super::{galois_8, Error, SBSError}; +use rand::{self, thread_rng, Rng}; + +mod galois_16; + +type ReedSolomon = crate::ReedSolomon; +type ShardByShard<'a> = crate::ShardByShard<'a, galois_8::Field>; + +macro_rules! make_random_shards { + ($per_shard:expr, $size:expr) => {{ + let mut shards = Vec::with_capacity(20); + for _ in 0..$size { + shards.push(vec![0; $per_shard]); + } + + for s in shards.iter_mut() { + fill_random(s); + } + + shards + }}; +} + +fn assert_eq_shards(s1: &[T], s2: &[U]) +where + T: AsRef<[u8]>, + U: AsRef<[u8]>, +{ + assert_eq!(s1.len(), s2.len()); + for i in 0..s1.len() { + assert_eq!(s1[i].as_ref(), s2[i].as_ref()); + } +} + +pub fn fill_random(arr: &mut [T]) +where + rand::distributions::Standard: rand::distributions::Distribution, +{ + for a in arr.iter_mut() { + *a = rand::random::(); + } +} + +fn shards_to_option_shards(shards: &[Vec]) -> Vec>> { + let mut result = Vec::with_capacity(shards.len()); + + for v in shards.iter() { + let inner: Vec = v.clone(); + result.push(Some(inner)); + } + result +} + +fn shards_into_option_shards(shards: Vec>) -> Vec>> { + let mut result = Vec::with_capacity(shards.len()); + + for v in shards { + result.push(Some(v)); + } + result +} + +fn option_shards_to_shards(shards: &[Option>]) -> Vec> { + let mut result = Vec::with_capacity(shards.len()); + + for i in 0..shards.len() { + let shard = match shards[i] { + Some(ref x) => x, + None => panic!("Missing shard, index : {}", i), + }; + let inner: Vec = shard.clone(); + result.push(inner); + } + result +} + +fn option_shards_into_shards(shards: Vec>>) -> Vec> { + let mut result = Vec::with_capacity(shards.len()); + + for shard in shards { + let shard = match shard { + Some(x) => x, + None => panic!("Missing shard"), + }; + result.push(shard); + } + result +} + +#[test] +fn test_no_data_shards() { + assert_eq!(Error::TooFewDataShards, ReedSolomon::new(0, 1).unwrap_err()); +} + +#[test] +fn test_no_parity_shards() { + assert_eq!( + Error::TooFewParityShards, + ReedSolomon::new(1, 0).unwrap_err() + ); +} + +#[test] +fn test_too_many_shards() { + assert_eq!( + Error::TooManyShards, + ReedSolomon::new(129, 128).unwrap_err() + ); +} + +#[test] +fn test_shard_count() { + let mut rng = thread_rng(); + for _ in 0..10 { + let data_shard_count = rng.gen_range(1, 128); + let parity_shard_count = rng.gen_range(1, 128); + + let total_shard_count = data_shard_count + parity_shard_count; + + let r = ReedSolomon::new(data_shard_count, parity_shard_count).unwrap(); + + assert_eq!(data_shard_count, r.data_shard_count()); + assert_eq!(parity_shard_count, r.parity_shard_count()); + assert_eq!(total_shard_count, r.total_shard_count()); + } +} + +#[test] +fn test_reed_solomon_clone() { + let r1 = ReedSolomon::new(10, 3).unwrap(); + let r2 = r1.clone(); + + assert_eq!(r1, r2); +} + +#[test] +fn test_encoding() { + let per_shard = 50_000; + + let r = ReedSolomon::new(10, 3).unwrap(); + + let mut shards = make_random_shards!(per_shard, 13); + + r.encode(&mut shards).unwrap(); + assert!(r.verify(&shards).unwrap()); + + assert_eq!( + Error::TooFewShards, + r.encode(&mut shards[0..1]).unwrap_err() + ); + + let mut bad_shards = make_random_shards!(per_shard, 13); + bad_shards[0] = vec![0 as u8]; + assert_eq!( + Error::IncorrectShardSize, + r.encode(&mut bad_shards).unwrap_err() + ); +} + +#[test] +fn test_reconstruct_shards() { + let per_shard = 100_000; + + let r = ReedSolomon::new(8, 5).unwrap(); + + let mut shards = make_random_shards!(per_shard, 13); + + r.encode(&mut shards).unwrap(); + + let master_copy = shards.clone(); + + let mut shards = shards_to_option_shards(&shards); + + // Try to decode with all shards present + r.reconstruct(&mut shards).unwrap(); + { + let shards = option_shards_to_shards(&shards); + assert!(r.verify(&shards).unwrap()); + assert_eq!(&shards, &master_copy); + } + + // Try to decode with 10 shards + shards[0] = None; + shards[2] = None; + //shards[4] = None; + r.reconstruct(&mut shards).unwrap(); + { + let shards = option_shards_to_shards(&shards); + assert!(r.verify(&shards).unwrap()); + assert_eq!(&shards, &master_copy); + } + + // Try to decode the same shards again to try to + // trigger the usage of cached decode matrix + shards[0] = None; + shards[2] = None; + //shards[4] = None; + r.reconstruct(&mut shards).unwrap(); + { + let shards = option_shards_to_shards(&shards); + assert!(r.verify(&shards).unwrap()); + assert_eq!(&shards, &master_copy); + } + + // Try to deocde with 6 data and 4 parity shards + shards[0] = None; + shards[2] = None; + shards[12] = None; + r.reconstruct(&mut shards).unwrap(); + { + let shards = option_shards_to_shards(&shards); + assert!(r.verify(&shards).unwrap()); + assert_eq!(&shards, &master_copy); + } + + // Try to reconstruct data only + shards[0] = None; + shards[1] = None; + shards[12] = None; + r.reconstruct_data(&mut shards).unwrap(); + { + let data_shards = option_shards_to_shards(&shards[0..8]); + assert_eq!(master_copy[0], data_shards[0]); + assert_eq!(master_copy[1], data_shards[1]); + assert_eq!(None, shards[12]); + } + + // Try to decode with 7 data and 1 parity shards + shards[0] = None; + shards[1] = None; + shards[9] = None; + shards[10] = None; + shards[11] = None; + shards[12] = None; + assert_eq!( + r.reconstruct(&mut shards).unwrap_err(), + Error::TooFewShardsPresent + ); +} + +#[test] +fn test_reconstruct() { + let r = ReedSolomon::new(2, 2).unwrap(); + + let mut shards: [[u8; 3]; 4] = [[0, 1, 2], [3, 4, 5], [200, 201, 203], [100, 101, 102]]; + + { + { + let mut shard_refs: Vec<&mut [u8]> = Vec::with_capacity(3); + + for shard in shards.iter_mut() { + shard_refs.push(shard); + } + + r.encode(&mut shard_refs).unwrap(); + } + + let shard_refs: Vec<_> = shards.iter().map(|i| &i[..]).collect(); + assert!(r.verify(&shard_refs).unwrap()); + } + + { + { + let mut shard_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + shard_refs[0][0] = 101; + shard_refs[0][1] = 102; + shard_refs[0][2] = 103; + + let shards_present = [false, true, true, true]; + + let mut shards = shard_refs + .into_iter() + .zip(shards_present.iter().cloned()) + .collect::>(); + + r.reconstruct(&mut shards[..]).unwrap(); + } + + let shard_refs: Vec<_> = shards.iter().map(|i| &i[..]).collect(); + assert!(r.verify(&shard_refs).unwrap()); + } + + let expect: [[u8; 3]; 4] = [[0, 1, 2], [3, 4, 5], [6, 11, 12], [5, 14, 11]]; + assert_eq!(expect, shards); + + { + { + let mut shard_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + shard_refs[0][0] = 201; + shard_refs[0][1] = 202; + shard_refs[0][2] = 203; + + shard_refs[2][0] = 101; + shard_refs[2][1] = 102; + shard_refs[2][2] = 103; + + let shards_present = [false, true, false, true]; + + let mut shards = shard_refs + .into_iter() + .zip(shards_present.iter().cloned()) + .collect::>(); + + r.reconstruct_data(&mut shards[..]).unwrap(); + } + + let shard_refs = convert_2D_slices!(shards =>to_vec &[u8]); + + assert!(!r.verify(&shard_refs).unwrap()); + } + + let expect: [[u8; 3]; 4] = [[0, 1, 2], [3, 4, 5], [101, 102, 103], [5, 14, 11]]; + assert_eq!(expect, shards); + + { + { + let mut shard_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + shard_refs[2][0] = 101; + shard_refs[2][1] = 102; + shard_refs[2][2] = 103; + + shard_refs[3][0] = 201; + shard_refs[3][1] = 202; + shard_refs[3][2] = 203; + + let shards_present = [true, true, false, false]; + + let mut shards = shard_refs + .into_iter() + .zip(shards_present.iter().cloned()) + .collect::>(); + + r.reconstruct_data(&mut shards[..]).unwrap(); + } + + let shard_refs = convert_2D_slices!(shards =>to_vec &[u8]); + + assert!(!r.verify(&shard_refs).unwrap()); + } + + let expect: [[u8; 3]; 4] = [[0, 1, 2], [3, 4, 5], [101, 102, 103], [201, 202, 203]]; + assert_eq!(expect, shards); +} + +quickcheck! { + fn qc_encode_verify_reconstruct_verify(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + let mut slice_present = vec![true; data + parity]; + for &p in corrupt_pos_s.iter() { + slice_present[p] = false; + } + + // reconstruct + { + let mut refs: Vec<_> = shards.iter_mut() + .map(|i| &mut i[..]) + .zip(slice_present.iter().cloned()) + .collect(); + + r.reconstruct(&mut refs[..]).unwrap(); + } + + ({ + let refs = + convert_2D_slices!(expect =>to_vec &[u8]); + + r.verify(&refs).unwrap() + }) + && + expect == shards + && + ({ + let refs = + convert_2D_slices!(shards =>to_vec &[u8]); + + r.verify(&refs).unwrap() + }) + } + + fn qc_encode_verify_reconstruct_verify_shards(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + r.encode(&mut expect).unwrap(); + + let expect = expect; + + let mut shards = shards_into_option_shards(expect.clone()); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + shards[p] = None; + } + + // reconstruct + r.reconstruct(&mut shards).unwrap(); + + let shards = option_shards_into_shards(shards); + + r.verify(&expect).unwrap() + && expect == shards + && r.verify(&shards).unwrap() + } + + fn qc_verify(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + + ({ + let refs = + convert_2D_slices!(expect =>to_vec &[u8]); + + r.verify(&refs).unwrap() + }) + && + ((corrupt > 0 && expect != shards) + || (corrupt == 0 && expect == shards)) + && + ({ + let refs = + convert_2D_slices!(shards =>to_vec &[u8]); + + (corrupt > 0 && !r.verify(&refs).unwrap()) + || (corrupt == 0 && r.verify(&refs).unwrap()) + }) + } + + fn qc_verify_shards(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + r.encode(&mut expect).unwrap(); + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + + r.verify(&expect).unwrap() + && + ((corrupt > 0 && expect != shards) + || (corrupt == 0 && expect == shards)) + && + ((corrupt > 0 && !r.verify(&shards).unwrap()) + || (corrupt == 0 && r.verify(&shards).unwrap())) + } + + fn qc_encode_sep_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let (data, parity) = shards.split_at_mut(data); + + let data_refs = + convert_2D_slices!(data =>to_mut_vec &[u8]); + + let mut parity_refs = + convert_2D_slices!(parity =>to_mut_vec &mut [u8]); + + r.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_sep_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + { + let (data, parity) = shards.split_at_mut(data); + + r.encode_sep(data, parity).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let mut refs = + convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + for i in 0..data { + r.encode_single(i, &mut refs).unwrap(); + } + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + for i in 0..data { + r.encode_single(i, &mut shards).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_sep_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let (data_shards, parity_shards) = shards.split_at_mut(data); + + let data_refs = + convert_2D_slices!(data_shards =>to_mut_vec &[u8]); + + let mut parity_refs = + convert_2D_slices!(parity_shards =>to_mut_vec &mut [u8]); + + for i in 0..data { + r.encode_single_sep(i, data_refs[i], &mut parity_refs).unwrap(); + } + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_sep_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + { + let (data_shards, parity_shards) = shards.split_at_mut(data); + + for i in 0..data { + r.encode_single_sep(i, &data_shards[i], parity_shards).unwrap(); + } + } + + let shards = shards; + + expect == shards + } +} + +#[test] +fn test_reconstruct_error_handling() { + let r = ReedSolomon::new(2, 2).unwrap(); + + let mut shards: [[u8; 3]; 4] = [[0, 1, 2], [3, 4, 5], [200, 201, 203], [100, 101, 102]]; + + { + let mut shard_refs: Vec<&mut [u8]> = Vec::with_capacity(3); + + for shard in shards.iter_mut() { + shard_refs.push(shard); + } + + r.encode(&mut shard_refs).unwrap(); + } + + { + let mut shard_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + shard_refs[0][0] = 101; + shard_refs[0][1] = 102; + shard_refs[0][2] = 103; + + let shards_present = [true, false, false, false]; + + let mut shard_refs: Vec<_> = shard_refs + .into_iter() + .zip(shards_present.iter().cloned()) + .collect(); + + assert_eq!( + Error::TooFewShardsPresent, + r.reconstruct(&mut shard_refs[..]).unwrap_err() + ); + + shard_refs[3].1 = true; + r.reconstruct(&mut shard_refs).unwrap(); + } +} + +#[test] +fn test_one_encode() { + let r = ReedSolomon::new(5, 5).unwrap(); + + let mut shards = shards!( + [0, 1], + [4, 5], + [2, 3], + [6, 7], + [8, 9], + [0, 0], + [0, 0], + [0, 0], + [0, 0], + [0, 0] + ); + + r.encode(&mut shards).unwrap(); + { + assert_eq!(shards[5][0], 12); + assert_eq!(shards[5][1], 13); + } + { + assert_eq!(shards[6][0], 10); + assert_eq!(shards[6][1], 11); + } + { + assert_eq!(shards[7][0], 14); + assert_eq!(shards[7][1], 15); + } + { + assert_eq!(shards[8][0], 90); + assert_eq!(shards[8][1], 91); + } + { + assert_eq!(shards[9][0], 94); + assert_eq!(shards[9][1], 95); + } + + assert!(r.verify(&shards).unwrap()); + + shards[8][0] += 1; + assert!(!r.verify(&shards).unwrap()); +} + +#[test] +fn test_verify_too_few_shards() { + let r = ReedSolomon::new(3, 2).unwrap(); + + let shards = make_random_shards!(10, 4); + + assert_eq!(Error::TooFewShards, r.verify(&shards).unwrap_err()); +} + +#[test] +fn test_verify_shards_with_buffer_incorrect_buffer_sizes() { + let r = ReedSolomon::new(3, 2).unwrap(); + + { + // Test too few slices in buffer + let shards = make_random_shards!(100, 5); + + let mut buffer = vec![vec![0; 100]; 1]; + + assert_eq!( + Error::TooFewBufferShards, + r.verify_with_buffer(&shards, &mut buffer).unwrap_err() + ); + } + { + // Test too many slices in buffer + let shards = make_random_shards!(100, 5); + + let mut buffer = vec![vec![0; 100]; 3]; + + assert_eq!( + Error::TooManyBufferShards, + r.verify_with_buffer(&shards, &mut buffer).unwrap_err() + ); + } + { + // Test correct number of slices in buffer + let mut shards = make_random_shards!(100, 5); + + r.encode(&mut shards).unwrap(); + + let mut buffer = vec![vec![0; 100]; 2]; + + assert_eq!(true, r.verify_with_buffer(&shards, &mut buffer).unwrap()); + } + { + // Test having first buffer being empty + let shards = make_random_shards!(100, 5); + + let mut buffer = vec![vec![0; 100]; 2]; + buffer[0] = vec![]; + + assert_eq!( + Error::EmptyShard, + r.verify_with_buffer(&shards, &mut buffer).unwrap_err() + ); + } + { + // Test having shards of inconsistent length in buffer + let shards = make_random_shards!(100, 5); + + let mut buffer = vec![vec![0; 100]; 2]; + buffer[1] = vec![0; 99]; + + assert_eq!( + Error::IncorrectShardSize, + r.verify_with_buffer(&shards, &mut buffer).unwrap_err() + ); + } +} + +#[test] +fn test_verify_shards_with_buffer_gives_correct_parity_shards() { + let r = ReedSolomon::new(10, 3).unwrap(); + + for _ in 0..100 { + let mut shards = make_random_shards!(100, 13); + let shards_copy = shards.clone(); + + r.encode(&mut shards).unwrap(); + + { + let mut buffer = make_random_shards!(100, 3); + + assert!(!r.verify_with_buffer(&shards_copy, &mut buffer).unwrap()); + + assert_eq_shards(&shards[10..], &buffer); + } + { + let mut buffer = make_random_shards!(100, 3); + + assert!(r.verify_with_buffer(&shards, &mut buffer).unwrap()); + + assert_eq_shards(&shards[10..], &buffer); + } + } +} + +#[test] +fn test_verify_with_buffer_gives_correct_parity_shards() { + let r = ReedSolomon::new(10, 3).unwrap(); + + for _ in 0..100 { + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + let slices_copy = slices.clone(); + + { + let mut slice_refs = convert_2D_slices!(slices=>to_mut_vec &mut [u8]); + + r.encode(&mut slice_refs).unwrap(); + } + + { + let mut buffer: [[u8; 100]; 3] = [[0; 100]; 3]; + + { + let slice_copy_refs = convert_2D_slices!(slices_copy =>to_vec &[u8]); + + for slice in buffer.iter_mut() { + fill_random(slice); + } + + let mut buffer_refs = convert_2D_slices!(buffer =>to_mut_vec &mut [u8]); + + assert!(!r + .verify_with_buffer(&slice_copy_refs, &mut buffer_refs) + .unwrap()); + } + + for a in 0..3 { + for b in 0..100 { + assert_eq!(slices[10 + a][b], buffer[a][b]); + } + } + } + + { + let mut buffer: [[u8; 100]; 3] = [[0; 100]; 3]; + + { + let slice_refs = convert_2D_slices!(slices=>to_vec &[u8]); + + for slice in buffer.iter_mut() { + fill_random(slice); + } + + let mut buffer_refs = convert_2D_slices!(buffer =>to_mut_vec &mut [u8]); + + assert!(r.verify_with_buffer(&slice_refs, &mut buffer_refs).unwrap()); + } + + for a in 0..3 { + for b in 0..100 { + assert_eq!(slices[10 + a][b], buffer[a][b]); + } + } + } + } +} + +#[test] +fn test_slices_or_shards_count_check() { + let r = ReedSolomon::new(3, 2).unwrap(); + + { + let mut shards = make_random_shards!(10, 4); + + assert_eq!(Error::TooFewShards, r.encode(&mut shards).unwrap_err()); + assert_eq!(Error::TooFewShards, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::TooFewShards, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } + { + let mut shards = make_random_shards!(10, 6); + + assert_eq!(Error::TooManyShards, r.encode(&mut shards).unwrap_err()); + assert_eq!(Error::TooManyShards, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::TooManyShards, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } +} + +#[test] +fn test_check_slices_or_shards_size() { + let r = ReedSolomon::new(2, 2).unwrap(); + + { + let mut shards = shards!([0, 0, 0], [0, 1], [1, 2, 3], [0, 0, 0]); + + assert_eq!( + Error::IncorrectShardSize, + r.encode(&mut shards).unwrap_err() + ); + assert_eq!(Error::IncorrectShardSize, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::IncorrectShardSize, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } + { + let mut shards = shards!([0, 1], [0, 1], [1, 2, 3], [0, 0, 0]); + + assert_eq!( + Error::IncorrectShardSize, + r.encode(&mut shards).unwrap_err() + ); + assert_eq!(Error::IncorrectShardSize, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::IncorrectShardSize, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } + { + let mut shards = shards!([0, 1], [0, 1, 4], [1, 2, 3], [0, 0, 0]); + + assert_eq!( + Error::IncorrectShardSize, + r.encode(&mut shards).unwrap_err() + ); + assert_eq!(Error::IncorrectShardSize, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::IncorrectShardSize, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } + { + let mut shards = shards!([], [0, 1, 3], [1, 2, 3], [0, 0, 0]); + + assert_eq!(Error::EmptyShard, r.encode(&mut shards).unwrap_err()); + assert_eq!(Error::EmptyShard, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::EmptyShard, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } + { + let mut option_shards: Vec>> = vec![None, None, None, None]; + + assert_eq!( + Error::TooFewShardsPresent, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } +} + +#[test] +fn shardbyshard_encode_correctly() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + let mut shards_copy = shards.clone(); + + r.encode(&mut shards).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut shards_copy).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!(shards, shards_copy); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + let mut slices_copy = slices.clone(); + + { + let mut slice_refs = convert_2D_slices!(slices=>to_mut_vec &mut [u8]); + let mut slice_copy_refs = convert_2D_slices!(slices_copy =>to_mut_vec &mut [u8]); + + r.encode(&mut slice_refs).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut slice_copy_refs).unwrap(); + } + } + + assert!(sbs.parity_ready()); + + for a in 0..13 { + for b in 0..100 { + assert_eq!(slices[a][b], slices_copy[a][b]); + } + } + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } +} + +quickcheck! { + fn qc_shardbyshard_encode_same_as_encode(data: usize, + parity: usize, + size: usize, + reuse: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let reuse = reuse % 10; + + let r = ReedSolomon::new(data, parity).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + for _ in 0..1 + reuse { + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + { + let mut slice_refs = + convert_2D_slices!(shards=>to_mut_vec &mut [u8]); + + for i in 0..data { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut slice_refs).unwrap(); + } + } + + if !(expect == shards + && sbs.parity_ready() + && sbs.cur_input_index() == data + && { sbs.reset().unwrap(); !sbs.parity_ready() && sbs.cur_input_index() == 0 }) { + return false; + } + } + + return true; + } + + fn qc_shardbyshard_encode_same_as_encode_shards(data: usize, + parity: usize, + size: usize, + reuse: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let reuse = reuse % 10; + + let r = ReedSolomon::new(data, parity).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + for _ in 0..1 + reuse { + for i in 0..data { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut shards).unwrap(); + } + + if !(expect == shards + && sbs.parity_ready() + && sbs.cur_input_index() == data + && { sbs.reset().unwrap(); !sbs.parity_ready() && sbs.cur_input_index() == 0 }) { + return false; + } + } + + return true; + } +} + +#[test] +fn shardbyshard_encode_sep_correctly() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + let mut shards_copy = shards.clone(); + + let (data, parity) = shards.split_at_mut(10); + let (data_copy, parity_copy) = shards_copy.split_at_mut(10); + + r.encode_sep(data, parity).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data_copy, parity_copy).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!(parity, parity_copy); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + let mut slices_copy = slices.clone(); + + { + let (data, parity) = slices.split_at_mut(10); + let (data_copy, parity_copy) = slices_copy.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + let data_copy_refs = convert_2D_slices!(data_copy =>to_mut_vec &[u8]); + let mut parity_copy_refs = convert_2D_slices!(parity_copy =>to_mut_vec &mut [u8]); + + r.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(&data_copy_refs, &mut parity_copy_refs) + .unwrap(); + } + } + + assert!(sbs.parity_ready()); + + for a in 0..13 { + for b in 0..100 { + assert_eq!(slices[a][b], slices_copy[a][b]); + } + } + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } +} + +quickcheck! { + fn qc_shardbyshard_encode_sep_same_as_encode(data: usize, + parity: usize, + size: usize, + reuse: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let reuse = reuse % 10; + + let r = ReedSolomon::new(data, parity).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + for _ in 0..1 + reuse { + { + let (data_shards, parity_shards) = + expect.split_at_mut(data); + + let data_refs = + convert_2D_slices!(data_shards =>to_mut_vec &[u8]); + let mut parity_refs = + convert_2D_slices!(parity_shards =>to_mut_vec &mut [u8]); + + r.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + + { + let (data_shards, parity_shards) = + shards.split_at_mut(data); + let data_refs = + convert_2D_slices!(data_shards =>to_mut_vec &[u8]); + let mut parity_refs = + convert_2D_slices!(parity_shards =>to_mut_vec &mut [u8]); + + for i in 0..data { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + } + + if !(expect == shards + && sbs.parity_ready() + && sbs.cur_input_index() == data + && { sbs.reset().unwrap(); !sbs.parity_ready() && sbs.cur_input_index() == 0 }) { + return false; + } + } + + return true; + } + + fn qc_shardbyshard_encode_sep_same_as_encode_shards(data: usize, + parity: usize, + size: usize, + reuse: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let reuse = reuse % 10; + + let r = ReedSolomon::new(data, parity).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + for _ in 0..1 + reuse { + { + let (data_shards, parity_shards) = + expect.split_at_mut(data); + + r.encode_sep(data_shards, parity_shards).unwrap(); + } + + { + let (data_shards, parity_shards) = + shards.split_at_mut(data); + + for i in 0..data { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data_shards, parity_shards).unwrap(); + } + } + + if !(expect == shards + && sbs.parity_ready() + && sbs.cur_input_index() == data + && { sbs.reset().unwrap(); !sbs.parity_ready() && sbs.cur_input_index() == 0 }) { + return false; + } + } + + return true; + } +} + +#[test] +fn shardbyshard_encode_correctly_more_rigorous() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + let mut shards_copy = make_random_shards!(10_000, 13); + + r.encode(&mut shards).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + shards_copy[i].clone_from_slice(&shards[i]); + sbs.encode(&mut shards_copy).unwrap(); + fill_random(&mut shards_copy[i]); + } + + assert!(sbs.parity_ready()); + + for i in 0..10 { + shards_copy[i].clone_from_slice(&shards[i]); + } + + assert_eq!(shards, shards_copy); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let mut slices_copy: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices_copy.iter_mut() { + fill_random(slice); + } + + { + let mut slice_refs = convert_2D_slices!(slices=>to_mut_vec &mut [u8]); + let mut slice_copy_refs = convert_2D_slices!(slices_copy =>to_mut_vec &mut [u8]); + + r.encode(&mut slice_refs).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + slice_copy_refs[i].clone_from_slice(&slice_refs[i]); + sbs.encode(&mut slice_copy_refs).unwrap(); + fill_random(&mut slice_copy_refs[i]); + } + } + + for i in 0..10 { + slices_copy[i].clone_from_slice(&slices[i]); + } + + assert!(sbs.parity_ready()); + + for a in 0..13 { + for b in 0..100 { + assert_eq!(slices[a][b], slices_copy[a][b]); + } + } + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } +} + +#[test] +fn shardbyshard_encode_error_handling() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + + let mut slice_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut slice_refs).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!( + SBSError::TooManyCalls, + sbs.encode(&mut slice_refs).unwrap_err() + ); + + sbs.reset().unwrap(); + + for i in 0..1 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut slice_refs).unwrap(); + } + + assert_eq!(SBSError::LeftoverShards, sbs.reset().unwrap_err()); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[0] = vec![]; + { + let mut slice_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode(&mut slice_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode(&mut slice_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[0] = vec![0; 100]; + + let mut slice_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + sbs.encode(&mut slice_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[1] = vec![0; 99]; + { + let mut slice_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode(&mut slice_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode(&mut slice_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[1] = vec![0; 100]; + + let mut slice_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + sbs.encode(&mut slice_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } +} + +#[test] +fn shardbyshard_encode_shard_error_handling() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut shards).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!(SBSError::TooManyCalls, sbs.encode(&mut shards).unwrap_err()); + + sbs.reset().unwrap(); + + for i in 0..1 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut shards).unwrap(); + } + + assert_eq!(SBSError::LeftoverShards, sbs.reset().unwrap_err()); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[0] = vec![]; + { + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode(&mut shards).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode(&mut shards).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[0] = vec![0; 100]; + + sbs.encode(&mut shards).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[1] = vec![0; 99]; + { + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode(&mut shards).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode(&mut shards).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[1] = vec![0; 100]; + + sbs.encode(&mut shards).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } +} + +#[test] +fn shardbyshard_encode_sep_error_handling() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + + let (data, parity) = shards.split_at_mut(10); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data, parity).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!( + SBSError::TooManyCalls, + sbs.encode_sep(data, parity).unwrap_err() + ); + + sbs.reset().unwrap(); + + for i in 0..1 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data, parity).unwrap(); + } + + assert_eq!(SBSError::LeftoverShards, sbs.reset().unwrap_err()); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + { + let (data, parity) = slices.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!( + SBSError::TooManyCalls, + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + sbs.reset().unwrap(); + + for i in 0..1 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + } + + assert_eq!(SBSError::LeftoverShards, sbs.reset().unwrap_err()); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[0] = vec![]; + + { + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[0] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[10] = vec![]; + { + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[10] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[1] = vec![0; 99]; + { + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[1] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[11] = vec![0; 99]; + { + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[11] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + } +} + +#[test] +fn shardbyshard_encode_shard_sep_error_handling() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + + let (data, parity) = shards.split_at_mut(10); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data, parity).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!( + SBSError::TooManyCalls, + sbs.encode_sep(data, parity).unwrap_err() + ); + + sbs.reset().unwrap(); + + for i in 0..1 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data, parity).unwrap(); + } + + assert_eq!(SBSError::LeftoverShards, sbs.reset().unwrap_err()); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[0] = vec![]; + + { + let (data, parity) = shards.split_at_mut(10); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[0] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + sbs.encode_sep(data, parity).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[10] = vec![]; + { + let (data, parity) = shards.split_at_mut(10); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[10] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + sbs.encode_sep(data, parity).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[1] = vec![0; 99]; + { + let (data, parity) = shards.split_at_mut(10); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[1] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + sbs.encode_sep(data, parity).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[11] = vec![0; 99]; + { + let (data, parity) = shards.split_at_mut(10); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[11] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + sbs.encode_sep(data, parity).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + } +} + +#[test] +fn test_encode_single_sep() { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut shards = make_random_shards!(10, 13); + let mut shards_copy = shards.clone(); + + r.encode(&mut shards).unwrap(); + + { + let (data, parity) = shards_copy.split_at_mut(10); + + for i in 0..10 { + r.encode_single_sep(i, &data[i], parity).unwrap(); + } + } + assert!(r.verify(&shards).unwrap()); + assert!(r.verify(&shards_copy).unwrap()); + + assert_eq_shards(&shards, &shards_copy); + } + { + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + let mut slices_copy = slices.clone(); + + { + let mut slice_refs = convert_2D_slices!(slices=>to_mut_vec &mut [u8]); + + let (data_copy, parity_copy) = slices_copy.split_at_mut(10); + + let data_copy_refs = convert_2D_slices!(data_copy =>to_mut_vec &[u8]); + let mut parity_copy_refs = convert_2D_slices!(parity_copy =>to_mut_vec &mut [u8]); + + r.encode(&mut slice_refs).unwrap(); + + for i in 0..10 { + r.encode_single_sep(i, &data_copy_refs[i], &mut parity_copy_refs) + .unwrap(); + } + } + + for a in 0..13 { + for b in 0..100 { + assert_eq!(slices[a][b], slices_copy[a][b]); + } + } + } +} + +#[test] +fn test_encode_sep() { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut shards = make_random_shards!(10_000, 13); + let mut shards_copy = shards.clone(); + + r.encode(&mut shards).unwrap(); + + { + let (data, parity) = shards_copy.split_at_mut(10); + + r.encode_sep(data, parity).unwrap(); + } + + assert_eq_shards(&shards, &shards_copy); + } + { + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + let mut slices_copy = slices.clone(); + + { + let (data_copy, parity_copy) = slices_copy.split_at_mut(10); + + let mut slice_refs = convert_2D_slices!(slices =>to_mut_vec &mut [u8]); + let data_copy_refs = convert_2D_slices!(data_copy =>to_mut_vec &[u8]); + let mut parity_copy_refs = convert_2D_slices!(parity_copy =>to_mut_vec &mut [u8]); + + r.encode(&mut slice_refs).unwrap(); + + r.encode_sep(&data_copy_refs, &mut parity_copy_refs) + .unwrap(); + } + + for a in 0..13 { + for b in 0..100 { + assert_eq!(slices[a][b], slices_copy[a][b]); + } + } + } +} + +#[test] +fn test_encode_single_sep_error_handling() { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut shards = make_random_shards!(1000, 13); + + { + let (data, parity) = shards.split_at_mut(10); + + for i in 0..10 { + r.encode_single_sep(i, &data[i], parity).unwrap(); + } + + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(10, &data[0], parity).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(11, &data[0], parity).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(12, &data[0], parity).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(13, &data[0], parity).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(14, &data[0], parity).unwrap_err() + ); + } + + { + let (data, parity) = shards.split_at_mut(11); + + assert_eq!( + Error::TooFewParityShards, + r.encode_single_sep(0, &data[0], parity).unwrap_err() + ); + } + { + let (data, parity) = shards.split_at_mut(9); + + assert_eq!( + Error::TooManyParityShards, + r.encode_single_sep(0, &data[0], parity).unwrap_err() + ); + } + } + { + let mut slices: [[u8; 1000]; 13] = [[0; 1000]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + { + let (data, parity) = slices.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + for i in 0..10 { + r.encode_single_sep(i, &data_refs[i], &mut parity_refs) + .unwrap(); + } + + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(10, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(11, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(12, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(13, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(14, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + } + { + let (data, parity) = slices.split_at_mut(11); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooFewParityShards, + r.encode_single_sep(0, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + } + { + let (data, parity) = slices.split_at_mut(9); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooManyParityShards, + r.encode_single_sep(0, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + } + } +} + +#[test] +fn test_encode_sep_error_handling() { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut shards = make_random_shards!(1000, 13); + + let (data, parity) = shards.split_at_mut(10); + + r.encode_sep(data, parity).unwrap(); + + { + let mut shards = make_random_shards!(1000, 12); + let (data, parity) = shards.split_at_mut(9); + + assert_eq!( + Error::TooFewDataShards, + r.encode_sep(data, parity).unwrap_err() + ); + } + { + let mut shards = make_random_shards!(1000, 14); + let (data, parity) = shards.split_at_mut(11); + + assert_eq!( + Error::TooManyDataShards, + r.encode_sep(data, parity).unwrap_err() + ); + } + { + let mut shards = make_random_shards!(1000, 12); + let (data, parity) = shards.split_at_mut(10); + + assert_eq!( + Error::TooFewParityShards, + r.encode_sep(data, parity).unwrap_err() + ); + } + { + let mut shards = make_random_shards!(1000, 14); + let (data, parity) = shards.split_at_mut(10); + + assert_eq!( + Error::TooManyParityShards, + r.encode_sep(data, parity).unwrap_err() + ); + } + } + { + let mut slices: [[u8; 1000]; 13] = [[0; 1000]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let (data, parity) = slices.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + r.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + { + let mut slices: [[u8; 1000]; 12] = [[0; 1000]; 12]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let (data, parity) = slices.split_at_mut(9); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooFewDataShards, + r.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + } + { + let mut slices: [[u8; 1000]; 14] = [[0; 1000]; 14]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let (data, parity) = slices.split_at_mut(11); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooManyDataShards, + r.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + } + { + let mut slices: [[u8; 1000]; 12] = [[0; 1000]; 12]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let (data, parity) = slices.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooFewParityShards, + r.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + } + { + let mut slices: [[u8; 1000]; 14] = [[0; 1000]; 14]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let (data, parity) = slices.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooManyParityShards, + r.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + } + } +} + +#[test] +fn test_encode_single_error_handling() { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut shards = make_random_shards!(1000, 13); + + for i in 0..10 { + r.encode_single(i, &mut shards).unwrap(); + } + + assert_eq!( + Error::InvalidIndex, + r.encode_single(10, &mut shards).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(11, &mut shards).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(12, &mut shards).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(13, &mut shards).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(14, &mut shards).unwrap_err() + ); + } + { + let mut slices: [[u8; 1000]; 13] = [[0; 1000]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let mut slice_refs = convert_2D_slices!(slices=>to_mut_vec &mut [u8]); + + for i in 0..10 { + r.encode_single(i, &mut slice_refs).unwrap(); + } + + assert_eq!( + Error::InvalidIndex, + r.encode_single(10, &mut slice_refs).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(11, &mut slice_refs).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(12, &mut slice_refs).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(13, &mut slice_refs).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(14, &mut slice_refs).unwrap_err() + ); + } +} diff --git a/test/s3/normal/s3_integration_test.go b/test/s3/normal/s3_integration_test.go index 2f9f325c0..6abab8849 100644 --- a/test/s3/normal/s3_integration_test.go +++ b/test/s3/normal/s3_integration_test.go @@ -10,6 +10,7 @@ import ( "net" "net/http" "os" + "os/exec" "path/filepath" "strconv" "sync" @@ -24,6 +25,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" "github.com/seaweedfs/seaweedfs/weed/command" "github.com/seaweedfs/seaweedfs/weed/glog" flag "github.com/seaweedfs/seaweedfs/weed/util/fla9" @@ -37,18 +39,19 @@ const ( // TestCluster manages the weed mini instance for integration testing type TestCluster struct { - dataDir string - ctx context.Context - cancel context.CancelFunc - s3Client *s3.S3 - isRunning bool - startOnce sync.Once - wg sync.WaitGroup - masterPort int - volumePort int - filerPort int - s3Port int - s3Endpoint string + dataDir string + ctx context.Context + cancel context.CancelFunc + s3Client *s3.S3 + isRunning bool + startOnce sync.Once + wg sync.WaitGroup + masterPort int + volumePort int + filerPort int + s3Port int + s3Endpoint string + rustVolumeCmd *exec.Cmd } // TestS3Integration demonstrates basic S3 operations against a running weed mini instance @@ -236,6 +239,14 @@ func startMiniCluster(t *testing.T, extraArgs ...string) (*TestCluster, error) { return nil, fmt.Errorf("S3 service failed to start: %v", err) } + // If VOLUME_SERVER_IMPL=rust, start a Rust volume server alongside weed mini + if os.Getenv("VOLUME_SERVER_IMPL") == "rust" { + if err := cluster.startRustVolumeServer(t); err != nil { + cancel() + return nil, fmt.Errorf("failed to start Rust volume server: %v", err) + } + } + cluster.isRunning = true // Create S3 client @@ -257,8 +268,82 @@ func startMiniCluster(t *testing.T, extraArgs ...string) (*TestCluster, error) { return cluster, nil } +// startRustVolumeServer starts a Rust volume server that registers with the same master. +func (c *TestCluster) startRustVolumeServer(t *testing.T) error { + t.Helper() + + rustBinary, err := framework.FindOrBuildRustBinary() + if err != nil { + return fmt.Errorf("resolve rust volume binary: %v", err) + } + + rustVolumePort, err := findAvailablePort() + if err != nil { + return fmt.Errorf("find rust volume port: %v", err) + } + rustVolumeGrpcPort, err := findAvailablePort() + if err != nil { + return fmt.Errorf("find rust volume grpc port: %v", err) + } + + rustVolumeDir := filepath.Join(c.dataDir, "rust-volume") + if err := os.MkdirAll(rustVolumeDir, 0o755); err != nil { + return fmt.Errorf("create rust volume dir: %v", err) + } + + securityToml := filepath.Join(c.dataDir, "security.toml") + + args := []string{ + "--port", strconv.Itoa(rustVolumePort), + "--port.grpc", strconv.Itoa(rustVolumeGrpcPort), + "--port.public", strconv.Itoa(rustVolumePort), + "--ip", "127.0.0.1", + "--ip.bind", "127.0.0.1", + "--dir", rustVolumeDir, + "--max", "16", + "--master", "127.0.0.1:" + strconv.Itoa(c.masterPort), + "--securityFile", securityToml, + "--preStopSeconds", "0", + } + + logFile, err := os.Create(filepath.Join(c.dataDir, "rust-volume.log")) + if err != nil { + return fmt.Errorf("create rust volume log: %v", err) + } + + c.rustVolumeCmd = exec.Command(rustBinary, args...) + c.rustVolumeCmd.Dir = c.dataDir + c.rustVolumeCmd.Stdout = logFile + c.rustVolumeCmd.Stderr = logFile + if err := c.rustVolumeCmd.Start(); err != nil { + logFile.Close() + return fmt.Errorf("start rust volume: %v", err) + } + + // Wait for the Rust volume server to be ready + rustEndpoint := fmt.Sprintf("http://127.0.0.1:%d/healthz", rustVolumePort) + deadline := time.Now().Add(15 * time.Second) + client := &http.Client{Timeout: 1 * time.Second} + for time.Now().Before(deadline) { + resp, err := client.Get(rustEndpoint) + if err == nil { + resp.Body.Close() + t.Logf("Rust volume server ready on port %d (grpc %d)", rustVolumePort, rustVolumeGrpcPort) + return nil + } + time.Sleep(200 * time.Millisecond) + } + return fmt.Errorf("rust volume server not ready after 15s (port %d)", rustVolumePort) +} + // Stop stops the test cluster func (c *TestCluster) Stop() { + // Stop Rust volume server first + if c.rustVolumeCmd != nil && c.rustVolumeCmd.Process != nil { + c.rustVolumeCmd.Process.Kill() + c.rustVolumeCmd.Wait() + } + if c.cancel != nil { c.cancel() } diff --git a/test/s3/policy/policy_test.go b/test/s3/policy/policy_test.go index 07092e04f..8c97f4a58 100644 --- a/test/s3/policy/policy_test.go +++ b/test/s3/policy/policy_test.go @@ -21,6 +21,7 @@ import ( "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/iam" "github.com/aws/aws-sdk-go/service/s3" + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" "github.com/seaweedfs/seaweedfs/weed/command" "github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/pb" @@ -42,6 +43,7 @@ type TestCluster struct { filerGrpcPort int s3Port int s3Endpoint string + rustVolumeCmd *exec.Cmd } func TestS3PolicyShellRevised(t *testing.T) { @@ -822,6 +824,15 @@ enabled = true cancel() return nil, err } + + // If VOLUME_SERVER_IMPL=rust, start a Rust volume server alongside weed mini + if os.Getenv("VOLUME_SERVER_IMPL") == "rust" { + if err := cluster.startRustVolumeServer(t); err != nil { + cancel() + return nil, fmt.Errorf("failed to start Rust volume server: %v", err) + } + } + cluster.isRunning = true return cluster, nil } @@ -840,7 +851,80 @@ func waitForS3Ready(endpoint string, timeout time.Duration) error { return fmt.Errorf("timeout waiting for S3") } +// startRustVolumeServer starts a Rust volume server that registers with the same master. +func (c *TestCluster) startRustVolumeServer(t *testing.T) error { + t.Helper() + + rustBinary, err := framework.FindOrBuildRustBinary() + if err != nil { + return fmt.Errorf("resolve rust volume binary: %v", err) + } + + rustVolumePort, err := findAvailablePort() + if err != nil { + return fmt.Errorf("find rust volume port: %v", err) + } + rustVolumeGrpcPort, err := findAvailablePort() + if err != nil { + return fmt.Errorf("find rust volume grpc port: %v", err) + } + + rustVolumeDir := filepath.Join(c.dataDir, "rust-volume") + if err := os.MkdirAll(rustVolumeDir, 0o755); err != nil { + return fmt.Errorf("create rust volume dir: %v", err) + } + + securityToml := filepath.Join(c.dataDir, "security.toml") + + args := []string{ + "--port", strconv.Itoa(rustVolumePort), + "--port.grpc", strconv.Itoa(rustVolumeGrpcPort), + "--port.public", strconv.Itoa(rustVolumePort), + "--ip", "127.0.0.1", + "--ip.bind", "127.0.0.1", + "--dir", rustVolumeDir, + "--max", "16", + "--master", "127.0.0.1:" + strconv.Itoa(c.masterPort), + "--securityFile", securityToml, + "--preStopSeconds", "0", + } + + logFile, err := os.Create(filepath.Join(c.dataDir, "rust-volume.log")) + if err != nil { + return fmt.Errorf("create rust volume log: %v", err) + } + + c.rustVolumeCmd = exec.Command(rustBinary, args...) + c.rustVolumeCmd.Dir = c.dataDir + c.rustVolumeCmd.Stdout = logFile + c.rustVolumeCmd.Stderr = logFile + if err := c.rustVolumeCmd.Start(); err != nil { + logFile.Close() + return fmt.Errorf("start rust volume: %v", err) + } + + rustEndpoint := fmt.Sprintf("http://127.0.0.1:%d/healthz", rustVolumePort) + deadline := time.Now().Add(15 * time.Second) + client := &http.Client{Timeout: 1 * time.Second} + for time.Now().Before(deadline) { + resp, err := client.Get(rustEndpoint) + if err == nil { + resp.Body.Close() + t.Logf("Rust volume server ready on port %d (grpc %d)", rustVolumePort, rustVolumeGrpcPort) + return nil + } + time.Sleep(200 * time.Millisecond) + } + return fmt.Errorf("rust volume server not ready after 15s (port %d)", rustVolumePort) +} + func (c *TestCluster) Stop() { + // Stop Rust volume server first + if c.rustVolumeCmd != nil && c.rustVolumeCmd.Process != nil { + c.rustVolumeCmd.Process.Kill() + c.rustVolumeCmd.Wait() + } + if c.cancel != nil { c.cancel() } diff --git a/test/volume_server/framework/cluster.go b/test/volume_server/framework/cluster.go index 4bb1b55d5..1f9d30740 100644 --- a/test/volume_server/framework/cluster.go +++ b/test/volume_server/framework/cluster.go @@ -27,6 +27,12 @@ const ( testVolumeSizeLimitMB = 32 ) +var ( + weedBinaryOnce sync.Once + weedBinaryPath string + weedBinaryErr error +) + // Cluster is a lightweight SeaweedFS master + one volume server test harness. type Cluster struct { testingTB testing.TB @@ -326,6 +332,13 @@ func writeSecurityConfig(configDir string, profile matrix.Profile) error { b.WriteString("\"\n") b.WriteString("expires_after_seconds = 60\n") } + if profile.EnableUIAccess { + if b.Len() > 0 { + b.WriteString("\n") + } + b.WriteString("[access]\n") + b.WriteString("ui = true\n") + } if b.Len() == 0 { b.WriteString("# optional security config generated for integration tests\n") } @@ -341,40 +354,43 @@ func FindOrBuildWeedBinary() (string, error) { return "", fmt.Errorf("WEED_BINARY is set but not executable: %s", fromEnv) } - repoRoot := "" - if _, file, _, ok := runtime.Caller(0); ok { - repoRoot = filepath.Clean(filepath.Join(filepath.Dir(file), "..", "..", "..")) - candidate := filepath.Join(repoRoot, "weed", "weed") - if isExecutableFile(candidate) { - return candidate, nil + weedBinaryOnce.Do(func() { + repoRoot := "" + if _, file, _, ok := runtime.Caller(0); ok { + repoRoot = filepath.Clean(filepath.Join(filepath.Dir(file), "..", "..", "..")) + } + if repoRoot == "" { + weedBinaryErr = errors.New("unable to detect repository root") + return } - } - if repoRoot == "" { - return "", errors.New("unable to detect repository root") - } + binDir := filepath.Join(os.TempDir(), "seaweedfs_volume_server_it_bin") + if err := os.MkdirAll(binDir, 0o755); err != nil { + weedBinaryErr = fmt.Errorf("create binary directory %s: %w", binDir, err) + return + } + binPath := filepath.Join(binDir, "weed") - binDir := filepath.Join(os.TempDir(), "seaweedfs_volume_server_it_bin") - if err := os.MkdirAll(binDir, 0o755); err != nil { - return "", fmt.Errorf("create binary directory %s: %w", binDir, err) - } - binPath := filepath.Join(binDir, "weed") - if isExecutableFile(binPath) { - return binPath, nil - } + cmd := exec.Command("go", "build", "-o", binPath, ".") + cmd.Dir = filepath.Join(repoRoot, "weed") + var out bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &out + if err := cmd.Run(); err != nil { + weedBinaryErr = fmt.Errorf("build weed binary: %w\n%s", err, out.String()) + return + } + if !isExecutableFile(binPath) { + weedBinaryErr = fmt.Errorf("built weed binary is not executable: %s", binPath) + return + } + weedBinaryPath = binPath + }) - cmd := exec.Command("go", "build", "-o", binPath, ".") - cmd.Dir = filepath.Join(repoRoot, "weed") - var out bytes.Buffer - cmd.Stdout = &out - cmd.Stderr = &out - if err := cmd.Run(); err != nil { - return "", fmt.Errorf("build weed binary: %w\n%s", err, out.String()) + if weedBinaryErr != nil { + return "", weedBinaryErr } - if !isExecutableFile(binPath) { - return "", fmt.Errorf("built weed binary is not executable: %s", binPath) - } - return binPath, nil + return weedBinaryPath, nil } func isExecutableFile(path string) bool { diff --git a/test/volume_server/framework/cluster_dual.go b/test/volume_server/framework/cluster_dual.go index b068419c0..1e3a2554c 100644 --- a/test/volume_server/framework/cluster_dual.go +++ b/test/volume_server/framework/cluster_dual.go @@ -11,7 +11,7 @@ import ( type DualVolumeCluster = MultiVolumeCluster // StartDualVolumeCluster starts a cluster with 2 volume servers. -// Deprecated: Use StartMultiVolumeCluster(t, profile, 2) directly. -func StartDualVolumeCluster(t testing.TB, profile matrix.Profile) *DualVolumeCluster { - return StartMultiVolumeCluster(t, profile, 2) +// Deprecated: Use StartMultiVolumeClusterAuto(t, profile, 2) directly. +func StartDualVolumeCluster(t testing.TB, profile matrix.Profile) MultiCluster { + return StartMultiVolumeClusterAuto(t, profile, 2) } diff --git a/test/volume_server/framework/cluster_interface.go b/test/volume_server/framework/cluster_interface.go new file mode 100644 index 000000000..875e66675 --- /dev/null +++ b/test/volume_server/framework/cluster_interface.go @@ -0,0 +1,63 @@ +package framework + +import ( + "os" + "testing" + + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" +) + +// TestCluster is the common interface for single-volume cluster harnesses. +// Both *Cluster (Go volume) and *RustCluster (Rust volume) satisfy it. +type TestCluster interface { + MasterAddress() string + VolumeAdminAddress() string + VolumePublicAddress() string + VolumeGRPCAddress() string + VolumeServerAddress() string + MasterURL() string + VolumeAdminURL() string + VolumePublicURL() string + BaseDir() string + Stop() +} + +func useRustVolumeServer() bool { + return os.Getenv("VOLUME_SERVER_IMPL") == "rust" +} + +// StartVolumeCluster starts a single-volume cluster using either the Go or +// Rust volume server, depending on the VOLUME_SERVER_IMPL environment variable. +// Set VOLUME_SERVER_IMPL=rust to use the Rust volume server. +func StartVolumeCluster(t testing.TB, profile matrix.Profile) TestCluster { + t.Helper() + if useRustVolumeServer() { + return StartRustVolumeCluster(t, profile) + } + return StartSingleVolumeCluster(t, profile) +} + +// MultiCluster is the common interface for multi-volume cluster harnesses. +// Both *MultiVolumeCluster (Go) and *RustMultiVolumeCluster (Rust) satisfy it. +type MultiCluster interface { + MasterAddress() string + MasterURL() string + BaseDir() string + VolumeAdminAddress(index int) string + VolumeAdminURL(index int) string + VolumePublicAddress(index int) string + VolumePublicURL(index int) string + VolumeGRPCAddress(index int) string + Stop() +} + +// StartMultiVolumeClusterAuto starts a multi-volume cluster using either Go or +// Rust volume servers, depending on the VOLUME_SERVER_IMPL environment variable. +// Set VOLUME_SERVER_IMPL=rust to use Rust volume servers. +func StartMultiVolumeClusterAuto(t testing.TB, profile matrix.Profile, count int) MultiCluster { + t.Helper() + if useRustVolumeServer() { + return StartRustMultiVolumeCluster(t, profile, count) + } + return StartMultiVolumeCluster(t, profile, count) +} diff --git a/test/volume_server/framework/cluster_interface_test.go b/test/volume_server/framework/cluster_interface_test.go new file mode 100644 index 000000000..58dceaf56 --- /dev/null +++ b/test/volume_server/framework/cluster_interface_test.go @@ -0,0 +1,20 @@ +package framework + +import "testing" + +func TestUseRustVolumeServer(t *testing.T) { + t.Setenv("VOLUME_SERVER_IMPL", "rust") + if !useRustVolumeServer() { + t.Fatalf("expected rust selection when VOLUME_SERVER_IMPL=rust") + } + + t.Setenv("VOLUME_SERVER_IMPL", "go") + if useRustVolumeServer() { + t.Fatalf("expected go selection when VOLUME_SERVER_IMPL=go") + } + + t.Setenv("VOLUME_SERVER_IMPL", "") + if useRustVolumeServer() { + t.Fatalf("expected go selection when VOLUME_SERVER_IMPL is unset") + } +} diff --git a/test/volume_server/framework/cluster_multi_rust.go b/test/volume_server/framework/cluster_multi_rust.go new file mode 100644 index 000000000..45b9572ae --- /dev/null +++ b/test/volume_server/framework/cluster_multi_rust.go @@ -0,0 +1,289 @@ +package framework + +import ( + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "strconv" + "sync" + "testing" + + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" +) + +// RustMultiVolumeCluster wraps a Go master + multiple Rust volume servers +// for integration testing. It mirrors MultiVolumeCluster but uses the Rust +// volume binary instead of the Go weed binary for volume servers. +type RustMultiVolumeCluster struct { + testingTB testing.TB + profile matrix.Profile + + weedBinary string // Go weed binary (for the master) + rustVolumeBinary string // Rust volume binary + + baseDir string + configDir string + logsDir string + keepLogs bool + volumeServerCount int + + masterPort int + masterGrpcPort int + + volumePorts []int + volumeGrpcPorts []int + volumePubPorts []int + + masterCmd *exec.Cmd + volumeCmds []*exec.Cmd + + cleanupOnce sync.Once +} + +// StartRustMultiVolumeCluster starts a cluster with a Go master and the +// specified number of Rust volume servers. +func StartRustMultiVolumeCluster(t testing.TB, profile matrix.Profile, serverCount int) *RustMultiVolumeCluster { + t.Helper() + + if serverCount < 1 { + t.Fatalf("serverCount must be at least 1, got %d", serverCount) + } + + weedBinary, err := FindOrBuildWeedBinary() + if err != nil { + t.Fatalf("resolve weed binary: %v", err) + } + + rustBinary, err := FindOrBuildRustBinary() + if err != nil { + t.Fatalf("resolve rust volume binary: %v", err) + } + + baseDir, keepLogs, err := newWorkDir() + if err != nil { + t.Fatalf("create temp test directory: %v", err) + } + + configDir := filepath.Join(baseDir, "config") + logsDir := filepath.Join(baseDir, "logs") + masterDataDir := filepath.Join(baseDir, "master") + + // Create directories for master and all volume servers + dirs := []string{configDir, logsDir, masterDataDir} + for i := 0; i < serverCount; i++ { + dirs = append(dirs, filepath.Join(baseDir, fmt.Sprintf("volume%d", i))) + } + for _, dir := range dirs { + if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil { + t.Fatalf("create %s: %v", dir, mkErr) + } + } + + if err = writeSecurityConfig(configDir, profile); err != nil { + t.Fatalf("write security config: %v", err) + } + + masterPort, masterGrpcPort, err := allocateMasterPortPair() + if err != nil { + t.Fatalf("allocate master port pair: %v", err) + } + + // Allocate ports for all volume servers (3 ports per server: admin, grpc, public) + // If SplitPublicPort is true, we need an additional port per server + portsPerServer := 3 + if profile.SplitPublicPort { + portsPerServer = 4 + } + totalPorts := serverCount * portsPerServer + ports, err := allocatePorts(totalPorts) + if err != nil { + t.Fatalf("allocate volume ports: %v", err) + } + + c := &RustMultiVolumeCluster{ + testingTB: t, + profile: profile, + weedBinary: weedBinary, + rustVolumeBinary: rustBinary, + baseDir: baseDir, + configDir: configDir, + logsDir: logsDir, + keepLogs: keepLogs, + volumeServerCount: serverCount, + masterPort: masterPort, + masterGrpcPort: masterGrpcPort, + volumePorts: make([]int, serverCount), + volumeGrpcPorts: make([]int, serverCount), + volumePubPorts: make([]int, serverCount), + volumeCmds: make([]*exec.Cmd, serverCount), + } + + // Assign ports to each volume server + for i := 0; i < serverCount; i++ { + baseIdx := i * portsPerServer + c.volumePorts[i] = ports[baseIdx] + c.volumeGrpcPorts[i] = ports[baseIdx+1] + + // Assign public port, using baseIdx+3 if SplitPublicPort, else baseIdx+2 + pubPortIdx := baseIdx + 2 + if profile.SplitPublicPort { + pubPortIdx = baseIdx + 3 + } + c.volumePubPorts[i] = ports[pubPortIdx] + } + + // Start master (Go) + if err = c.startMaster(masterDataDir); err != nil { + c.Stop() + t.Fatalf("start master: %v", err) + } + helper := &Cluster{logsDir: logsDir} + if err = helper.waitForHTTP(c.MasterURL() + "/dir/status"); err != nil { + masterLog := helper.tailLog("master.log") + c.Stop() + t.Fatalf("wait for master readiness: %v\nmaster log tail:\n%s", err, masterLog) + } + + // Start all Rust volume servers + for i := 0; i < serverCount; i++ { + volumeDataDir := filepath.Join(baseDir, fmt.Sprintf("volume%d", i)) + if err = c.startRustVolume(i, volumeDataDir); err != nil { + volumeLog := fmt.Sprintf("volume%d.log", i) + c.Stop() + t.Fatalf("start rust volume server %d: %v\nvolume log tail:\n%s", i, err, helper.tailLog(volumeLog)) + } + if err = helper.waitForHTTP(c.VolumeAdminURL(i) + "/healthz"); err != nil { + volumeLog := fmt.Sprintf("volume%d.log", i) + c.Stop() + t.Fatalf("wait for rust volume server %d readiness: %v\nvolume log tail:\n%s", i, err, helper.tailLog(volumeLog)) + } + if err = helper.waitForTCP(c.VolumeGRPCAddress(i)); err != nil { + volumeLog := fmt.Sprintf("volume%d.log", i) + c.Stop() + t.Fatalf("wait for rust volume server %d grpc readiness: %v\nvolume log tail:\n%s", i, err, helper.tailLog(volumeLog)) + } + } + + t.Cleanup(func() { + c.Stop() + }) + + return c +} + +func (c *RustMultiVolumeCluster) Stop() { + if c == nil { + return + } + c.cleanupOnce.Do(func() { + // Stop volume servers in reverse order + for i := len(c.volumeCmds) - 1; i >= 0; i-- { + stopProcess(c.volumeCmds[i]) + } + stopProcess(c.masterCmd) + if !c.keepLogs && !c.testingTB.Failed() { + _ = os.RemoveAll(c.baseDir) + } else if c.baseDir != "" { + c.testingTB.Logf("rust multi volume server integration logs kept at %s", c.baseDir) + } + }) +} + +func (c *RustMultiVolumeCluster) startMaster(dataDir string) error { + logFile, err := os.Create(filepath.Join(c.logsDir, "master.log")) + if err != nil { + return err + } + + args := []string{ + "-config_dir=" + c.configDir, + "master", + "-ip=127.0.0.1", + "-port=" + strconv.Itoa(c.masterPort), + "-port.grpc=" + strconv.Itoa(c.masterGrpcPort), + "-mdir=" + dataDir, + "-peers=none", + "-volumeSizeLimitMB=" + strconv.Itoa(testVolumeSizeLimitMB), + "-defaultReplication=000", + } + + c.masterCmd = exec.Command(c.weedBinary, args...) + c.masterCmd.Dir = c.baseDir + c.masterCmd.Stdout = logFile + c.masterCmd.Stderr = logFile + return c.masterCmd.Start() +} + +func (c *RustMultiVolumeCluster) startRustVolume(index int, dataDir string) error { + logName := fmt.Sprintf("volume%d.log", index) + logFile, err := os.Create(filepath.Join(c.logsDir, logName)) + if err != nil { + return err + } + + args := rustVolumeArgs( + c.profile, + c.configDir, + c.masterPort, + c.volumePorts[index], + c.volumeGrpcPorts[index], + c.volumePubPorts[index], + dataDir, + ) + + cmd := exec.Command(c.rustVolumeBinary, args...) + cmd.Dir = c.baseDir + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err = cmd.Start(); err != nil { + return err + } + c.volumeCmds[index] = cmd + return nil +} + +// --- accessor methods (mirror MultiVolumeCluster) --- + +func (c *RustMultiVolumeCluster) MasterAddress() string { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(c.masterPort)) +} + +func (c *RustMultiVolumeCluster) MasterURL() string { + return "http://" + c.MasterAddress() +} + +func (c *RustMultiVolumeCluster) VolumeAdminAddress(index int) string { + if index < 0 || index >= len(c.volumePorts) { + return "" + } + return net.JoinHostPort("127.0.0.1", strconv.Itoa(c.volumePorts[index])) +} + +func (c *RustMultiVolumeCluster) VolumePublicAddress(index int) string { + if index < 0 || index >= len(c.volumePubPorts) { + return "" + } + return net.JoinHostPort("127.0.0.1", strconv.Itoa(c.volumePubPorts[index])) +} + +func (c *RustMultiVolumeCluster) VolumeGRPCAddress(index int) string { + if index < 0 || index >= len(c.volumeGrpcPorts) { + return "" + } + return net.JoinHostPort("127.0.0.1", strconv.Itoa(c.volumeGrpcPorts[index])) +} + +func (c *RustMultiVolumeCluster) VolumeAdminURL(index int) string { + return "http://" + c.VolumeAdminAddress(index) +} + +func (c *RustMultiVolumeCluster) VolumePublicURL(index int) string { + return "http://" + c.VolumePublicAddress(index) +} + +func (c *RustMultiVolumeCluster) BaseDir() string { + return c.baseDir +} diff --git a/test/volume_server/framework/cluster_rust.go b/test/volume_server/framework/cluster_rust.go new file mode 100644 index 000000000..5d5f56a14 --- /dev/null +++ b/test/volume_server/framework/cluster_rust.go @@ -0,0 +1,342 @@ +package framework + +import ( + "bytes" + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "sync" + "testing" + + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" +) + +// RustCluster wraps a Go master + Rust volume server for integration testing. +type RustCluster struct { + testingTB testing.TB + profile matrix.Profile + + weedBinary string // Go weed binary (for the master) + rustVolumeBinary string // Rust volume binary + + baseDir string + configDir string + logsDir string + keepLogs bool + + masterPort int + masterGrpcPort int + volumePort int + volumeGrpcPort int + volumePubPort int + + masterCmd *exec.Cmd + volumeCmd *exec.Cmd + + cleanupOnce sync.Once +} + +var ( + rustBinaryOnce sync.Once + rustBinaryPath string + rustBinaryErr error +) + +// StartRustVolumeCluster starts a Go master + Rust volume server. +func StartRustVolumeCluster(t testing.TB, profile matrix.Profile) *RustCluster { + t.Helper() + + weedBinary, err := FindOrBuildWeedBinary() + if err != nil { + t.Fatalf("resolve weed binary: %v", err) + } + + rustBinary, err := FindOrBuildRustBinary() + if err != nil { + t.Fatalf("resolve rust volume binary: %v", err) + } + + baseDir, keepLogs, err := newWorkDir() + if err != nil { + t.Fatalf("create temp test directory: %v", err) + } + + configDir := filepath.Join(baseDir, "config") + logsDir := filepath.Join(baseDir, "logs") + masterDataDir := filepath.Join(baseDir, "master") + volumeDataDir := filepath.Join(baseDir, "volume") + for _, dir := range []string{configDir, logsDir, masterDataDir, volumeDataDir} { + if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil { + t.Fatalf("create %s: %v", dir, mkErr) + } + } + + if err = writeSecurityConfig(configDir, profile); err != nil { + t.Fatalf("write security config: %v", err) + } + + masterPort, masterGrpcPort, err := allocateMasterPortPair() + if err != nil { + t.Fatalf("allocate master port pair: %v", err) + } + + ports, err := allocatePorts(3) + if err != nil { + t.Fatalf("allocate ports: %v", err) + } + + rc := &RustCluster{ + testingTB: t, + profile: profile, + weedBinary: weedBinary, + rustVolumeBinary: rustBinary, + baseDir: baseDir, + configDir: configDir, + logsDir: logsDir, + keepLogs: keepLogs, + masterPort: masterPort, + masterGrpcPort: masterGrpcPort, + volumePort: ports[0], + volumeGrpcPort: ports[1], + volumePubPort: ports[0], + } + if profile.SplitPublicPort { + rc.volumePubPort = ports[2] + } + + if err = rc.startMaster(masterDataDir); err != nil { + rc.Stop() + t.Fatalf("start master: %v", err) + } + // Reuse the same HTTP readiness helper via an unexported Cluster shim. + helper := &Cluster{logsDir: logsDir} + if err = helper.waitForHTTP(rc.MasterURL() + "/dir/status"); err != nil { + masterLog := helper.tailLog("master.log") + rc.Stop() + t.Fatalf("wait for master readiness: %v\nmaster log tail:\n%s", err, masterLog) + } + + if err = rc.startRustVolume(volumeDataDir); err != nil { + masterLog := helper.tailLog("master.log") + rc.Stop() + t.Fatalf("start rust volume: %v\nmaster log tail:\n%s", err, masterLog) + } + if err = helper.waitForHTTP(rc.VolumeAdminURL() + "/healthz"); err != nil { + volumeLog := helper.tailLog("volume.log") + rc.Stop() + t.Fatalf("wait for rust volume readiness: %v\nvolume log tail:\n%s", err, volumeLog) + } + if err = helper.waitForTCP(rc.VolumeGRPCAddress()); err != nil { + volumeLog := helper.tailLog("volume.log") + rc.Stop() + t.Fatalf("wait for rust volume grpc readiness: %v\nvolume log tail:\n%s", err, volumeLog) + } + + t.Cleanup(func() { + rc.Stop() + }) + + return rc +} + +// Stop terminates all processes and cleans temporary files. +func (rc *RustCluster) Stop() { + if rc == nil { + return + } + rc.cleanupOnce.Do(func() { + stopProcess(rc.volumeCmd) + stopProcess(rc.masterCmd) + if !rc.keepLogs && !rc.testingTB.Failed() { + _ = os.RemoveAll(rc.baseDir) + } else if rc.baseDir != "" { + rc.testingTB.Logf("rust volume server integration logs kept at %s", rc.baseDir) + } + }) +} + +func (rc *RustCluster) startMaster(dataDir string) error { + logFile, err := os.Create(filepath.Join(rc.logsDir, "master.log")) + if err != nil { + return err + } + + args := []string{ + "-config_dir=" + rc.configDir, + "master", + "-ip=127.0.0.1", + "-port=" + strconv.Itoa(rc.masterPort), + "-port.grpc=" + strconv.Itoa(rc.masterGrpcPort), + "-mdir=" + dataDir, + "-peers=none", + "-volumeSizeLimitMB=" + strconv.Itoa(testVolumeSizeLimitMB), + "-defaultReplication=000", + } + + rc.masterCmd = exec.Command(rc.weedBinary, args...) + rc.masterCmd.Dir = rc.baseDir + rc.masterCmd.Stdout = logFile + rc.masterCmd.Stderr = logFile + return rc.masterCmd.Start() +} + +func rustVolumeArgs( + profile matrix.Profile, + configDir string, + masterPort int, + volumePort int, + volumeGrpcPort int, + volumePubPort int, + dataDir string, +) []string { + args := []string{ + "--port", strconv.Itoa(volumePort), + "--port.grpc", strconv.Itoa(volumeGrpcPort), + "--port.public", strconv.Itoa(volumePubPort), + "--ip", "127.0.0.1", + "--ip.bind", "127.0.0.1", + "--dir", dataDir, + "--max", "16", + "--master", "127.0.0.1:" + strconv.Itoa(masterPort), + "--securityFile", filepath.Join(configDir, "security.toml"), + "--readMode", profile.ReadMode, + "--concurrentUploadLimitMB", strconv.Itoa(profile.ConcurrentUploadLimitMB), + "--concurrentDownloadLimitMB", strconv.Itoa(profile.ConcurrentDownloadLimitMB), + "--preStopSeconds", "0", + } + if profile.InflightUploadTimeout > 0 { + args = append(args, "--inflightUploadDataTimeout", profile.InflightUploadTimeout.String()) + } + if profile.InflightDownloadTimeout > 0 { + args = append(args, "--inflightDownloadDataTimeout", profile.InflightDownloadTimeout.String()) + } + return args +} + +func (rc *RustCluster) startRustVolume(dataDir string) error { + logFile, err := os.Create(filepath.Join(rc.logsDir, "volume.log")) + if err != nil { + return err + } + + args := rustVolumeArgs( + rc.profile, + rc.configDir, + rc.masterPort, + rc.volumePort, + rc.volumeGrpcPort, + rc.volumePubPort, + dataDir, + ) + + rc.volumeCmd = exec.Command(rc.rustVolumeBinary, args...) + rc.volumeCmd.Dir = rc.baseDir + rc.volumeCmd.Stdout = logFile + rc.volumeCmd.Stderr = logFile + return rc.volumeCmd.Start() +} + +// FindOrBuildRustBinary returns an executable Rust volume binary, building one when needed. +func FindOrBuildRustBinary() (string, error) { + if fromEnv := os.Getenv("RUST_VOLUME_BINARY"); fromEnv != "" { + if isExecutableFile(fromEnv) { + return fromEnv, nil + } + return "", fmt.Errorf("RUST_VOLUME_BINARY is set but not executable: %s", fromEnv) + } + + rustBinaryOnce.Do(func() { + // Derive the Rust volume crate directory from this source file's location. + rustCrateDir := "" + if _, file, _, ok := runtime.Caller(0); ok { + repoRoot := filepath.Clean(filepath.Join(filepath.Dir(file), "..", "..", "..")) + for _, candidate := range []string{"seaweed-volume", "weed-volume"} { + dir := filepath.Join(repoRoot, candidate) + if isDir(dir) && isFile(filepath.Join(dir, "Cargo.toml")) { + rustCrateDir = dir + break + } + } + } + if rustCrateDir == "" { + rustBinaryErr = fmt.Errorf("unable to detect Rust volume crate directory") + return + } + + releaseBin := filepath.Join(rustCrateDir, "target", "release", "weed-volume") + + // Always rebuild once per test process so the harness uses current source and features. + cmd := exec.Command("cargo", "build", "--release") + cmd.Dir = rustCrateDir + var out bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &out + if err := cmd.Run(); err != nil { + rustBinaryErr = fmt.Errorf("build rust volume binary: %w\n%s", err, out.String()) + return + } + if !isExecutableFile(releaseBin) { + rustBinaryErr = fmt.Errorf("built rust volume binary is not executable: %s", releaseBin) + return + } + rustBinaryPath = releaseBin + }) + + if rustBinaryErr != nil { + return "", rustBinaryErr + } + return rustBinaryPath, nil +} + +func isDir(path string) bool { + info, err := os.Stat(path) + return err == nil && info.IsDir() +} + +func isFile(path string) bool { + info, err := os.Stat(path) + return err == nil && info.Mode().IsRegular() +} + +// --- accessor methods (mirror Cluster) --- + +func (rc *RustCluster) MasterAddress() string { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(rc.masterPort)) +} + +func (rc *RustCluster) VolumeAdminAddress() string { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(rc.volumePort)) +} + +func (rc *RustCluster) VolumePublicAddress() string { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(rc.volumePubPort)) +} + +func (rc *RustCluster) VolumeGRPCAddress() string { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(rc.volumeGrpcPort)) +} + +// VolumeServerAddress returns SeaweedFS server address format: ip:httpPort.grpcPort +func (rc *RustCluster) VolumeServerAddress() string { + return fmt.Sprintf("%s.%d", rc.VolumeAdminAddress(), rc.volumeGrpcPort) +} + +func (rc *RustCluster) MasterURL() string { + return "http://" + rc.MasterAddress() +} + +func (rc *RustCluster) VolumeAdminURL() string { + return "http://" + rc.VolumeAdminAddress() +} + +func (rc *RustCluster) VolumePublicURL() string { + return "http://" + rc.VolumePublicAddress() +} + +func (rc *RustCluster) BaseDir() string { + return rc.baseDir +} diff --git a/test/volume_server/framework/cluster_rust_test.go b/test/volume_server/framework/cluster_rust_test.go new file mode 100644 index 000000000..f2558753a --- /dev/null +++ b/test/volume_server/framework/cluster_rust_test.go @@ -0,0 +1,38 @@ +package framework + +import ( + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" +) + +func TestRustVolumeArgsIncludeReadMode(t *testing.T) { + profile := matrix.P1() + profile.ReadMode = "redirect" + profile.ConcurrentUploadLimitMB = 7 + profile.ConcurrentDownloadLimitMB = 9 + profile.InflightUploadTimeout = 3 * time.Second + profile.InflightDownloadTimeout = 4 * time.Second + + args := rustVolumeArgs(profile, "/tmp/config", 9333, 18080, 28080, 38080, "/tmp/data") + + assertArgPair(t, args, "--readMode", "redirect") + assertArgPair(t, args, "--concurrentUploadLimitMB", "7") + assertArgPair(t, args, "--concurrentDownloadLimitMB", "9") + assertArgPair(t, args, "--inflightUploadDataTimeout", "3s") + assertArgPair(t, args, "--inflightDownloadDataTimeout", "4s") +} + +func assertArgPair(t *testing.T, args []string, flag string, want string) { + t.Helper() + for i := 0; i+1 < len(args); i += 2 { + if args[i] == flag { + if args[i+1] != want { + t.Fatalf("%s value mismatch: got %q want %q", flag, args[i+1], want) + } + return + } + } + t.Fatalf("missing %s in args: %v", flag, args) +} diff --git a/test/volume_server/grpc/admin_extra_test.go b/test/volume_server/grpc/admin_extra_test.go index de62fcdb8..85afa1ade 100644 --- a/test/volume_server/grpc/admin_extra_test.go +++ b/test/volume_server/grpc/admin_extra_test.go @@ -2,6 +2,7 @@ package volume_server_grpc_test import ( "context" + "io" "net/http" "strings" "testing" @@ -18,7 +19,7 @@ func TestVolumeNeedleStatusForUploadedFile(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -57,12 +58,57 @@ func TestVolumeNeedleStatusForUploadedFile(t *testing.T) { } } +func TestVolumeNeedleStatusIncludesTtlAndLastModified(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(27) + const needleID = uint64(778901) + const cookie = uint32(0xA1B2C3D6) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, needleID, cookie) + client := framework.NewHTTPClient() + uploadReq := mustNewRequest(t, http.MethodPost, clusterHarness.VolumeAdminURL()+"/"+fid+"?ttl=7d&ts=1700000000") + uploadReq.Body = io.NopCloser(strings.NewReader("needle-status-ttl-payload")) + uploadReq.ContentLength = int64(len("needle-status-ttl-payload")) + uploadReq.Header.Set("Content-Type", "application/octet-stream") + uploadResp := framework.DoRequest(t, client, uploadReq) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload status: expected 201, got %d", uploadResp.StatusCode) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + statusResp, err := grpcClient.VolumeNeedleStatus(ctx, &volume_server_pb.VolumeNeedleStatusRequest{ + VolumeId: volumeID, + NeedleId: needleID, + }) + if err != nil { + t.Fatalf("VolumeNeedleStatus with ttl failed: %v", err) + } + // Go's ReadTTL normalizes via fitTtlCount: 7d → 1w (7 days = 1 week) + if statusResp.GetTtl() != "1w" { + t.Fatalf("ttl mismatch: got %q want %q", statusResp.GetTtl(), "1w") + } + if statusResp.GetLastModified() != 1700000000 { + t.Fatalf("last modified mismatch: got %d want %d", statusResp.GetLastModified(), 1700000000) + } +} + func TestVolumeNeedleStatusViaEcShardsWhenNormalVolumeUnmounted(t *testing.T) { if testing.Short() { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -138,7 +184,7 @@ func TestVolumeNeedleStatusMissingVolumeAndNeedle(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -185,7 +231,7 @@ func TestVolumeConfigureInvalidReplication(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -215,7 +261,7 @@ func TestVolumeConfigureSuccessAndMissingRollbackPath(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -268,7 +314,7 @@ func TestPingVolumeTargetAndLeaveAffectsHealthz(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -311,7 +357,7 @@ func TestVolumeServerLeaveIsIdempotent(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -346,7 +392,7 @@ func TestPingUnknownAndUnreachableTargetPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -367,6 +413,17 @@ func TestPingUnknownAndUnreachableTargetPaths(t *testing.T) { t.Fatalf("Ping unknown target type expected stop_time_ns >= start_time_ns") } + emptyTargetResp, err := grpcClient.Ping(ctx, &volume_server_pb.PingRequest{}) + if err != nil { + t.Fatalf("Ping empty target should not return grpc error, got: %v", err) + } + if emptyTargetResp.GetRemoteTimeNs() != 0 { + t.Fatalf("Ping empty target expected remote_time_ns=0, got %d", emptyTargetResp.GetRemoteTimeNs()) + } + if emptyTargetResp.GetStopTimeNs() < emptyTargetResp.GetStartTimeNs() { + t.Fatalf("Ping empty target expected stop_time_ns >= start_time_ns") + } + _, err = grpcClient.Ping(ctx, &volume_server_pb.PingRequest{ TargetType: cluster.MasterType, Target: "127.0.0.1:1", @@ -395,7 +452,7 @@ func TestPingMasterTargetSuccess(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/admin_lifecycle_test.go b/test/volume_server/grpc/admin_lifecycle_test.go index bdc4e5a45..7ec5d64ef 100644 --- a/test/volume_server/grpc/admin_lifecycle_test.go +++ b/test/volume_server/grpc/admin_lifecycle_test.go @@ -19,7 +19,7 @@ func TestVolumeAdminLifecycleRPCs(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -62,7 +62,7 @@ func TestVolumeDeleteOnlyEmptyVariants(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -101,7 +101,7 @@ func TestMaintenanceModeRejectsAllocateVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -133,7 +133,7 @@ func TestAllocateDuplicateAndMountUnmountMissingVariants(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -184,7 +184,7 @@ func TestMaintenanceModeRejectsVolumeDelete(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/admin_readonly_collection_test.go b/test/volume_server/grpc/admin_readonly_collection_test.go index 36d2d2f31..5f9679f2e 100644 --- a/test/volume_server/grpc/admin_readonly_collection_test.go +++ b/test/volume_server/grpc/admin_readonly_collection_test.go @@ -16,7 +16,7 @@ func TestVolumeMarkReadonlyAndWritableLifecycle(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -61,7 +61,7 @@ func TestVolumeMarkReadonlyPersistTrue(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -93,7 +93,7 @@ func TestVolumeMarkReadonlyWritableErrorPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -146,7 +146,7 @@ func TestDeleteCollectionRemovesVolumeAndIsIdempotent(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/batch_delete_test.go b/test/volume_server/grpc/batch_delete_test.go index b02d4ea27..4fc822fee 100644 --- a/test/volume_server/grpc/batch_delete_test.go +++ b/test/volume_server/grpc/batch_delete_test.go @@ -18,7 +18,7 @@ func TestBatchDeleteInvalidFidAndMaintenanceMode(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -61,7 +61,7 @@ func TestBatchDeleteCookieMismatchAndSkipCheck(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -124,7 +124,7 @@ func TestBatchDeleteMixedStatusesAndMismatchStopsProcessing(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -219,7 +219,7 @@ func TestBatchDeleteRejectsChunkManifestNeedles(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/copy_receive_variants_test.go b/test/volume_server/grpc/copy_receive_variants_test.go index 14d9cee72..3a82822b2 100644 --- a/test/volume_server/grpc/copy_receive_variants_test.go +++ b/test/volume_server/grpc/copy_receive_variants_test.go @@ -4,6 +4,7 @@ import ( "context" "io" "math" + "net/http" "strings" "testing" "time" @@ -18,7 +19,7 @@ func TestVolumeIncrementalCopyDataAndNoDataPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -77,7 +78,7 @@ func TestCopyFileIgnoreNotFoundAndStopOffsetZeroPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -132,12 +133,70 @@ func TestCopyFileIgnoreNotFoundAndStopOffsetZeroPaths(t *testing.T) { } } +func TestCopyFileStopOffsetZeroExistingFileSendsMetadata(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(93) + const needleID = uint64(770101) + const cookie = uint32(0x1234ABCD) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + client := framework.NewHTTPClient() + uploadResp := framework.UploadBytes( + t, + client, + clusterHarness.VolumeAdminURL(), + framework.NewFileID(volumeID, needleID, cookie), + []byte("copy-file-stop-zero"), + ) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + stream, err := grpcClient.CopyFile(ctx, &volume_server_pb.CopyFileRequest{ + VolumeId: volumeID, + Ext: ".dat", + CompactionRevision: math.MaxUint32, + StopOffset: 0, + IgnoreSourceFileNotFound: false, + }) + if err != nil { + t.Fatalf("CopyFile stop_offset=0 existing file start failed: %v", err) + } + + msg, err := stream.Recv() + if err != nil { + t.Fatalf("CopyFile stop_offset=0 existing file recv failed: %v", err) + } + if len(msg.GetFileContent()) != 0 { + t.Fatalf("CopyFile stop_offset=0 existing file should not send content, got %d bytes", len(msg.GetFileContent())) + } + if msg.GetModifiedTsNs() == 0 { + t.Fatalf("CopyFile stop_offset=0 existing file expected non-zero ModifiedTsNs") + } + + _, err = stream.Recv() + if err != io.EOF { + t.Fatalf("CopyFile stop_offset=0 existing file expected EOF after metadata frame, got: %v", err) + } +} + func TestCopyFileCompactionRevisionMismatch(t *testing.T) { if testing.Short() { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -166,7 +225,7 @@ func TestReceiveFileProtocolViolationResponses(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -213,7 +272,7 @@ func TestReceiveFileSuccessForRegularVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -299,7 +358,7 @@ func TestReceiveFileSuccessForEcVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -389,7 +448,7 @@ func TestCopyFileEcVolumeIgnoreMissingSourcePaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/copy_sync_test.go b/test/volume_server/grpc/copy_sync_test.go index 3c2916fd0..810395fb6 100644 --- a/test/volume_server/grpc/copy_sync_test.go +++ b/test/volume_server/grpc/copy_sync_test.go @@ -18,12 +18,19 @@ func TestVolumeSyncStatusAndReadVolumeFileStatus(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() + httpClient := framework.NewHTTPClient() const volumeID = uint32(41) framework.AllocateVolume(t, grpcClient, volumeID, "") + fid := framework.NewFileID(volumeID, 1, 0x11112222) + uploadResp := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), fid, []byte("sync-status-payload")) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -46,6 +53,12 @@ func TestVolumeSyncStatusAndReadVolumeFileStatus(t *testing.T) { if statusResp.GetVersion() == 0 { t.Fatalf("ReadVolumeFileStatus expected non-zero version") } + if syncResp.GetTailOffset() == 0 { + t.Fatalf("VolumeSyncStatus expected non-zero tail offset after upload") + } + if syncResp.GetTailOffset() != statusResp.GetDatFileSize() { + t.Fatalf("VolumeSyncStatus tail offset mismatch: got %d want %d", syncResp.GetTailOffset(), statusResp.GetDatFileSize()) + } } func TestCopyAndStreamMethodsMissingVolumePaths(t *testing.T) { @@ -53,7 +66,7 @@ func TestCopyAndStreamMethodsMissingVolumePaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -100,7 +113,7 @@ func TestVolumeCopyAndReceiveFileMaintenanceRejection(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/data_rw_test.go b/test/volume_server/grpc/data_rw_test.go index 43969532d..b7701b9b2 100644 --- a/test/volume_server/grpc/data_rw_test.go +++ b/test/volume_server/grpc/data_rw_test.go @@ -16,7 +16,7 @@ func TestReadNeedleBlobAndMetaMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -54,7 +54,7 @@ func TestWriteNeedleBlobMaintenanceAndMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -104,7 +104,7 @@ func TestReadNeedleBlobAndMetaInvalidOffsets(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/data_stream_success_test.go b/test/volume_server/grpc/data_stream_success_test.go index 90f2a8248..4297068a3 100644 --- a/test/volume_server/grpc/data_stream_success_test.go +++ b/test/volume_server/grpc/data_stream_success_test.go @@ -3,6 +3,8 @@ package volume_server_grpc_test import ( "context" "io" + "net/http" + "reflect" "strings" "testing" "time" @@ -19,7 +21,7 @@ func TestReadWriteNeedleBlobAndMetaRoundTrip(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -122,7 +124,7 @@ func TestReadAllNeedlesStreamsUploadedRecords(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -180,7 +182,7 @@ func TestReadAllNeedlesExistingThenMissingVolumeAbortsStream(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -230,6 +232,139 @@ func TestReadAllNeedlesExistingThenMissingVolumeAbortsStream(t *testing.T) { } } +func TestReadAllNeedlesPreservesDatOrderAcrossOverwrite(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(86) + const firstNeedleID = uint64(444551) + const secondNeedleID = uint64(444552) + const firstCookie = uint32(0xAA22BB33) + const secondCookie = uint32(0xCC44DD55) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + client := framework.NewHTTPClient() + uploads := []struct { + fid string + body string + }{ + {fid: framework.NewFileID(volumeID, firstNeedleID, firstCookie), body: "read-all-first"}, + {fid: framework.NewFileID(volumeID, secondNeedleID, secondCookie), body: "read-all-second"}, + {fid: framework.NewFileID(volumeID, firstNeedleID, firstCookie), body: "read-all-first-overwrite"}, + } + for _, upload := range uploads { + resp := framework.UploadBytes(t, client, clusterHarness.VolumeAdminURL(), upload.fid, []byte(upload.body)) + _ = framework.ReadAllAndClose(t, resp) + if resp.StatusCode != 201 { + t.Fatalf("upload for %s expected 201, got %d", upload.fid, resp.StatusCode) + } + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + stream, err := grpcClient.ReadAllNeedles(ctx, &volume_server_pb.ReadAllNeedlesRequest{VolumeIds: []uint32{volumeID}}) + if err != nil { + t.Fatalf("ReadAllNeedles start failed: %v", err) + } + + var orderedIDs []uint64 + var orderedBodies []string + for { + msg, recvErr := stream.Recv() + if recvErr == io.EOF { + break + } + if recvErr != nil { + t.Fatalf("ReadAllNeedles recv failed: %v", recvErr) + } + orderedIDs = append(orderedIDs, msg.GetNeedleId()) + orderedBodies = append(orderedBodies, string(msg.GetNeedleBlob())) + } + + wantIDs := []uint64{secondNeedleID, firstNeedleID} + wantBodies := []string{"read-all-second", "read-all-first-overwrite"} + if !reflect.DeepEqual(orderedIDs, wantIDs) { + t.Fatalf("ReadAllNeedles order mismatch: got %v want %v", orderedIDs, wantIDs) + } + if !reflect.DeepEqual(orderedBodies, wantBodies) { + t.Fatalf("ReadAllNeedles bodies mismatch: got %v want %v", orderedBodies, wantBodies) + } +} + +func TestReadNeedleMetaDeletedEntryUsesTombstoneMetadata(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(87) + const needleID = uint64(444661) + const cookie = uint32(0xAB12CD34) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + client := framework.NewHTTPClient() + fid := framework.NewFileID(volumeID, needleID, cookie) + uploadResp := framework.UploadBytes(t, client, clusterHarness.VolumeAdminURL(), fid, []byte("read-meta-delete")) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + deleteReq, err := http.NewRequest(http.MethodDelete, clusterHarness.VolumeAdminURL()+"/"+fid, nil) + if err != nil { + t.Fatalf("build delete request: %v", err) + } + deleteResp := framework.DoRequest(t, client, deleteReq) + _ = framework.ReadAllAndClose(t, deleteResp) + if deleteResp.StatusCode != http.StatusAccepted { + t.Fatalf("delete expected 202, got %d", deleteResp.StatusCode) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + fileStatus, err := grpcClient.ReadVolumeFileStatus(ctx, &volume_server_pb.ReadVolumeFileStatusRequest{VolumeId: volumeID}) + if err != nil { + t.Fatalf("ReadVolumeFileStatus after delete failed: %v", err) + } + + idxBytes := copyFileBytes(t, grpcClient, &volume_server_pb.CopyFileRequest{ + VolumeId: volumeID, + Ext: ".idx", + CompactionRevision: fileStatus.GetCompactionRevision(), + StopOffset: fileStatus.GetIdxFileSize(), + }) + offset, size := findLastNeedleOffsetAndSize(t, idxBytes, needleID) + if size >= 0 { + t.Fatalf("expected deleted idx entry for needle %d, got size %d", needleID, size) + } + + metaResp, err := grpcClient.ReadNeedleMeta(ctx, &volume_server_pb.ReadNeedleMetaRequest{ + VolumeId: volumeID, + NeedleId: needleID, + Offset: offset, + Size: size, + }) + if err != nil { + t.Fatalf("ReadNeedleMeta deleted-entry failed: %v", err) + } + if metaResp.GetCookie() != cookie { + t.Fatalf("ReadNeedleMeta deleted-entry cookie mismatch: got %d want %d", metaResp.GetCookie(), cookie) + } + if metaResp.GetAppendAtNs() == 0 { + t.Fatalf("ReadNeedleMeta deleted-entry expected non-zero append_at_ns") + } +} + func copyFileBytes(t testing.TB, grpcClient volume_server_pb.VolumeServerClient, req *volume_server_pb.CopyFileRequest) []byte { t.Helper() @@ -271,3 +406,23 @@ func findNeedleOffsetAndSize(t testing.TB, idxBytes []byte, needleID uint64) (of t.Fatalf("needle id %d not found in idx entries", needleID) return 0, 0 } + +func findLastNeedleOffsetAndSize(t testing.TB, idxBytes []byte, needleID uint64) (offset int64, size int32) { + t.Helper() + + found := false + for i := 0; i+types.NeedleMapEntrySize <= len(idxBytes); i += types.NeedleMapEntrySize { + key, entryOffset, entrySize := idx.IdxFileEntry(idxBytes[i : i+types.NeedleMapEntrySize]) + if uint64(key) != needleID { + continue + } + found = true + offset = entryOffset.ToActualOffset() + size = int32(entrySize) + } + + if !found { + t.Fatalf("needle id %d not found in idx entries", needleID) + } + return offset, size +} diff --git a/test/volume_server/grpc/erasure_coding_test.go b/test/volume_server/grpc/erasure_coding_test.go index 8a0d8f75f..f5852c6f3 100644 --- a/test/volume_server/grpc/erasure_coding_test.go +++ b/test/volume_server/grpc/erasure_coding_test.go @@ -23,7 +23,7 @@ func TestEcMaintenanceModeRejections(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -92,7 +92,7 @@ func TestEcMissingInvalidAndNoopPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -199,7 +199,7 @@ func TestEcGenerateMountInfoUnmountLifecycle(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -268,7 +268,7 @@ func TestEcShardReadAndBlobDeleteLifecycle(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -369,7 +369,7 @@ func TestEcRebuildMissingShardLifecycle(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -449,7 +449,7 @@ func TestEcShardsToVolumeMissingShardAndNoLiveEntries(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -560,7 +560,7 @@ func TestEcShardsToVolumeSuccessRoundTrip(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -621,7 +621,7 @@ func TestEcShardsDeleteLastShardRemovesEcx(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -757,7 +757,7 @@ func TestEcShardsCopyFailsWhenSourceUnavailable(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/fetch_remote_s3_test.go b/test/volume_server/grpc/fetch_remote_s3_test.go new file mode 100644 index 000000000..bd1c94cbc --- /dev/null +++ b/test/volume_server/grpc/fetch_remote_s3_test.go @@ -0,0 +1,288 @@ +package volume_server_grpc_test + +import ( + "bytes" + "context" + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" + "github.com/seaweedfs/seaweedfs/weed/pb/remote_pb" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" +) + +// findAvailablePort finds a free TCP port on localhost. +func findAvailablePort() (int, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return 0, err + } + port := l.Addr().(*net.TCPAddr).Port + l.Close() + return port, nil +} + +// waitForPort waits until a TCP port is listening, up to timeout. +func waitForPort(port int, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + conn, err := net.DialTimeout("tcp", fmt.Sprintf("127.0.0.1:%d", port), 500*time.Millisecond) + if err == nil { + conn.Close() + return nil + } + time.Sleep(200 * time.Millisecond) + } + return fmt.Errorf("port %d not listening after %v", port, timeout) +} + +// startWeedMini starts a weed mini subprocess and returns the S3 endpoint and cleanup func. +func startWeedMini(t *testing.T) (s3Endpoint string, cleanup func()) { + t.Helper() + + weedBin, err := exec.LookPath("weed") + if err != nil { + weedBin = filepath.Join("..", "..", "..", "weed", "weed_binary") + if _, err := os.Stat(weedBin); os.IsNotExist(err) { + t.Skip("weed binary not found, skipping S3 remote storage test") + } + } + + miniMasterPort, _ := findAvailablePort() + miniVolumePort, _ := findAvailablePort() + miniFilerPort, _ := findAvailablePort() + miniS3Port, _ := findAvailablePort() + miniDir := t.TempDir() + os.WriteFile(filepath.Join(miniDir, "security.toml"), []byte("# empty\n"), 0644) + + ctx, cancel := context.WithCancel(context.Background()) + + miniCmd := exec.CommandContext(ctx, weedBin, "mini", + fmt.Sprintf("-dir=%s", miniDir), + fmt.Sprintf("-master.port=%d", miniMasterPort), + fmt.Sprintf("-volume.port=%d", miniVolumePort), + fmt.Sprintf("-filer.port=%d", miniFilerPort), + fmt.Sprintf("-s3.port=%d", miniS3Port), + ) + miniCmd.Env = append(os.Environ(), "AWS_ACCESS_KEY_ID=admin", "AWS_SECRET_ACCESS_KEY=admin") + miniCmd.Dir = miniDir + logFile, _ := os.CreateTemp("", "weed-mini-*.log") + miniCmd.Stdout = logFile + miniCmd.Stderr = logFile + t.Logf("weed mini logs at %s", logFile.Name()) + + if err := miniCmd.Start(); err != nil { + cancel() + logFile.Close() + t.Fatalf("start weed mini: %v", err) + } + + if err := waitForPort(miniS3Port, 30*time.Second); err != nil { + cancel() + miniCmd.Wait() + logFile.Close() + t.Fatalf("weed mini S3 not ready: %v", err) + } + t.Logf("weed mini S3 ready on port %d", miniS3Port) + + return fmt.Sprintf("http://127.0.0.1:%d", miniS3Port), func() { + cancel() + miniCmd.Wait() + logFile.Close() + } +} + +func newS3Client(endpoint string) *s3.S3 { + sess, _ := session.NewSession(&aws.Config{ + Region: aws.String("us-east-1"), + Endpoint: aws.String(endpoint), + Credentials: credentials.NewStaticCredentials("admin", "admin", ""), + DisableSSL: aws.Bool(true), + S3ForcePathStyle: aws.Bool(true), + }) + return s3.New(sess) +} + +// TestFetchAndWriteNeedleFromS3 tests the full FetchAndWriteNeedle flow: +// 1. Start a weed mini instance as S3 backend +// 2. Upload a test object to it via S3 API +// 3. Call FetchAndWriteNeedle on the volume server to fetch from S3 +// 4. Verify the response contains a valid e_tag +func TestFetchAndWriteNeedleFromS3(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + s3Endpoint, cleanupMini := startWeedMini(t) + defer cleanupMini() + + s3Client := newS3Client(s3Endpoint) + + // Create bucket and upload test data + bucket := "test-remote-fetch" + s3Client.CreateBucket(&s3.CreateBucketInput{Bucket: aws.String(bucket)}) + + testData := []byte("Hello from S3 remote storage! This is test data for FetchAndWriteNeedle.") + testKey := "test-object.dat" + _, err := s3Client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(testKey), + Body: bytes.NewReader(testData), + }) + if err != nil { + t.Fatalf("put object: %v", err) + } + t.Logf("uploaded %d bytes to s3://%s/%s", len(testData), bucket, testKey) + + // Start volume server + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(99) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + grpcCtx, grpcCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer grpcCancel() + + // FetchAndWriteNeedle from S3 + resp, err := grpcClient.FetchAndWriteNeedle(grpcCtx, &volume_server_pb.FetchAndWriteNeedleRequest{ + VolumeId: volumeID, + NeedleId: 42, + Cookie: 12345, + Offset: 0, + Size: int64(len(testData)), + RemoteConf: &remote_pb.RemoteConf{ + Name: "test-s3", + Type: "s3", + S3AccessKey: "admin", + S3SecretKey: "admin", + S3Region: "us-east-1", + S3Endpoint: s3Endpoint, + S3ForcePathStyle: true, + }, + RemoteLocation: &remote_pb.RemoteStorageLocation{ + Name: "test-s3", + Bucket: bucket, + Path: "/" + testKey, + }, + }) + if err != nil { + t.Fatalf("FetchAndWriteNeedle failed: %v", err) + } + if resp.GetETag() == "" { + t.Fatal("FetchAndWriteNeedle returned empty e_tag") + } + t.Logf("FetchAndWriteNeedle success: e_tag=%s", resp.GetETag()) +} + +// TestFetchAndWriteNeedleFromS3WithPartialRead tests reading a byte range from S3. +func TestFetchAndWriteNeedleFromS3WithPartialRead(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + s3Endpoint, cleanupMini := startWeedMini(t) + defer cleanupMini() + + s3Client := newS3Client(s3Endpoint) + + bucket := "partial-read-test" + s3Client.CreateBucket(&s3.CreateBucketInput{Bucket: aws.String(bucket)}) + + // Upload 1KB of data + fullData := make([]byte, 1024) + for i := range fullData { + fullData[i] = byte(i % 256) + } + s3Client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), Key: aws.String("big.dat"), + Body: bytes.NewReader(fullData), + }) + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + framework.AllocateVolume(t, grpcClient, 98, "") + + grpcCtx, grpcCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer grpcCancel() + + // Fetch only bytes 100-199 (100 bytes) from the 1KB object + resp, err := grpcClient.FetchAndWriteNeedle(grpcCtx, &volume_server_pb.FetchAndWriteNeedleRequest{ + VolumeId: 98, NeedleId: 7, Cookie: 999, + Offset: 100, Size: 100, + RemoteConf: &remote_pb.RemoteConf{ + Name: "test-s3-partial", Type: "s3", + S3AccessKey: "admin", S3SecretKey: "admin", + S3Region: "us-east-1", S3Endpoint: s3Endpoint, S3ForcePathStyle: true, + }, + RemoteLocation: &remote_pb.RemoteStorageLocation{ + Name: "test-s3-partial", Bucket: bucket, Path: "/big.dat", + }, + }) + if err != nil { + t.Fatalf("FetchAndWriteNeedle partial read failed: %v", err) + } + if resp.GetETag() == "" { + t.Fatal("empty e_tag for partial read") + } + t.Logf("FetchAndWriteNeedle partial read success: e_tag=%s", resp.GetETag()) +} + +// TestFetchAndWriteNeedleS3NotFound tests that fetching a non-existent S3 object returns an error. +func TestFetchAndWriteNeedleS3NotFound(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + s3Endpoint, cleanupMini := startWeedMini(t) + defer cleanupMini() + + s3Client := newS3Client(s3Endpoint) + + bucket := "notfound-test" + s3Client.CreateBucket(&s3.CreateBucketInput{Bucket: aws.String(bucket)}) + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + framework.AllocateVolume(t, grpcClient, 97, "") + + grpcCtx, grpcCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer grpcCancel() + + _, err := grpcClient.FetchAndWriteNeedle(grpcCtx, &volume_server_pb.FetchAndWriteNeedleRequest{ + VolumeId: 97, NeedleId: 1, Cookie: 1, + Offset: 0, Size: 100, + RemoteConf: &remote_pb.RemoteConf{ + Name: "test-s3-nf", Type: "s3", + S3AccessKey: "admin", S3SecretKey: "admin", + S3Region: "us-east-1", S3Endpoint: s3Endpoint, S3ForcePathStyle: true, + }, + RemoteLocation: &remote_pb.RemoteStorageLocation{ + Name: "test-s3-nf", Bucket: bucket, Path: "/does-not-exist.dat", + }, + }) + if err == nil { + t.Fatal("FetchAndWriteNeedle should fail for non-existent object") + } + if !strings.Contains(err.Error(), "read from remote") { + t.Fatalf("expected 'read from remote' error, got: %v", err) + } + t.Logf("correctly got error for non-existent object: %v", err) +} diff --git a/test/volume_server/grpc/health_state_test.go b/test/volume_server/grpc/health_state_test.go index cac40731b..16f4627e6 100644 --- a/test/volume_server/grpc/health_state_test.go +++ b/test/volume_server/grpc/health_state_test.go @@ -16,7 +16,7 @@ func TestStateAndStatusRPCs(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -94,7 +94,7 @@ func TestSetStateVersionMismatchAndNilStateNoop(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/move_tail_timestamp_test.go b/test/volume_server/grpc/move_tail_timestamp_test.go index 8d5e01a47..32068079e 100644 --- a/test/volume_server/grpc/move_tail_timestamp_test.go +++ b/test/volume_server/grpc/move_tail_timestamp_test.go @@ -29,7 +29,7 @@ func TestVolumeCopyReturnsPreciseLastAppendTimestamp(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartDualVolumeCluster(t, matrix.P1()) + cluster := framework.StartMultiVolumeClusterAuto(t, matrix.P1(), 2) sourceConn, sourceClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress(0)) defer sourceConn.Close() destConn, destClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress(1)) @@ -156,7 +156,7 @@ func TestVolumeMoveHandlesInFlightWrites(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartDualVolumeCluster(t, matrix.P1()) + cluster := framework.StartMultiVolumeClusterAuto(t, matrix.P1(), 2) sourceConn, sourceClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress(0)) defer sourceConn.Close() destConn, destClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress(1)) diff --git a/test/volume_server/grpc/production_features_test.go b/test/volume_server/grpc/production_features_test.go new file mode 100644 index 000000000..7bc28cb75 --- /dev/null +++ b/test/volume_server/grpc/production_features_test.go @@ -0,0 +1,338 @@ +package volume_server_grpc_test + +import ( + "context" + "io" + "net/http" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" + "github.com/seaweedfs/seaweedfs/weed/storage/idx" + "github.com/seaweedfs/seaweedfs/weed/storage/types" +) + +func TestScrubVolumeDetectsHealthyData(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(101) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + httpClient := framework.NewHTTPClient() + needles := []struct { + needleID uint64 + cookie uint32 + body string + }{ + {needleID: 1010001, cookie: 0xAA000001, body: "scrub-healthy-needle-one"}, + {needleID: 1010002, cookie: 0xAA000002, body: "scrub-healthy-needle-two"}, + {needleID: 1010003, cookie: 0xAA000003, body: "scrub-healthy-needle-three"}, + } + for _, n := range needles { + fid := framework.NewFileID(volumeID, n.needleID, n.cookie) + uploadResp := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), fid, []byte(n.body)) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload needle %d expected 201, got %d", n.needleID, uploadResp.StatusCode) + } + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + scrubResp, err := grpcClient.ScrubVolume(ctx, &volume_server_pb.ScrubVolumeRequest{ + VolumeIds: []uint32{volumeID}, + Mode: volume_server_pb.VolumeScrubMode_FULL, + }) + if err != nil { + t.Fatalf("ScrubVolume FULL mode failed: %v", err) + } + if scrubResp.GetTotalVolumes() != 1 { + t.Fatalf("ScrubVolume expected total_volumes=1, got %d", scrubResp.GetTotalVolumes()) + } + if scrubResp.GetTotalFiles() < 3 { + t.Fatalf("ScrubVolume expected total_files >= 3, got %d", scrubResp.GetTotalFiles()) + } + if len(scrubResp.GetBrokenVolumeIds()) != 0 { + t.Fatalf("ScrubVolume expected no broken volumes for healthy data, got %v: %v", scrubResp.GetBrokenVolumeIds(), scrubResp.GetDetails()) + } +} + +func TestScrubVolumeLocalModeWithMultipleVolumes(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeIDA = uint32(102) + const volumeIDB = uint32(103) + framework.AllocateVolume(t, grpcClient, volumeIDA, "") + framework.AllocateVolume(t, grpcClient, volumeIDB, "") + + httpClient := framework.NewHTTPClient() + + fidA := framework.NewFileID(volumeIDA, 1020001, 0xBB000001) + uploadA := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), fidA, []byte("scrub-local-vol-a")) + _ = framework.ReadAllAndClose(t, uploadA) + if uploadA.StatusCode != http.StatusCreated { + t.Fatalf("upload to volume A expected 201, got %d", uploadA.StatusCode) + } + + fidB := framework.NewFileID(volumeIDB, 1030001, 0xBB000002) + uploadB := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), fidB, []byte("scrub-local-vol-b")) + _ = framework.ReadAllAndClose(t, uploadB) + if uploadB.StatusCode != http.StatusCreated { + t.Fatalf("upload to volume B expected 201, got %d", uploadB.StatusCode) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + scrubResp, err := grpcClient.ScrubVolume(ctx, &volume_server_pb.ScrubVolumeRequest{ + Mode: volume_server_pb.VolumeScrubMode_LOCAL, + }) + if err != nil { + t.Fatalf("ScrubVolume LOCAL auto-select failed: %v", err) + } + if scrubResp.GetTotalVolumes() < 2 { + t.Fatalf("ScrubVolume LOCAL expected total_volumes >= 2, got %d", scrubResp.GetTotalVolumes()) + } + if len(scrubResp.GetBrokenVolumeIds()) != 0 { + t.Fatalf("ScrubVolume LOCAL expected no broken volumes, got %v: %v", scrubResp.GetBrokenVolumeIds(), scrubResp.GetDetails()) + } +} + +func TestVolumeServerStatusReturnsRealDiskStats(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + statusResp, err := grpcClient.VolumeServerStatus(ctx, &volume_server_pb.VolumeServerStatusRequest{}) + if err != nil { + t.Fatalf("VolumeServerStatus failed: %v", err) + } + + diskStatuses := statusResp.GetDiskStatuses() + if len(diskStatuses) == 0 { + t.Fatalf("VolumeServerStatus expected non-empty disk_statuses") + } + + foundValid := false + for _, ds := range diskStatuses { + if ds.GetDir() != "" && ds.GetAll() > 0 && ds.GetFree() > 0 { + foundValid = true + break + } + } + if !foundValid { + t.Fatalf("VolumeServerStatus expected at least one disk status with Dir, All > 0, Free > 0; got %v", diskStatuses) + } +} + +func TestReadNeedleBlobAndMetaVerifiesCookie(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(104) + const needleID = uint64(1040001) + const cookie = uint32(0xCC000001) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + httpClient := framework.NewHTTPClient() + fid := framework.NewFileID(volumeID, needleID, cookie) + payload := []byte("read-needle-blob-meta-verify") + uploadResp := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), fid, payload) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + fileStatus, err := grpcClient.ReadVolumeFileStatus(ctx, &volume_server_pb.ReadVolumeFileStatusRequest{VolumeId: volumeID}) + if err != nil { + t.Fatalf("ReadVolumeFileStatus failed: %v", err) + } + if fileStatus.GetIdxFileSize() == 0 { + t.Fatalf("expected non-zero idx file size after upload") + } + + idxBytes := prodCopyFileBytes(t, grpcClient, &volume_server_pb.CopyFileRequest{ + VolumeId: volumeID, + Ext: ".idx", + CompactionRevision: fileStatus.GetCompactionRevision(), + StopOffset: fileStatus.GetIdxFileSize(), + }) + offset, size := prodFindNeedleOffsetAndSize(t, idxBytes, needleID) + + blobResp, err := grpcClient.ReadNeedleBlob(ctx, &volume_server_pb.ReadNeedleBlobRequest{ + VolumeId: volumeID, + Offset: offset, + Size: size, + }) + if err != nil { + t.Fatalf("ReadNeedleBlob failed: %v", err) + } + if len(blobResp.GetNeedleBlob()) == 0 { + t.Fatalf("ReadNeedleBlob returned empty blob") + } + + metaResp, err := grpcClient.ReadNeedleMeta(ctx, &volume_server_pb.ReadNeedleMetaRequest{ + VolumeId: volumeID, + NeedleId: needleID, + Offset: offset, + Size: size, + }) + if err != nil { + t.Fatalf("ReadNeedleMeta failed: %v", err) + } + if metaResp.GetCookie() != cookie { + t.Fatalf("ReadNeedleMeta cookie mismatch: got %d want %d", metaResp.GetCookie(), cookie) + } + if metaResp.GetCrc() == 0 { + t.Fatalf("ReadNeedleMeta expected non-zero CRC") + } +} + +func TestBatchDeleteMultipleNeedles(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(105) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + httpClient := framework.NewHTTPClient() + type needle struct { + needleID uint64 + cookie uint32 + body string + fid string + } + needles := []needle{ + {needleID: 1050001, cookie: 0xDD000001, body: "batch-del-needle-one"}, + {needleID: 1050002, cookie: 0xDD000002, body: "batch-del-needle-two"}, + {needleID: 1050003, cookie: 0xDD000003, body: "batch-del-needle-three"}, + } + fids := make([]string, len(needles)) + for i := range needles { + needles[i].fid = framework.NewFileID(volumeID, needles[i].needleID, needles[i].cookie) + fids[i] = needles[i].fid + uploadResp := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), needles[i].fid, []byte(needles[i].body)) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload needle %d expected 201, got %d", needles[i].needleID, uploadResp.StatusCode) + } + } + + // Verify all needles are readable before delete + for _, n := range needles { + readResp := framework.ReadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), n.fid) + _ = framework.ReadAllAndClose(t, readResp) + if readResp.StatusCode != http.StatusOK { + t.Fatalf("pre-delete read of %s expected 200, got %d", n.fid, readResp.StatusCode) + } + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + deleteResp, err := grpcClient.BatchDelete(ctx, &volume_server_pb.BatchDeleteRequest{ + FileIds: fids, + }) + if err != nil { + t.Fatalf("BatchDelete failed: %v", err) + } + if len(deleteResp.GetResults()) != 3 { + t.Fatalf("BatchDelete expected 3 results, got %d", len(deleteResp.GetResults())) + } + for i, result := range deleteResp.GetResults() { + if result.GetStatus() != http.StatusAccepted { + t.Fatalf("BatchDelete result[%d] expected status 202, got %d (error: %s)", i, result.GetStatus(), result.GetError()) + } + if result.GetSize() <= 0 { + t.Fatalf("BatchDelete result[%d] expected size > 0, got %d", i, result.GetSize()) + } + } + + // Verify all needles return 404 after delete + for _, n := range needles { + readResp := framework.ReadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), n.fid) + _ = framework.ReadAllAndClose(t, readResp) + if readResp.StatusCode != http.StatusNotFound { + t.Fatalf("post-delete read of %s expected 404, got %d", n.fid, readResp.StatusCode) + } + } +} + +// prodCopyFileBytes streams a CopyFile response into a byte slice. +func prodCopyFileBytes(t testing.TB, grpcClient volume_server_pb.VolumeServerClient, req *volume_server_pb.CopyFileRequest) []byte { + t.Helper() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + stream, err := grpcClient.CopyFile(ctx, req) + if err != nil { + t.Fatalf("CopyFile start failed: %v", err) + } + + var out []byte + for { + msg, recvErr := stream.Recv() + if recvErr == io.EOF { + return out + } + if recvErr != nil { + t.Fatalf("CopyFile recv failed: %v", recvErr) + } + out = append(out, msg.GetFileContent()...) + } +} + +// prodFindNeedleOffsetAndSize scans idx bytes for a needle's offset and size. +func prodFindNeedleOffsetAndSize(t testing.TB, idxBytes []byte, needleID uint64) (offset int64, size int32) { + t.Helper() + + for i := 0; i+types.NeedleMapEntrySize <= len(idxBytes); i += types.NeedleMapEntrySize { + key, entryOffset, entrySize := idx.IdxFileEntry(idxBytes[i : i+types.NeedleMapEntrySize]) + if uint64(key) != needleID { + continue + } + if entryOffset.IsZero() || entrySize <= 0 { + continue + } + return entryOffset.ToActualOffset(), int32(entrySize) + } + + t.Fatalf("needle id %d not found in idx entries", needleID) + return 0, 0 +} diff --git a/test/volume_server/grpc/scrub_query_test.go b/test/volume_server/grpc/scrub_query_test.go index 9ddfddead..a4a776df2 100644 --- a/test/volume_server/grpc/scrub_query_test.go +++ b/test/volume_server/grpc/scrub_query_test.go @@ -17,7 +17,7 @@ func TestScrubVolumeIndexAndUnsupportedMode(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -55,7 +55,7 @@ func TestScrubEcVolumeMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -79,7 +79,7 @@ func TestScrubEcVolumeAutoSelectNoEcVolumes(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -105,7 +105,7 @@ func TestQueryInvalidAndMissingFileIDPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -149,7 +149,7 @@ func TestScrubVolumeAutoSelectAndAllModes(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -210,7 +210,7 @@ func TestQueryJsonSuccessAndCsvNoOutput(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -285,7 +285,7 @@ func TestQueryJsonNoMatchReturnsEmptyStripe(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -341,7 +341,7 @@ func TestQueryCookieMismatchReturnsEOFNoResults(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/tail_test.go b/test/volume_server/grpc/tail_test.go index 09657edb5..599450794 100644 --- a/test/volume_server/grpc/tail_test.go +++ b/test/volume_server/grpc/tail_test.go @@ -19,7 +19,7 @@ func TestVolumeTailSenderMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -40,7 +40,7 @@ func TestVolumeTailSenderHeartbeatThenEOF(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -78,7 +78,7 @@ func TestVolumeTailReceiverMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -144,7 +144,7 @@ func TestVolumeTailSenderLargeNeedleChunking(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/tiering_remote_test.go b/test/volume_server/grpc/tiering_remote_test.go index db36e7cfd..472aa1255 100644 --- a/test/volume_server/grpc/tiering_remote_test.go +++ b/test/volume_server/grpc/tiering_remote_test.go @@ -17,7 +17,7 @@ func TestFetchAndWriteNeedleMaintenanceAndMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -60,7 +60,7 @@ func TestFetchAndWriteNeedleInvalidRemoteConfig(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -94,7 +94,7 @@ func TestVolumeTierMoveDatToRemoteErrorPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -161,7 +161,7 @@ func TestVolumeTierMoveDatToRemoteMissingBackend(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -190,7 +190,7 @@ func TestVolumeTierMoveDatFromRemoteErrorPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/vacuum_test.go b/test/volume_server/grpc/vacuum_test.go index ea986fed2..deb8c298d 100644 --- a/test/volume_server/grpc/vacuum_test.go +++ b/test/volume_server/grpc/vacuum_test.go @@ -16,7 +16,7 @@ func TestVacuumVolumeCheckSuccessAndMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -45,7 +45,7 @@ func TestVacuumMaintenanceModeRejections(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/admin_test.go b/test/volume_server/http/admin_test.go index 6dde9c20d..df0f9d5c8 100644 --- a/test/volume_server/http/admin_test.go +++ b/test/volume_server/http/admin_test.go @@ -16,7 +16,7 @@ func TestAdminStatusAndHealthz(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) client := framework.NewHTTPClient() statusReq, err := http.NewRequest(http.MethodGet, cluster.VolumeAdminURL()+"/status", nil) @@ -45,6 +45,19 @@ func TestAdminStatusAndHealthz(t *testing.T) { t.Fatalf("status payload missing field %q", field) } } + diskStatuses, ok := payload["DiskStatuses"].([]interface{}) + if !ok || len(diskStatuses) == 0 { + t.Fatalf("status payload expected non-empty DiskStatuses, got %#v", payload["DiskStatuses"]) + } + firstDisk, ok := diskStatuses[0].(map[string]interface{}) + if !ok { + t.Fatalf("status payload disk status has unexpected shape: %#v", diskStatuses[0]) + } + for _, field := range []string{"dir", "all", "used", "free"} { + if _, found := firstDisk[field]; !found { + t.Fatalf("status disk payload missing field %q: %#v", field, firstDisk) + } + } healthReq := mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/healthz") healthResp := framework.DoRequest(t, client, healthReq) @@ -74,7 +87,7 @@ func TestOptionsMethodsByPort(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P2()) + cluster := framework.StartVolumeCluster(t, matrix.P2()) client := framework.NewHTTPClient() adminResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodOptions, cluster.VolumeAdminURL()+"/")) @@ -114,7 +127,7 @@ func TestOptionsWithOriginIncludesCorsHeaders(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P2()) + cluster := framework.StartVolumeCluster(t, matrix.P2()) client := framework.NewHTTPClient() adminReq := mustNewRequest(t, http.MethodOptions, cluster.VolumeAdminURL()+"/") @@ -151,7 +164,7 @@ func TestUiIndexNotExposedWhenJwtSigningEnabled(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P3()) + cluster := framework.StartVolumeCluster(t, matrix.P3()) client := framework.NewHTTPClient() resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/ui/index.html")) diff --git a/test/volume_server/http/auth_test.go b/test/volume_server/http/auth_test.go index 5b093bba1..fc2fb3f16 100644 --- a/test/volume_server/http/auth_test.go +++ b/test/volume_server/http/auth_test.go @@ -18,7 +18,7 @@ func TestJWTAuthForWriteAndRead(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -81,7 +81,7 @@ func TestJWTAuthRejectsFidMismatch(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -161,7 +161,7 @@ func TestJWTAuthRejectsExpiredTokens(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -209,7 +209,7 @@ func TestJWTAuthViaQueryParamAndCookie(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -249,7 +249,7 @@ func TestJWTTokenSourcePrecedenceQueryOverHeader(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -299,7 +299,7 @@ func TestJWTTokenSourcePrecedenceHeaderOverCookie(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -345,7 +345,7 @@ func TestJWTTokenSourcePrecedenceQueryOverCookie(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/chunk_manifest_test.go b/test/volume_server/http/chunk_manifest_test.go index d3806d7f4..c8ae6cd92 100644 --- a/test/volume_server/http/chunk_manifest_test.go +++ b/test/volume_server/http/chunk_manifest_test.go @@ -16,7 +16,7 @@ func TestChunkManifestExpansionAndBypass(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -97,7 +97,7 @@ func TestChunkManifestDeleteRemovesChildChunks(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -174,7 +174,7 @@ func TestChunkManifestDeleteFailsWhenChildDeletionFails(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/compressed_read_test.go b/test/volume_server/http/compressed_read_test.go index 8a9ac5c41..0101a20c0 100644 --- a/test/volume_server/http/compressed_read_test.go +++ b/test/volume_server/http/compressed_read_test.go @@ -43,7 +43,7 @@ func TestCompressedReadAcceptEncodingMatrix(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/headers_static_test.go b/test/volume_server/http/headers_static_test.go index 5b4a2fd93..82ac8adc6 100644 --- a/test/volume_server/http/headers_static_test.go +++ b/test/volume_server/http/headers_static_test.go @@ -15,7 +15,7 @@ func TestReadPassthroughHeadersAndDownloadDisposition(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -59,12 +59,50 @@ func TestReadPassthroughHeadersAndDownloadDisposition(t *testing.T) { } } +func TestDownloadDispositionUsesGoBoolParsing(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(97) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + client := framework.NewHTTPClient() + fullFileID := framework.NewFileID(volumeID, 661123, 0x55667789) + uploadResp := framework.UploadBytes(t, client, clusterHarness.VolumeAdminURL(), fullFileID, []byte("dl-bool-parse-content")) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + parts := strings.SplitN(fullFileID, ",", 2) + if len(parts) != 2 { + t.Fatalf("unexpected file id format: %q", fullFileID) + } + fidOnly := parts[1] + + url := fmt.Sprintf("%s/%d/%s/%s?dl=t", clusterHarness.VolumeAdminURL(), volumeID, fidOnly, "report.txt") + resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, url)) + _ = framework.ReadAllAndClose(t, resp) + if resp.StatusCode != http.StatusOK { + t.Fatalf("download read expected 200, got %d", resp.StatusCode) + } + contentDisposition := resp.Header.Get("Content-Disposition") + if !strings.Contains(contentDisposition, "attachment") || !strings.Contains(contentDisposition, "report.txt") { + t.Fatalf("download disposition with dl=t mismatch: %q", contentDisposition) + } +} + func TestStaticAssetEndpoints(t *testing.T) { if testing.Short() { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) client := framework.NewHTTPClient() faviconResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, clusterHarness.VolumeAdminURL()+"/favicon.ico")) @@ -85,7 +123,7 @@ func TestStaticAssetEndpointsOnPublicPort(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) client := framework.NewHTTPClient() faviconResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, clusterHarness.VolumePublicURL()+"/favicon.ico")) diff --git a/test/volume_server/http/image_transform_test.go b/test/volume_server/http/image_transform_test.go index 222fc951f..a94eba1a2 100644 --- a/test/volume_server/http/image_transform_test.go +++ b/test/volume_server/http/image_transform_test.go @@ -45,7 +45,7 @@ func TestImageResizeAndCropReadVariants(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/production_features_test.go b/test/volume_server/http/production_features_test.go new file mode 100644 index 000000000..b91ee37ef --- /dev/null +++ b/test/volume_server/http/production_features_test.go @@ -0,0 +1,387 @@ +package volume_server_http_test + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "mime/multipart" + "net/http" + "os" + "strings" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" +) + +func TestStatsEndpoints(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + // /stats/counter — expect 200 with non-empty body + // Note: Go server guards these with WhiteList which may return 400 + counterResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/stats/counter")) + counterBody := framework.ReadAllAndClose(t, counterResp) + if counterResp.StatusCode == http.StatusBadRequest { + t.Logf("/stats/counter returned 400 (whitelist guard), skipping stats checks") + return + } + if counterResp.StatusCode != http.StatusOK { + t.Fatalf("/stats/counter expected 200, got %d, body: %s", counterResp.StatusCode, string(counterBody)) + } + if len(counterBody) == 0 { + t.Fatalf("/stats/counter returned empty body") + } + + // /stats/memory — expect 200, valid JSON with Version and Memory + memoryResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/stats/memory")) + memoryBody := framework.ReadAllAndClose(t, memoryResp) + if memoryResp.StatusCode != http.StatusOK { + t.Fatalf("/stats/memory expected 200, got %d, body: %s", memoryResp.StatusCode, string(memoryBody)) + } + var memoryPayload map[string]any + if err := json.Unmarshal(memoryBody, &memoryPayload); err != nil { + t.Fatalf("/stats/memory response is not valid JSON: %v, body: %s", err, string(memoryBody)) + } + if _, ok := memoryPayload["Version"]; !ok { + t.Fatalf("/stats/memory missing Version field") + } + if _, ok := memoryPayload["Memory"]; !ok { + t.Fatalf("/stats/memory missing Memory field") + } + + // /stats/disk — expect 200, valid JSON with Version and DiskStatuses + diskResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/stats/disk")) + diskBody := framework.ReadAllAndClose(t, diskResp) + if diskResp.StatusCode != http.StatusOK { + t.Fatalf("/stats/disk expected 200, got %d, body: %s", diskResp.StatusCode, string(diskBody)) + } + var diskPayload map[string]any + if err := json.Unmarshal(diskBody, &diskPayload); err != nil { + t.Fatalf("/stats/disk response is not valid JSON: %v, body: %s", err, string(diskBody)) + } + if _, ok := diskPayload["Version"]; !ok { + t.Fatalf("/stats/disk missing Version field") + } + if _, ok := diskPayload["DiskStatuses"]; !ok { + t.Fatalf("/stats/disk missing DiskStatuses field") + } +} + +func TestStatusPrettyJsonAndJsonp(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + // ?pretty=y — expect indented multi-line JSON + prettyResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/status?pretty=y")) + prettyBody := framework.ReadAllAndClose(t, prettyResp) + if prettyResp.StatusCode != http.StatusOK { + t.Fatalf("/status?pretty=y expected 200, got %d", prettyResp.StatusCode) + } + lines := strings.Split(strings.TrimSpace(string(prettyBody)), "\n") + if len(lines) < 3 { + t.Fatalf("/status?pretty=y expected multi-line indented JSON, got %d lines: %s", len(lines), string(prettyBody)) + } + // Verify the body is valid JSON + var prettyPayload map[string]interface{} + if err := json.Unmarshal(prettyBody, &prettyPayload); err != nil { + t.Fatalf("/status?pretty=y is not valid JSON: %v", err) + } + + // ?callback=myFunc — expect JSONP wrapping + jsonpResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/status?callback=myFunc")) + jsonpBody := framework.ReadAllAndClose(t, jsonpResp) + if jsonpResp.StatusCode != http.StatusOK { + t.Fatalf("/status?callback=myFunc expected 200, got %d", jsonpResp.StatusCode) + } + bodyStr := string(jsonpBody) + if !strings.HasPrefix(bodyStr, "myFunc(") { + t.Fatalf("/status?callback=myFunc expected body to start with 'myFunc(', got prefix: %q", bodyStr[:min(len(bodyStr), 30)]) + } + trimmed := strings.TrimRight(bodyStr, "\n; ") + if !strings.HasSuffix(trimmed, ")") { + t.Fatalf("/status?callback=myFunc expected body to end with ')', got suffix: %q", trimmed[max(0, len(trimmed)-10):]) + } + // Content-Type should be application/javascript for JSONP + if ct := jsonpResp.Header.Get("Content-Type"); !strings.Contains(ct, "javascript") { + t.Fatalf("/status?callback=myFunc expected Content-Type containing 'javascript', got %q", ct) + } +} + +func TestUploadWithCustomTimestamp(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(91) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, 910001, 0xAABBCC01) + client := framework.NewHTTPClient() + data := []byte("custom-timestamp-data") + + // Upload with ?ts=1700000000 + uploadURL := fmt.Sprintf("%s/%s?ts=1700000000", cluster.VolumeAdminURL(), fid) + req, err := http.NewRequest(http.MethodPost, uploadURL, bytes.NewReader(data)) + if err != nil { + t.Fatalf("create upload request: %v", err) + } + req.Header.Set("Content-Type", "application/octet-stream") + req.Header.Set("Content-Length", fmt.Sprintf("%d", len(data))) + uploadResp := framework.DoRequest(t, client, req) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload with ts expected 201, got %d", uploadResp.StatusCode) + } + + // Read back and verify Last-Modified + getResp := framework.ReadBytes(t, client, cluster.VolumeAdminURL(), fid) + _ = framework.ReadAllAndClose(t, getResp) + if getResp.StatusCode != http.StatusOK { + t.Fatalf("read expected 200, got %d", getResp.StatusCode) + } + + expectedLastModified := time.Unix(1700000000, 0).UTC().Format(http.TimeFormat) + gotLastModified := getResp.Header.Get("Last-Modified") + if gotLastModified != expectedLastModified { + t.Fatalf("Last-Modified mismatch: got %q, want %q", gotLastModified, expectedLastModified) + } +} + +func TestMultipartUploadUsesFormFieldsForTimestampAndTTL(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + // Go's r.FormValue() cannot read multipart text fields after r.MultipartReader() + // consumes the body, so ts/ttl sent as multipart fields only work with the Rust server. + if os.Getenv("VOLUME_SERVER_IMPL") != "rust" { + t.Skip("skipping: multipart form field extraction for ts/ttl is Rust-only") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(94) + const needleID = uint64(940001) + const cookie = uint32(0xAABBCC04) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, needleID, cookie) + payload := []byte("multipart-form-fields-data") + + var body bytes.Buffer + writer := multipart.NewWriter(&body) + if err := writer.WriteField("ts", "1700000000"); err != nil { + t.Fatalf("write multipart ts field: %v", err) + } + if err := writer.WriteField("ttl", "7d"); err != nil { + t.Fatalf("write multipart ttl field: %v", err) + } + filePart, err := writer.CreateFormFile("file", "multipart.txt") + if err != nil { + t.Fatalf("create multipart file field: %v", err) + } + if _, err := filePart.Write(payload); err != nil { + t.Fatalf("write multipart file payload: %v", err) + } + if err := writer.Close(); err != nil { + t.Fatalf("close multipart writer: %v", err) + } + + req, err := http.NewRequest(http.MethodPost, cluster.VolumeAdminURL()+"/"+fid, &body) + if err != nil { + t.Fatalf("create multipart upload request: %v", err) + } + req.Header.Set("Content-Type", writer.FormDataContentType()) + + client := framework.NewHTTPClient() + uploadResp := framework.DoRequest(t, client, req) + uploadBody := framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("multipart upload expected 201, got %d, body: %s", uploadResp.StatusCode, string(uploadBody)) + } + + readResp := framework.ReadBytes(t, client, cluster.VolumeAdminURL(), fid) + _ = framework.ReadAllAndClose(t, readResp) + if readResp.StatusCode != http.StatusOK { + t.Fatalf("multipart upload read expected 200, got %d", readResp.StatusCode) + } + expectedLastModified := time.Unix(1700000000, 0).UTC().Format(http.TimeFormat) + if got := readResp.Header.Get("Last-Modified"); got != expectedLastModified { + t.Fatalf("multipart upload Last-Modified mismatch: got %q want %q", got, expectedLastModified) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + statusResp, err := grpcClient.VolumeNeedleStatus(ctx, &volume_server_pb.VolumeNeedleStatusRequest{ + VolumeId: volumeID, + NeedleId: needleID, + }) + if err != nil { + t.Fatalf("VolumeNeedleStatus after multipart upload failed: %v", err) + } + // Go's ReadTTL normalizes via fitTtlCount: 7d → 1w (7 days = 1 week) + if got := statusResp.GetTtl(); got != "1w" { + t.Fatalf("multipart upload TTL mismatch: got %q want %q", got, "1w") + } +} + +func TestRequestIdGeneration(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + // GET /status WITHOUT setting x-amz-request-id header + req := mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/status") + resp := framework.DoRequest(t, client, req) + _ = framework.ReadAllAndClose(t, resp) + if resp.StatusCode != http.StatusOK { + t.Fatalf("/status expected 200, got %d", resp.StatusCode) + } + + reqID := resp.Header.Get("x-amz-request-id") + if reqID == "" { + t.Fatalf("expected auto-generated x-amz-request-id header, got empty") + } + // Go format: "%X%08X" (timestamp hex + 8 random hex), typically 20-24 chars, all hex, no hyphens. + if len(reqID) < 16 { + t.Fatalf("x-amz-request-id too short: %q (len=%d)", reqID, len(reqID)) + } +} + +func TestS3ResponsePassthroughHeaders(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(92) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, 920001, 0xAABBCC02) + client := framework.NewHTTPClient() + data := []byte("passthrough-headers-test-data") + + uploadResp := framework.UploadBytes(t, client, cluster.VolumeAdminURL(), fid, data) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + // Read back with S3 passthrough query params + // Test response-content-language which both Go and Rust support + readURL := fmt.Sprintf("%s/%s?response-content-language=fr&response-expires=%s", + cluster.VolumeAdminURL(), fid, + "Thu,+01+Jan+2099+00:00:00+GMT", + ) + readResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, readURL)) + readBody := framework.ReadAllAndClose(t, readResp) + if readResp.StatusCode != http.StatusOK { + t.Fatalf("read with passthrough expected 200, got %d, body: %s", readResp.StatusCode, string(readBody)) + } + + if got := readResp.Header.Get("Content-Language"); got != "fr" { + t.Fatalf("Content-Language expected 'fr', got %q", got) + } + if got := readResp.Header.Get("Expires"); got != "Thu, 01 Jan 2099 00:00:00 GMT" { + t.Fatalf("Expires expected 'Thu, 01 Jan 2099 00:00:00 GMT', got %q", got) + } +} + +func TestLargeFileWriteAndRead(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(93) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, 930001, 0xAABBCC03) + client := framework.NewHTTPClient() + data := bytes.Repeat([]byte("A"), 1024*1024) // 1MB + + uploadResp := framework.UploadBytes(t, client, cluster.VolumeAdminURL(), fid, data) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload 1MB expected 201, got %d", uploadResp.StatusCode) + } + + getResp := framework.ReadBytes(t, client, cluster.VolumeAdminURL(), fid) + getBody := framework.ReadAllAndClose(t, getResp) + if getResp.StatusCode != http.StatusOK { + t.Fatalf("read 1MB expected 200, got %d", getResp.StatusCode) + } + if len(getBody) != len(data) { + t.Fatalf("read 1MB body length mismatch: got %d, want %d", len(getBody), len(data)) + } + if !bytes.Equal(getBody, data) { + t.Fatalf("read 1MB body content mismatch") + } +} + +func TestUploadWithContentTypePreservation(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(94) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, 940001, 0xAABBCC04) + client := framework.NewHTTPClient() + data := []byte("fake-png-data-for-content-type-test") + + // Upload with Content-Type: image/png + uploadURL := fmt.Sprintf("%s/%s", cluster.VolumeAdminURL(), fid) + req, err := http.NewRequest(http.MethodPost, uploadURL, bytes.NewReader(data)) + if err != nil { + t.Fatalf("create upload request: %v", err) + } + req.Header.Set("Content-Type", "image/png") + req.Header.Set("Content-Length", fmt.Sprintf("%d", len(data))) + uploadResp := framework.DoRequest(t, client, req) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload with image/png expected 201, got %d", uploadResp.StatusCode) + } + + // Read back and verify Content-Type is preserved + getResp := framework.ReadBytes(t, client, cluster.VolumeAdminURL(), fid) + _ = framework.ReadAllAndClose(t, getResp) + if getResp.StatusCode != http.StatusOK { + t.Fatalf("read expected 200, got %d", getResp.StatusCode) + } + if got := getResp.Header.Get("Content-Type"); got != "image/png" { + t.Fatalf("Content-Type expected 'image/png', got %q", got) + } +} diff --git a/test/volume_server/http/public_cors_methods_test.go b/test/volume_server/http/public_cors_methods_test.go index 5328b9a8b..df98d3454 100644 --- a/test/volume_server/http/public_cors_methods_test.go +++ b/test/volume_server/http/public_cors_methods_test.go @@ -15,7 +15,7 @@ func TestPublicPortReadOnlyMethodBehavior(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -70,7 +70,7 @@ func TestCorsAndUnsupportedMethodBehavior(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -133,7 +133,7 @@ func TestUnsupportedMethodTraceParity(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -168,7 +168,7 @@ func TestUnsupportedMethodPropfindParity(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -207,56 +207,12 @@ func TestUnsupportedMethodPropfindParity(t *testing.T) { } } -func TestUnsupportedMethodConnectParity(t *testing.T) { - if testing.Short() { - t.Skip("skipping integration test in short mode") - } - - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) - conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) - defer conn.Close() - - const volumeID = uint32(85) - framework.AllocateVolume(t, grpcClient, volumeID, "") - - fid := framework.NewFileID(volumeID, 124001, 0x03030303) - client := framework.NewHTTPClient() - uploadResp := framework.UploadBytes(t, client, clusterHarness.VolumeAdminURL(), fid, []byte("connect-method-check")) - _ = framework.ReadAllAndClose(t, uploadResp) - if uploadResp.StatusCode != http.StatusCreated { - t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) - } - - adminReq := mustNewRequest(t, "CONNECT", clusterHarness.VolumeAdminURL()+"/"+fid) - adminResp := framework.DoRequest(t, client, adminReq) - _ = framework.ReadAllAndClose(t, adminResp) - if adminResp.StatusCode != http.StatusBadRequest { - t.Fatalf("admin CONNECT expected 400, got %d", adminResp.StatusCode) - } - - publicReq := mustNewRequest(t, "CONNECT", clusterHarness.VolumePublicURL()+"/"+fid) - publicResp := framework.DoRequest(t, client, publicReq) - _ = framework.ReadAllAndClose(t, publicResp) - if publicResp.StatusCode != http.StatusOK { - t.Fatalf("public CONNECT expected passthrough 200, got %d", publicResp.StatusCode) - } - - verifyResp := framework.ReadBytes(t, client, clusterHarness.VolumeAdminURL(), fid) - verifyBody := framework.ReadAllAndClose(t, verifyResp) - if verifyResp.StatusCode != http.StatusOK { - t.Fatalf("verify GET expected 200, got %d", verifyResp.StatusCode) - } - if string(verifyBody) != "connect-method-check" { - t.Fatalf("CONNECT should not mutate data, got %q", string(verifyBody)) - } -} - func TestPublicPortHeadReadParity(t *testing.T) { if testing.Short() { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/range_variants_test.go b/test/volume_server/http/range_variants_test.go index 2e1f5e286..71ffb6dff 100644 --- a/test/volume_server/http/range_variants_test.go +++ b/test/volume_server/http/range_variants_test.go @@ -14,7 +14,7 @@ func TestMultiRangeReadReturnsMultipartPayload(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -52,7 +52,7 @@ func TestOversizedCombinedRangesAreIgnored(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/read_deleted_test.go b/test/volume_server/http/read_deleted_test.go index 23d400e23..b2db65d70 100644 --- a/test/volume_server/http/read_deleted_test.go +++ b/test/volume_server/http/read_deleted_test.go @@ -13,7 +13,7 @@ func TestReadDeletedQueryReturnsDeletedNeedleData(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/read_path_variants_test.go b/test/volume_server/http/read_path_variants_test.go index 97a7ac628..eac72079a 100644 --- a/test/volume_server/http/read_path_variants_test.go +++ b/test/volume_server/http/read_path_variants_test.go @@ -15,7 +15,7 @@ func TestReadPathShapesAndIfModifiedSince(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -85,7 +85,7 @@ func TestMalformedVidFidPathReturnsBadRequest(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) client := framework.NewHTTPClient() resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, clusterHarness.VolumeAdminURL()+"/not-a-vid/not-a-fid")) @@ -100,7 +100,7 @@ func TestReadWrongCookieReturnsNotFound(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -139,7 +139,7 @@ func TestConditionalHeaderPrecedenceAndInvalidIfModifiedSince(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/read_write_delete_test.go b/test/volume_server/http/read_write_delete_test.go index b122d697c..b26c4f661 100644 --- a/test/volume_server/http/read_write_delete_test.go +++ b/test/volume_server/http/read_write_delete_test.go @@ -14,7 +14,7 @@ func TestUploadReadRangeHeadDeleteRoundTrip(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -112,7 +112,7 @@ func TestInvalidReadPathReturnsBadRequest(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) client := framework.NewHTTPClient() resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/invalid,needle")) diff --git a/test/volume_server/http/replication_lifecycle_test.go b/test/volume_server/http/replication_lifecycle_test.go new file mode 100644 index 000000000..c88ffeae6 --- /dev/null +++ b/test/volume_server/http/replication_lifecycle_test.go @@ -0,0 +1,63 @@ +package volume_server_http_test + +import ( + "context" + "net/http" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" + "github.com/seaweedfs/seaweedfs/weed/storage/needle" +) + +func TestReplicatedUploadSucceedsImmediatelyAfterAllocate(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartDualVolumeCluster(t, matrix.P1()) + + conn0, grpc0 := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress(0)) + defer conn0.Close() + conn1, grpc1 := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress(1)) + defer conn1.Close() + + const volumeID = uint32(115) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + req := &volume_server_pb.AllocateVolumeRequest{ + VolumeId: volumeID, + Replication: "001", + Version: uint32(needle.GetCurrentVersion()), + } + if _, err := grpc0.AllocateVolume(ctx, req); err != nil { + t.Fatalf("allocate replicated volume on node0: %v", err) + } + if _, err := grpc1.AllocateVolume(ctx, req); err != nil { + t.Fatalf("allocate replicated volume on node1: %v", err) + } + + client := framework.NewHTTPClient() + fid := framework.NewFileID(volumeID, 881001, 0x0B0C0D0E) + payload := []byte("replicated-upload-after-allocate") + + uploadResp := framework.UploadBytes(t, client, clusterHarness.VolumeAdminURL(0), fid, payload) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("replicated upload expected 201, got %d", uploadResp.StatusCode) + } + + replicaReadURL := clusterHarness.VolumeAdminURL(1) + "/" + fid + var replicaBody []byte + if !waitForHTTPStatus(t, client, replicaReadURL, http.StatusOK, 10*time.Second, func(resp *http.Response) { + replicaBody = framework.ReadAllAndClose(t, resp) + }) { + t.Fatalf("replica did not become readable within deadline") + } + if string(replicaBody) != string(payload) { + t.Fatalf("replica body mismatch: got %q want %q", string(replicaBody), string(payload)) + } +} diff --git a/test/volume_server/http/throttling_test.go b/test/volume_server/http/throttling_test.go index 7a66e9ebb..e07c441d8 100644 --- a/test/volume_server/http/throttling_test.go +++ b/test/volume_server/http/throttling_test.go @@ -60,7 +60,7 @@ func TestUploadLimitTimeoutAndReplicateBypass(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -143,7 +143,7 @@ func TestUploadLimitWaitThenProceed(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -226,7 +226,7 @@ func TestUploadLimitTimeoutThenRecovery(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -289,7 +289,7 @@ func TestDownloadLimitTimeoutReturnsTooManyRequests(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -336,7 +336,7 @@ func TestDownloadLimitWaitThenProceedWithoutReplica(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -398,7 +398,7 @@ func TestDownloadLimitTimeoutThenRecovery(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -593,7 +593,7 @@ func TestUploadLimitDisabledAllowsConcurrentUploads(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -646,7 +646,7 @@ func TestDownloadLimitDisabledAllowsConcurrentDownloads(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -691,7 +691,7 @@ func TestDownloadLimitInvalidVidWhileOverLimitReturnsBadRequest(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/write_delete_variants_test.go b/test/volume_server/http/write_delete_variants_test.go index 3355e7778..6a017299e 100644 --- a/test/volume_server/http/write_delete_variants_test.go +++ b/test/volume_server/http/write_delete_variants_test.go @@ -14,7 +14,7 @@ func TestWriteUnchangedAndDeleteEdgeVariants(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -71,7 +71,7 @@ func TestDeleteTimestampOverrideKeepsReadDeletedLastModifiedParity(t *testing.T) t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/write_error_variants_test.go b/test/volume_server/http/write_error_variants_test.go index ead11ed6c..2f858b41b 100644 --- a/test/volume_server/http/write_error_variants_test.go +++ b/test/volume_server/http/write_error_variants_test.go @@ -14,7 +14,7 @@ func TestWriteInvalidVidAndFidReturnBadRequest(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) client := framework.NewHTTPClient() invalidVidReq := newUploadRequest(t, clusterHarness.VolumeAdminURL()+"/invalid,12345678", []byte("x")) @@ -37,7 +37,7 @@ func TestWriteMalformedMultipartAndMD5Mismatch(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/loadtest/loadtest_test.go b/test/volume_server/loadtest/loadtest_test.go new file mode 100644 index 000000000..d3d0d0fc1 --- /dev/null +++ b/test/volume_server/loadtest/loadtest_test.go @@ -0,0 +1,628 @@ +package loadtest + +import ( + "bytes" + "crypto/rand" + "fmt" + "io" + "net/http" + "os" + "sort" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" +) + +// Run with: +// go test -v -count=1 -timeout 300s -run BenchmarkVolumeServer ./test/volume_server/loadtest/... +// VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 300s -run BenchmarkVolumeServer ./test/volume_server/loadtest/... +// +// Compare results: +// go test -count=1 -timeout 300s -run BenchmarkVolumeServer -bench . ./test/volume_server/loadtest/... | tee go.txt +// VOLUME_SERVER_IMPL=rust go test -count=1 -timeout 300s -run BenchmarkVolumeServer -bench . ./test/volume_server/loadtest/... | tee rust.txt + +// Step-by-step payload sizes: 1KB → 4KB → 16KB → 64KB → 256KB → 1MB → 4MB → 8MB +var payloadSteps = []struct { + name string + size int +}{ + {"1KB", 1 << 10}, + {"4KB", 4 << 10}, + {"16KB", 16 << 10}, + {"64KB", 64 << 10}, + {"256KB", 256 << 10}, + {"1MB", 1 << 20}, + {"4MB", 4 << 20}, + {"8MB", 8 << 20}, +} + +func implName() string { + if os.Getenv("VOLUME_SERVER_IMPL") == "rust" { + return "rust" + } + return "go" +} + +// setupCluster starts a volume cluster and returns the admin URL and cleanup. +func setupCluster(tb testing.TB) (adminURL string, grpcAddr string, cleanup func()) { + tb.Helper() + cluster := framework.StartVolumeCluster(tb, matrix.P1()) + return cluster.VolumeAdminURL(), cluster.VolumeGRPCAddress(), cluster.Stop +} + +// allocateVolume allocates a volume via gRPC and returns its ID. +func allocateVolume(tb testing.TB, grpcAddr string, volumeID uint32) { + tb.Helper() + conn, client := framework.DialVolumeServer(tb, grpcAddr) + defer conn.Close() + framework.AllocateVolume(tb, client, volumeID, "loadtest") +} + +func makePayload(size int) []byte { + data := make([]byte, size) + rand.Read(data) + return data +} + +// uploadFile uploads data and returns the file ID used. +func uploadFile(client *http.Client, adminURL string, volumeID uint32, key uint64, cookie uint32, data []byte) error { + fid := framework.NewFileID(volumeID, key, cookie) + url := fmt.Sprintf("%s/%s", adminURL, fid) + req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(data)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/octet-stream") + resp, err := client.Do(req) + if err != nil { + return err + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + if resp.StatusCode >= 400 { + return fmt.Errorf("upload %s: status %d", fid, resp.StatusCode) + } + return nil +} + +// downloadFile reads a file and discards the body. +func downloadFile(client *http.Client, adminURL string, volumeID uint32, key uint64, cookie uint32) error { + fid := framework.NewFileID(volumeID, key, cookie) + url := fmt.Sprintf("%s/%s", adminURL, fid) + resp, err := client.Get(url) + if err != nil { + return err + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + if resp.StatusCode >= 400 { + return fmt.Errorf("download %s: status %d", fid, resp.StatusCode) + } + return nil +} + +// deleteFile deletes a file. +func deleteFile(client *http.Client, adminURL string, volumeID uint32, key uint64, cookie uint32) error { + fid := framework.NewFileID(volumeID, key, cookie) + url := fmt.Sprintf("%s/%s", adminURL, fid) + req, err := http.NewRequest(http.MethodDelete, url, nil) + if err != nil { + return err + } + resp, err := client.Do(req) + if err != nil { + return err + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + return nil +} + +// --- Throughput load tests (not Go benchmarks, manual timing for comparison) --- + +// TestBenchmarkVolumeServer runs a suite of load tests printing ops/sec and latency. +func TestBenchmarkVolumeServer(t *testing.T) { + if testing.Short() { + t.Skip("skipping load test in short mode") + } + + impl := implName() + adminURL, grpcAddr, cleanup := setupCluster(t) + defer cleanup() + + const volumeID = uint32(10) + allocateVolume(t, grpcAddr, volumeID) + + httpClient := &http.Client{ + Timeout: 30 * time.Second, + Transport: &http.Transport{ + MaxIdleConnsPerHost: 128, + MaxConnsPerHost: 128, + }, + } + + // opsForSize returns fewer ops for larger payloads to keep test time reasonable. + opsForSize := func(size, concurrency int) int { + switch { + case size >= 4<<20: + if concurrency > 1 { + return 64 + } + return 30 + case size >= 1<<20: + if concurrency > 1 { + return 200 + } + return 100 + case size >= 64<<10: + if concurrency > 1 { + return 500 + } + return 300 + default: + if concurrency > 1 { + return 1000 + } + return 500 + } + } + + // Step-by-step upload: 1KB → 4KB → 16KB → 64KB → 256KB → 1MB → 4MB → 8MB + for _, ps := range payloadSteps { + for _, mode := range []struct { + label string + concurrency int + }{ + {"seq", 1}, + {"c16", 16}, + } { + name := fmt.Sprintf("Upload/%s/%s", ps.name, mode.label) + numOps := opsForSize(ps.size, mode.concurrency) + t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) { + payload := makePayload(ps.size) + runThroughputTest(t, impl, name, httpClient, adminURL, volumeID, + payload, numOps, mode.concurrency, false, false) + }) + } + } + + // Step-by-step download: 1KB → 4KB → 16KB → 64KB → 256KB → 1MB → 4MB → 8MB + for _, ps := range payloadSteps { + for _, mode := range []struct { + label string + concurrency int + }{ + {"seq", 1}, + {"c16", 16}, + } { + name := fmt.Sprintf("Download/%s/%s", ps.name, mode.label) + numOps := opsForSize(ps.size, mode.concurrency) + t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) { + payload := makePayload(ps.size) + runThroughputTest(t, impl, name, httpClient, adminURL, volumeID, + payload, numOps, mode.concurrency, true, false) + }) + } + } + + // Mixed read/write at each size + for _, ps := range payloadSteps { + name := fmt.Sprintf("Mixed/%s/c16", ps.name) + numOps := opsForSize(ps.size, 16) + t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) { + payload := makePayload(ps.size) + runThroughputTest(t, impl, name, httpClient, adminURL, volumeID, + payload, numOps, 16, false, true) + }) + } + + // Delete test + t.Run(fmt.Sprintf("%s/Delete/1KB/c16", impl), func(t *testing.T) { + payload := makePayload(1 << 10) + numOps := 1000 + baseKey := uint64(900000) + + for i := 0; i < numOps; i++ { + if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 1, payload); err != nil { + t.Fatalf("pre-upload for delete %d: %v", i, err) + } + } + + var ops atomic.Int64 + var totalLatency atomic.Int64 + + start := time.Now() + var wg sync.WaitGroup + concurrency := 16 + opsPerWorker := numOps / concurrency + + for w := 0; w < concurrency; w++ { + workerBase := baseKey + uint64(w*opsPerWorker) + wg.Add(1) + go func(wb uint64) { + defer wg.Done() + for i := 0; i < opsPerWorker; i++ { + opStart := time.Now() + deleteFile(httpClient, adminURL, volumeID, wb+uint64(i), 1) + totalLatency.Add(time.Since(opStart).Nanoseconds()) + ops.Add(1) + } + }(workerBase) + } + wg.Wait() + elapsed := time.Since(start) + + totalOps := ops.Load() + avgLatencyUs := float64(totalLatency.Load()) / float64(totalOps) / 1000.0 + opsPerSec := float64(totalOps) / elapsed.Seconds() + + t.Logf("RESULT impl=%-4s test=%-22s ops=%-6d errors=%-4d elapsed=%-10s ops/s=%-10.1f avg_lat=%-10.0fus", + impl, "Delete/1KB/c16", totalOps, 0, elapsed.Round(time.Millisecond), opsPerSec, avgLatencyUs) + }) +} + +// runThroughputTest is the shared core for throughput tests. +// keyOffset separates key ranges so concurrent tests in the same volume don't collide. +// keyCounter provides globally unique key ranges. Starts at 1 because key=0 is invalid. +var keyCounter atomic.Uint64 + +func init() { + keyCounter.Store(1) +} + +func runThroughputTest( + t *testing.T, impl, name string, + httpClient *http.Client, adminURL string, volumeID uint32, + payload []byte, numOps, concurrency int, + isDownload, isMixed bool, +) { + t.Helper() + + // Each call gets a unique key range + baseKey := keyCounter.Add(uint64(numOps*2)) - uint64(numOps*2) + + // Pre-upload for download / mixed + if isDownload || isMixed { + for i := 0; i < numOps; i++ { + if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 1, payload); err != nil { + t.Fatalf("pre-upload %d: %v", i, err) + } + } + } + + uploadBase := baseKey + if !isDownload && !isMixed { + uploadBase = baseKey + uint64(numOps) // fresh range for uploads + } + + var ops atomic.Int64 + var errors atomic.Int64 + var totalLatency atomic.Int64 + + start := time.Now() + + var wg sync.WaitGroup + opsPerWorker := numOps / concurrency + remainder := numOps % concurrency + + for w := 0; w < concurrency; w++ { + n := opsPerWorker + if w < remainder { + n++ + } + var workerBase uint64 + if w < remainder { + workerBase = uploadBase + uint64(w*(opsPerWorker+1)) + } else { + workerBase = uploadBase + uint64(remainder*(opsPerWorker+1)) + uint64((w-remainder)*opsPerWorker) + } + + wg.Add(1) + go func(wb uint64, count int) { + defer wg.Done() + for i := 0; i < count; i++ { + key := wb + uint64(i) + opStart := time.Now() + var err error + + if isMixed { + if i%2 == 0 { + err = uploadFile(httpClient, adminURL, volumeID, key, 1, payload) + } else { + err = downloadFile(httpClient, adminURL, volumeID, key, 1) + } + } else if isDownload { + err = downloadFile(httpClient, adminURL, volumeID, key, 1) + } else { + err = uploadFile(httpClient, adminURL, volumeID, key, 1, payload) + } + + totalLatency.Add(time.Since(opStart).Nanoseconds()) + ops.Add(1) + if err != nil { + errors.Add(1) + } + } + }(workerBase, n) + } + + wg.Wait() + elapsed := time.Since(start) + + totalOps := ops.Load() + totalErrs := errors.Load() + avgLatencyUs := float64(totalLatency.Load()) / float64(totalOps) / 1000.0 + opsPerSec := float64(totalOps) / elapsed.Seconds() + throughputMBs := opsPerSec * float64(len(payload)) / (1024 * 1024) + + t.Logf("RESULT impl=%-4s test=%-22s ops=%-6d errors=%-4d elapsed=%-10s ops/s=%-10.1f avg_lat=%-10.0fus throughput=%.2f MB/s", + impl, name, totalOps, totalErrs, elapsed.Round(time.Millisecond), opsPerSec, avgLatencyUs, throughputMBs) +} + +// TestLatencyPercentiles measures p50/p95/p99 latencies for upload and download at each size. +func TestLatencyPercentiles(t *testing.T) { + if testing.Short() { + t.Skip("skipping load test in short mode") + } + + impl := implName() + adminURL, grpcAddr, cleanup := setupCluster(t) + defer cleanup() + + const volumeID = uint32(20) + allocateVolume(t, grpcAddr, volumeID) + + httpClient := &http.Client{ + Timeout: 30 * time.Second, + Transport: &http.Transport{ + MaxIdleConnsPerHost: 64, + MaxConnsPerHost: 64, + }, + } + + latOpsForSize := func(size int) int { + switch { + case size >= 4<<20: + return 30 + case size >= 1<<20: + return 100 + default: + return 300 + } + } + + for _, ps := range payloadSteps { + for _, dl := range []struct { + prefix string + isDownload bool + }{ + {"Upload", false}, + {"Download", true}, + } { + name := fmt.Sprintf("%s/%s", dl.prefix, ps.name) + numOps := latOpsForSize(ps.size) + + t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) { + payload := makePayload(ps.size) + baseKey := keyCounter.Add(uint64(numOps * 2)) + + if dl.isDownload { + for i := 0; i < numOps; i++ { + if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 2, payload); err != nil { + t.Fatalf("pre-upload: %v", err) + } + } + } + + uploadBase := baseKey + if !dl.isDownload { + uploadBase = baseKey + uint64(numOps) + } + + latencies := make([]time.Duration, numOps) + for i := 0; i < numOps; i++ { + key := uploadBase + uint64(i) + start := time.Now() + if dl.isDownload { + downloadFile(httpClient, adminURL, volumeID, key, 2) + } else { + uploadFile(httpClient, adminURL, volumeID, key, 2, payload) + } + latencies[i] = time.Since(start) + } + + sortDurations(latencies) + + p50 := latencies[len(latencies)*50/100] + p95 := latencies[len(latencies)*95/100] + p99 := latencies[len(latencies)*99/100] + min := latencies[0] + max := latencies[len(latencies)-1] + + t.Logf("RESULT impl=%-4s test=%-20s n=%-4d min=%-10s p50=%-10s p95=%-10s p99=%-10s max=%-10s", + impl, name, numOps, min.Round(time.Microsecond), p50.Round(time.Microsecond), p95.Round(time.Microsecond), p99.Round(time.Microsecond), max.Round(time.Microsecond)) + }) + } + } +} + +func sortDurations(d []time.Duration) { + sort.Slice(d, func(i, j int) bool { return d[i] < d[j] }) +} + +// TestSustainedP99 runs high-concurrency load for a sustained period (default 60s, +// override with LOADTEST_DURATION=120s) and reports p50/p95/p99/p999 latencies. +// This reveals tail latency differences that short tests miss (GC pauses, lock contention, etc). +// +// Run: +// go test -v -count=1 -timeout 600s -run TestSustainedP99 ./test/volume_server/loadtest/... +// VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 600s -run TestSustainedP99 ./test/volume_server/loadtest/... +// LOADTEST_DURATION=120s VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 600s -run TestSustainedP99 ./test/volume_server/loadtest/... +func TestSustainedP99(t *testing.T) { + if testing.Short() { + t.Skip("skipping sustained load test in short mode") + } + + duration := 60 * time.Second + if d := os.Getenv("LOADTEST_DURATION"); d != "" { + parsed, err := time.ParseDuration(d) + if err == nil && parsed > 0 { + duration = parsed + } + } + + impl := implName() + adminURL, grpcAddr, cleanup := setupCluster(t) + defer cleanup() + + httpClient := &http.Client{ + Timeout: 30 * time.Second, + Transport: &http.Transport{ + MaxIdleConnsPerHost: 128, + MaxConnsPerHost: 128, + }, + } + + type scenario struct { + name string + size int + concurrency int + isDownload bool + } + + scenarios := []scenario{ + {"Upload/1KB/c16", 1 << 10, 16, false}, + {"Upload/64KB/c16", 64 << 10, 16, false}, + {"Download/1KB/c16", 1 << 10, 16, true}, + {"Download/64KB/c16", 64 << 10, 16, true}, + } + + var nextVolID atomic.Uint32 + nextVolID.Store(30) + + for _, sc := range scenarios { + t.Run(fmt.Sprintf("%s/%s", impl, sc.name), func(t *testing.T) { + // Each scenario gets its own volume to avoid filling up + volumeID := nextVolID.Add(1) - 1 + allocateVolume(t, grpcAddr, volumeID) + + payload := makePayload(sc.size) + + // Pre-upload a pool of files for download tests + poolSize := 500 + baseKey := keyCounter.Add(uint64(poolSize*2)) - uint64(poolSize*2) + + if sc.isDownload { + t.Logf("Pre-uploading %d files for download test...", poolSize) + for i := 0; i < poolSize; i++ { + if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 3, payload); err != nil { + t.Fatalf("pre-upload %d: %v", i, err) + } + } + } + + // Collect latencies from all workers + type latencyBucket struct { + mu sync.Mutex + latencies []time.Duration + } + bucket := &latencyBucket{ + latencies: make([]time.Duration, 0, 100000), + } + + var totalOps atomic.Int64 + var totalErrors atomic.Int64 + + deadline := time.Now().Add(duration) + start := time.Now() + + // For uploads, pre-seed the pool so subsequent writes are overwrites (no volume fill) + if !sc.isDownload { + t.Logf("Pre-seeding %d files for upload overwrite test...", poolSize) + for i := 0; i < poolSize; i++ { + if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 3, payload); err != nil { + t.Fatalf("pre-seed %d: %v", i, err) + } + } + } + + var wg sync.WaitGroup + for w := 0; w < sc.concurrency; w++ { + wg.Add(1) + go func(workerID int) { + defer wg.Done() + localLats := make([]time.Duration, 0, 8192) + + var i uint64 + for time.Now().Before(deadline) { + // Cycle through the pool to avoid filling up the volume + key := baseKey + uint64(int(i)%poolSize) + + opStart := time.Now() + var err error + if sc.isDownload { + err = downloadFile(httpClient, adminURL, volumeID, key, 3) + } else { + err = uploadFile(httpClient, adminURL, volumeID, key, 3, payload) + } + lat := time.Since(opStart) + + localLats = append(localLats, lat) + totalOps.Add(1) + if err != nil { + totalErrors.Add(1) + } + i++ + + // Flush local buffer periodically + if len(localLats) >= 8192 { + bucket.mu.Lock() + bucket.latencies = append(bucket.latencies, localLats...) + bucket.mu.Unlock() + localLats = localLats[:0] + } + } + // Final flush + if len(localLats) > 0 { + bucket.mu.Lock() + bucket.latencies = append(bucket.latencies, localLats...) + bucket.mu.Unlock() + } + }(w) + } + + wg.Wait() + elapsed := time.Since(start) + + lats := bucket.latencies + n := len(lats) + ops := totalOps.Load() + errs := totalErrors.Load() + opsPerSec := float64(ops) / elapsed.Seconds() + + sortDurations(lats) + + pct := func(p float64) time.Duration { + idx := int(float64(n) * p / 100.0) + if idx >= n { + idx = n - 1 + } + return lats[idx] + } + + t.Logf("RESULT impl=%-4s test=%-22s duration=%-6s ops=%-8d errors=%-4d ops/s=%-10.1f", + impl, sc.name, elapsed.Round(time.Second), ops, errs, opsPerSec) + t.Logf(" p50=%-10s p90=%-10s p95=%-10s p99=%-10s p999=%-10s max=%-10s", + pct(50).Round(time.Microsecond), + pct(90).Round(time.Microsecond), + pct(95).Round(time.Microsecond), + pct(99).Round(time.Microsecond), + pct(99.9).Round(time.Microsecond), + lats[n-1].Round(time.Microsecond)) + }) + } +} diff --git a/test/volume_server/matrix/config_profiles.go b/test/volume_server/matrix/config_profiles.go index c359eb029..e01e35fc1 100644 --- a/test/volume_server/matrix/config_profiles.go +++ b/test/volume_server/matrix/config_profiles.go @@ -12,6 +12,7 @@ type Profile struct { EnableJWT bool JWTSigningKey string JWTReadKey string + EnableUIAccess bool EnableMaintain bool ConcurrentUploadLimitMB int diff --git a/test/volume_server/rust/rust_volume_test.go b/test/volume_server/rust/rust_volume_test.go new file mode 100644 index 000000000..6f1a0b74a --- /dev/null +++ b/test/volume_server/rust/rust_volume_test.go @@ -0,0 +1,310 @@ +package volume_server_rust_test + +import ( + "context" + "encoding/json" + "net/http" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" +) + +func mustNewRequest(t testing.TB, method, url string) *http.Request { + t.Helper() + req, err := http.NewRequest(method, url, nil) + if err != nil { + t.Fatalf("create request %s %s: %v", method, url, err) + } + return req +} + +// TestRustHealthzEndpoint verifies that the Rust volume server responds to +// GET /healthz with HTTP 200. +func TestRustHealthzEndpoint(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/healthz")) + _ = framework.ReadAllAndClose(t, resp) + + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected /healthz 200, got %d", resp.StatusCode) + } +} + +// TestRustStatusEndpoint verifies that GET /status returns 200 with a JSON +// body containing a "version" field. The Rust server uses lowercase field +// names in its axum JSON responses. +func TestRustStatusEndpoint(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/status")) + body := framework.ReadAllAndClose(t, resp) + + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected /status 200, got %d, body: %s", resp.StatusCode, string(body)) + } + + var payload map[string]interface{} + if err := json.Unmarshal(body, &payload); err != nil { + t.Fatalf("decode /status JSON: %v", err) + } + + if _, ok := payload["Version"]; !ok { + t.Fatalf("/status JSON missing \"Version\" field, keys: %v", keys(payload)) + } +} + +// TestRustPingRPC verifies the gRPC Ping RPC returns non-zero timestamps. +func TestRustPingRPC(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + resp, err := client.Ping(ctx, &volume_server_pb.PingRequest{}) + if err != nil { + t.Fatalf("Ping RPC failed: %v", err) + } + if resp.GetStartTimeNs() == 0 { + t.Fatalf("Ping StartTimeNs should be non-zero") + } + if resp.GetStopTimeNs() == 0 { + t.Fatalf("Ping StopTimeNs should be non-zero") + } + if resp.GetStopTimeNs() < resp.GetStartTimeNs() { + t.Fatalf("Ping StopTimeNs (%d) should be >= StartTimeNs (%d)", resp.GetStopTimeNs(), resp.GetStartTimeNs()) + } +} + +// TestRustAllocateAndWriteReadDelete exercises the full needle lifecycle: +// allocate a volume via gRPC, upload bytes via HTTP POST, read them back +// via HTTP GET, delete via HTTP DELETE, then confirm GET returns 404. +func TestRustAllocateAndWriteReadDelete(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(1) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + httpClient := framework.NewHTTPClient() + fid := framework.NewFileID(volumeID, 1001, 0xAABBCCDD) + data := []byte("rust-volume-server-integration-test-payload") + + // Upload + uploadResp := framework.UploadBytes(t, httpClient, cluster.VolumeAdminURL(), fid, data) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + // Read back + getResp := framework.ReadBytes(t, httpClient, cluster.VolumeAdminURL(), fid) + getBody := framework.ReadAllAndClose(t, getResp) + if getResp.StatusCode != http.StatusOK { + t.Fatalf("read expected 200, got %d", getResp.StatusCode) + } + if string(getBody) != string(data) { + t.Fatalf("read body mismatch: got %q, want %q", string(getBody), string(data)) + } + + // Delete + deleteResp := framework.DoRequest(t, httpClient, mustNewRequest(t, http.MethodDelete, cluster.VolumeAdminURL()+"/"+fid)) + _ = framework.ReadAllAndClose(t, deleteResp) + if deleteResp.StatusCode != http.StatusAccepted && deleteResp.StatusCode != http.StatusOK { + t.Fatalf("delete expected 202 or 200, got %d", deleteResp.StatusCode) + } + + // Verify 404 after delete + gone := framework.ReadBytes(t, httpClient, cluster.VolumeAdminURL(), fid) + _ = framework.ReadAllAndClose(t, gone) + if gone.StatusCode != http.StatusNotFound { + t.Fatalf("read after delete expected 404, got %d", gone.StatusCode) + } +} + +// TestRustVolumeLifecycle tests the volume admin gRPC lifecycle: +// allocate, check status, unmount, mount, delete. +func TestRustVolumeLifecycle(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + const volumeID = uint32(2) + framework.AllocateVolume(t, client, volumeID, "") + + // VolumeStatus should succeed on a freshly allocated volume. + statusResp, err := client.VolumeStatus(ctx, &volume_server_pb.VolumeStatusRequest{VolumeId: volumeID}) + if err != nil { + t.Fatalf("VolumeStatus failed: %v", err) + } + if statusResp.GetFileCount() != 0 { + t.Fatalf("new volume should be empty, got file_count=%d", statusResp.GetFileCount()) + } + + // Unmount then remount. + if _, err = client.VolumeUnmount(ctx, &volume_server_pb.VolumeUnmountRequest{VolumeId: volumeID}); err != nil { + t.Fatalf("VolumeUnmount failed: %v", err) + } + if _, err = client.VolumeMount(ctx, &volume_server_pb.VolumeMountRequest{VolumeId: volumeID}); err != nil { + t.Fatalf("VolumeMount failed: %v", err) + } + + // Delete. + if _, err = client.VolumeDelete(ctx, &volume_server_pb.VolumeDeleteRequest{VolumeId: volumeID, OnlyEmpty: true}); err != nil { + t.Fatalf("VolumeDelete failed: %v", err) + } + + // VolumeStatus should fail after delete. + _, err = client.VolumeStatus(ctx, &volume_server_pb.VolumeStatusRequest{VolumeId: volumeID}) + if err == nil { + t.Fatalf("VolumeStatus should fail after delete") + } +} + +// TestRustGetSetState verifies GetState returns a non-nil state and SetState +// echoes the state back. +func TestRustGetSetState(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + // GetState should return non-nil state. + getResp, err := client.GetState(ctx, &volume_server_pb.GetStateRequest{}) + if err != nil { + t.Fatalf("GetState failed: %v", err) + } + if getResp.GetState() == nil { + t.Fatalf("GetState returned nil state") + } + + // SetState should echo back the state. + setResp, err := client.SetState(ctx, &volume_server_pb.SetStateRequest{ + State: &volume_server_pb.VolumeServerState{ + Version: getResp.GetState().GetVersion(), + }, + }) + if err != nil { + t.Fatalf("SetState failed: %v", err) + } + if setResp.GetState() == nil { + t.Fatalf("SetState returned nil state") + } + if setResp.GetState().GetVersion() < getResp.GetState().GetVersion() { + t.Fatalf("SetState version should not decrease: got %d, had %d", + setResp.GetState().GetVersion(), getResp.GetState().GetVersion()) + } +} + +// TestRustVolumeServerStatus verifies VolumeServerStatus returns a version +// string and at least one disk status entry. +func TestRustVolumeServerStatus(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + resp, err := client.VolumeServerStatus(ctx, &volume_server_pb.VolumeServerStatusRequest{}) + if err != nil { + t.Fatalf("VolumeServerStatus failed: %v", err) + } + if resp.GetVersion() == "" { + t.Fatalf("VolumeServerStatus returned empty version") + } + if len(resp.GetDiskStatuses()) == 0 { + t.Fatalf("VolumeServerStatus returned no disk statuses") + } +} + +// TestRustMetricsEndpointIsNotOnAdminPortByDefault verifies that the default +// volume admin listener does not expose Prometheus metrics. Go serves metrics +// only on the dedicated metrics listener when -metricsPort is configured. +func TestRustMetricsEndpointIsNotOnAdminPortByDefault(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/metrics")) + body := framework.ReadAllAndClose(t, resp) + + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("expected admin /metrics 400 when metricsPort is unset, got %d body=%s", resp.StatusCode, string(body)) + } +} + +func TestRustUiAccessOverrideIgnoresReadJwt(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + profile := matrix.P3() + profile.EnableUIAccess = true + + cluster := framework.StartRustVolumeCluster(t, profile) + client := framework.NewHTTPClient() + + resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/ui/index.html")) + body := framework.ReadAllAndClose(t, resp) + + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected /ui/index.html 200 with access.ui override, got %d body=%s", resp.StatusCode, string(body)) + } + if len(body) == 0 { + t.Fatalf("expected non-empty UI response body") + } +} + +// keys returns the keys of a map for diagnostic messages. +func keys(m map[string]interface{}) []string { + ks := make([]string, 0, len(m)) + for k := range m { + ks = append(ks, k) + } + return ks +} diff --git a/weed/pb/Makefile b/weed/pb/Makefile index ad90e1fe5..94f5f668d 100644 --- a/weed/pb/Makefile +++ b/weed/pb/Makefile @@ -18,6 +18,7 @@ gen: protoc plugin.proto --go_out=./plugin_pb --go-grpc_out=./plugin_pb --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative # protoc filer.proto --java_out=../../other/java/client/src/main/java cp filer.proto ../../other/java/client/src/main/proto + cp volume_server.proto master.proto remote.proto ../../seaweed-volume/proto/ fbs: flatc --go -o . --go-namespace message_fbs message.fbs diff --git a/weed/storage/volume.go b/weed/storage/volume.go index 48149f4d9..468491c10 100644 --- a/weed/storage/volume.go +++ b/weed/storage/volume.go @@ -94,7 +94,7 @@ func (v *Volume) IndexFileName() (fileName string) { func (v *Volume) FileName(ext string) (fileName string) { switch ext { - case ".idx", ".cpx", ".ldb", ".cpldb": + case ".idx", ".cpx", ".ldb", ".cpldb", ".rdb": return VolumeFileName(v.dirIdx, v.Collection, int(v.Id)) + ext } // .dat, .cpd, .vif diff --git a/weed/storage/volume_vacuum.go b/weed/storage/volume_vacuum.go index e97342597..c5027204c 100644 --- a/weed/storage/volume_vacuum.go +++ b/weed/storage/volume_vacuum.go @@ -201,6 +201,7 @@ func (v *Volume) CommitCompact() error { //time.Sleep(20 * time.Second) os.RemoveAll(v.FileName(".ldb")) + os.Remove(v.FileName(".rdb")) glog.V(3).Infof("Loading volume %d commit file...", v.Id) if e = v.load(true, false, v.needleMapKind, 0, v.Version()); e != nil { diff --git a/weed/storage/volume_write.go b/weed/storage/volume_write.go index 2f832e1f7..e78bb2b3a 100644 --- a/weed/storage/volume_write.go +++ b/weed/storage/volume_write.go @@ -99,6 +99,8 @@ func removeVolumeFiles(filename string) { os.Remove(filename + ".cpx") // level db index file os.RemoveAll(filename + ".ldb") + // redb index file (Rust volume server) + os.Remove(filename + ".rdb") // marker for damaged or incomplete volume os.Remove(filename + ".note") }