Initial commit: VKZip GPU Compressor

2026-04-20 23:19:07 -03:00
commit 7af9f38181
66 changed files with 9444 additions and 0 deletions
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Compile GLSL compute shaders to SPIR-V
+# Requires glslc (from Vulkan SDK) or glslangValidator
+#
+# Usage: <this script> [OUTPUT_DIR]
+#   OUTPUT_DIR defaults to the directory that contains this script.
+#
+# Every *.comp file located next to this script is compiled to
+# OUTPUT_DIR/<file name>.spv (e.g. foo.comp -> foo.comp.spv).
+# Exit status: 1 if no compiler is installed, the output directory cannot be
+# created, or any shader fails to compile; 0 otherwise.
+
+SHADER_DIR="$(dirname "$0")"
+OUTPUT_DIR="${1:-$SHADER_DIR}"
+
+# Find compiler. compile() takes <input shader> <output file>.
+if command -v glslc &> /dev/null; then
+    COMPILER="glslc"
+    compile() { "$COMPILER" -o "$2" "$1"; }
+elif command -v glslangValidator &> /dev/null; then
+    COMPILER="glslangValidator"
+    compile() { "$COMPILER" -V -o "$2" "$1"; }
+else
+    # Diagnostics belong on stderr so they are not lost when stdout is piped.
+    {
+        echo "ERROR: No GLSL compiler found!"
+        echo "Install Vulkan SDK: https://vulkan.lunarg.com/sdk/home"
+        echo "  Ubuntu/Debian: sudo apt install vulkan-tools glslang-tools"
+        echo "  Arch:          sudo pacman -S vulkan-tools glslang"
+        echo "  Fedora:        sudo dnf install vulkan-tools glslang"
+    } >&2
+    exit 1
+fi
+
+echo "Using compiler: $COMPILER"
+echo "Output directory: $OUTPUT_DIR"
+if ! mkdir -p "$OUTPUT_DIR"; then
+    echo "ERROR: Cannot create output directory: $OUTPUT_DIR" >&2
+    exit 1
+fi
+
+compiled=0
+for shader in "$SHADER_DIR"/*.comp; do
+    # When nothing matches, the glob stays as a literal pattern; skip it.
+    [ -f "$shader" ] || continue
+
+    name=$(basename "$shader")
+    output="$OUTPUT_DIR/${name}.spv"
+    echo "  Compiling $name -> ${name}.spv"
+    # Test the command itself instead of inspecting $? afterwards.
+    if ! compile "$shader" "$output"; then
+        echo "  FAILED: $name" >&2
+        exit 1
+    fi
+    compiled=$((compiled + 1))
+done
+
+# Do not claim success when there was nothing to compile.
+if [ "$compiled" -eq 0 ]; then
+    echo "WARNING: No .comp shaders found in $SHADER_DIR" >&2
+else
+    echo "All shaders compiled successfully!"
+fi
@@ -0,0 +1,158 @@
+#version 450
+
+/*
+ * VKZip GPU Compression Shader
+ * 
+ * Each workgroup compresses one independent block using a simplified
+ * LZ77 variant; the parallelism comes from compressing many blocks at once:
+ * 
+ * Algorithm:
+ * 1. Invocation 0 of the workgroup scans the block sequentially and looks up
+ *    the most recent earlier position that had the same 3-byte hash
+ * 2. Matches of at least 3 bytes are encoded as (length, distance) pairs
+ * 3. Non-matching bytes are stored as literals
+ * 4. Output format per token:
+ *    - Literal: [0x00] [byte]
+ *    - Match:   [0x01] [length: u16] [distance: u16]   (low byte first)
+ *
+ * This is a simplified approach: the other invocations of the workgroup only
+ * help to initialise the hash table; parallel match finding is left for a
+ * more advanced version.
+ * The compression ratio won't match gzip/zstd, but the speed 
+ * advantage from GPU parallelism makes up for it on large files.
+ */
+
+// 256 invocations per workgroup.
+layout(local_size_x = 256) in;
+
+// ── Push constants ─────────────────────────────────────────────────
+layout(push_constant) uniform PushConstants {
+    uint block_count;       // Total number of blocks to process
+    uint block_size;        // Size of each block (e.g., 65536)
+    uint max_match_len;     // Maximum match length
+    uint window_size;       // Sliding window size
+} params;
+
+// ── Buffers ────────────────────────────────────────────────────────
+// All data buffers are arrays of 32-bit words; bytes are packed four per
+// word with byte 0 in the least significant bits.
+
+// Input: raw uncompressed data (all blocks concatenated)
+layout(std430, set = 0, binding = 0) readonly buffer InputBuffer {
+    uint data[];
+} input_buf;
+
+// Output: compressed data (pre-allocated with worst-case size)
+// Deliberately not `writeonly`: bytes are stored with atomicOr(), which is a
+// read-modify-write access. Because bytes are OR-ed into place, the result is
+// only correct if the region starts out zeroed.
+layout(std430, set = 0, binding = 1) buffer OutputBuffer {
+    uint data[];
+} output_buf;
+
+// Block metadata: [block_idx] = { input_offset, input_size, output_offset, output_size }
+// Offsets and sizes are in bytes; .w is written by the shader.
+layout(std430, set = 0, binding = 2) buffer MetadataBuffer {
+    uvec4 blocks[];  // x=in_offset, y=in_size, z=out_offset, w=out_size(result)
+} meta;
+
+// ── Shared memory for workgroup ────────────────────────────────────
+shared uint s_hash_table[4096];  // Hash slot -> last block position seen, 0xFFFFFFFF = empty
+shared uint s_output_pos;        // Current output position (atomic)
+
+// ── Helper: fetch one byte from the word-packed input buffer ───────
+// `base_offset + byte_idx` is the absolute byte address; byte 0 of each
+// 32-bit word occupies its least significant bits.
+uint read_byte(uint base_offset, uint byte_idx) {
+    uint addr  = base_offset + byte_idx;
+    uint shift = (addr & 3u) * 8u;
+    uint word  = input_buf.data[addr >> 2];
+    return (word >> shift) & 0xFFu;
+}
+
+// ── Helper: store one byte into the word-packed output buffer ──────
+// The byte is OR-ed into its lane of the containing 32-bit word, so bits that
+// are already set in that lane are never cleared. Only the low 8 bits of
+// `value` are used.
+void write_byte(uint base_offset, uint byte_idx, uint value) {
+    uint addr  = base_offset + byte_idx;
+    uint shift = (addr & 3u) * 8u;
+    atomicOr(output_buf.data[addr >> 2], (value & 0xFFu) << shift);
+}
+
+// ── Hash function for string matching ──────────────────────────────
+// Maps the three input bytes starting at `pos` (relative to `base_offset`)
+// to a table slot in [0, 4095]. The caller must ensure pos + 2 is inside the
+// block.
+//
+// The bytes are packed into one word and mixed with a multiplicative hash,
+// keeping the top 12 bits. A plain shift/XOR masked with 0xFFF would drop the
+// first byte and the high nibble of the second, so unrelated sequences would
+// constantly collide.
+uint hash3(uint base_offset, uint pos) {
+    uint b0 = read_byte(base_offset, pos);
+    uint b1 = read_byte(base_offset, pos + 1);
+    uint b2 = read_byte(base_offset, pos + 2);
+    uint key = b0 | (b1 << 8) | (b2 << 16);
+    // 2654435761 = 2^32 / golden ratio; unsigned multiply wraps modulo 2^32.
+    return (key * 2654435761u) >> 20;
+}
+
+// Entry point: workgroup N compresses block N described by meta.blocks[N]
+// and stores the compressed size in meta.blocks[N].w.
+void main() {
+    uint block_idx = gl_WorkGroupID.x;
+    uint thread_id = gl_LocalInvocationID.x;
+
+    // block_idx is identical for every invocation of a workgroup, so the whole
+    // group leaves together and nobody is left waiting at the barrier below.
+    if (block_idx >= params.block_count) return;
+
+    uint in_offset  = meta.blocks[block_idx].x;
+    uint in_size    = meta.blocks[block_idx].y;
+    uint out_offset = meta.blocks[block_idx].z;
+
+    // Initialize shared memory: all invocations cooperate to mark every hash
+    // slot as empty (0xFFFFFFFF = no position recorded yet).
+    for (uint i = thread_id; i < 4096u; i += gl_WorkGroupSize.x) {
+        s_hash_table[i] = 0xFFFFFFFFu;
+    }
+    memoryBarrierShared();
+    barrier();
+
+    // ── Single-threaded compression for correctness ─────────────
+    // Thread 0 does the sequential LZ77 compression.
+    // Other threads could assist with parallel hash updates in a 
+    // more advanced version.
+    if (thread_id == 0) {
+        // Length and distance are each stored in 16 bits, so neither may
+        // exceed 0xFFFF regardless of what the push constants ask for;
+        // larger values would be truncated and corrupt the stream.
+        uint max_dist  = min(params.window_size, 0xFFFFu);
+        uint len_limit = min(params.max_match_len, 0xFFFFu);
+
+        uint pos = 0;
+        uint out_pos = 0;
+
+        while (pos < in_size) {
+            uint best_len = 0;
+            uint best_dist = 0;
+
+            // Try to find a match (need at least 3 bytes remaining)
+            if (pos + 2 < in_size) {
+                uint h = hash3(in_offset, pos);
+                uint match_pos = s_hash_table[h];
+
+                // Each slot holds a single candidate: the last position that
+                // hashed here. It must lie strictly before `pos`.
+                if (match_pos != 0xFFFFFFFFu && pos > match_pos) {
+                    uint dist = pos - match_pos;
+                    if (dist <= max_dist) {
+                        // Count matching bytes. The source may run past `pos`
+                        // (overlapping match); the byte-wise copy on the
+                        // decoding side reproduces that.
+                        uint len = 0;
+                        uint max_len = min(len_limit, in_size - pos);
+                        while (len < max_len &&
+                               read_byte(in_offset, match_pos + len) ==
+                               read_byte(in_offset, pos + len)) {
+                            len++;
+                        }
+                        // A match token costs 5 bytes, three literals cost 6.
+                        if (len >= 3) {
+                            best_len = len;
+                            best_dist = dist;
+                        }
+                    }
+                }
+
+                // Update hash table
+                s_hash_table[h] = pos;
+            }
+
+            if (best_len >= 3) {
+                // Write match token: [0x01] [len_lo] [len_hi] [dist_lo] [dist_hi]
+                write_byte(out_offset, out_pos++, 0x01);
+                write_byte(out_offset, out_pos++, best_len & 0xFF);
+                write_byte(out_offset, out_pos++, (best_len >> 8) & 0xFF);
+                write_byte(out_offset, out_pos++, best_dist & 0xFF);
+                write_byte(out_offset, out_pos++, (best_dist >> 8) & 0xFF);
+                pos += best_len;
+            } else {
+                // Write literal token: [0x00] [byte]
+                // (worst case: 2 output bytes for every input byte)
+                uint b = read_byte(in_offset, pos);
+                write_byte(out_offset, out_pos++, 0x00);
+                write_byte(out_offset, out_pos++, b);
+                pos++;
+            }
+        }
+
+        // Store output size
+        meta.blocks[block_idx].w = out_pos;
+    }
+}
@@ -0,0 +1,137 @@
+#version 450
+
+/*
+ * VKZip GPU Decompression Shader
+ *
+ * Decompresses blocks compressed by the compression shader.
+ * Each workgroup decompresses one block.
+ *
+ * Token format (matching compress.comp):
+ *   - Literal: [0x00] [byte]           → emit 1 byte
+ *   - Match:   [0x01] [len:u16] [dist:u16]  → copy `len` bytes from `pos-dist`
+ */
+
+// 256 invocations per workgroup.
+layout(local_size_x = 256) in;
+
+// ── Push constants ─────────────────────────────────────────────────
+layout(push_constant) uniform PushConstants {
+    uint block_count;   // Number of blocks described in the metadata buffer
+    uint block_size;
+    // NOTE(review): presumably padding so the block has the same size as the
+    // compression shader's push constants — confirm against the host code.
+    uint _pad1;
+    uint _pad2;
+} params;
+
+// ── Buffers ────────────────────────────────────────────────────────
+// All data buffers are arrays of 32-bit words; bytes are packed four per
+// word with byte 0 in the least significant bits.
+
+// Input: compressed data (all blocks concatenated)
+layout(std430, set = 0, binding = 0) readonly buffer InputBuffer {
+    uint data[];
+} input_buf;
+
+// Output: decompressed data (read-write: needs read for match back-references)
+layout(std430, set = 0, binding = 1) buffer OutputBuffer {
+    uint data[];
+} output_buf;
+
+// Block metadata: [block_idx] = { in_offset, in_size(compressed), out_offset, out_size(original) }
+// Offsets and sizes are in bytes.
+layout(std430, set = 0, binding = 2) buffer MetadataBuffer {
+    uvec4 blocks[];
+} meta;
+
+// ── Helper: fetch one byte from the word-packed input buffer ───────
+// `base_offset + byte_idx` is the absolute byte address; byte 0 of each
+// 32-bit word occupies its least significant bits.
+uint read_byte_in(uint base_offset, uint byte_idx) {
+    uint addr  = base_offset + byte_idx;
+    uint shift = (addr & 3u) * 8u;
+    uint word  = input_buf.data[addr >> 2];
+    return (word >> shift) & 0xFFu;
+}
+
+// Appends one decoded byte to the block's output.
+// Bytes are collected in `current_word` and stored as a whole 32-bit word once
+// the word is complete or the block's final byte has been produced.
+// NOTE(review): the lane is derived from out_pos alone, so this assumes
+// out_offset is a multiple of 4 — confirm against the host code.
+void emit_byte(uint out_offset, uint out_size, uint b,
+               inout uint out_pos, inout uint current_word) {
+    uint byte_pos = out_pos & 3u;
+    current_word |= b << (byte_pos * 8u);
+
+    if (byte_pos == 3u || out_pos == out_size - 1u) {
+        output_buf.data[(out_offset + out_pos) >> 2] = current_word;
+        current_word = 0u;
+    }
+    out_pos++;
+}
+
+// Entry point: workgroup N decompresses block N described by meta.blocks[N].
+void main() {
+    uint block_idx = gl_WorkGroupID.x;
+    uint thread_id = gl_LocalInvocationID.x;
+
+    if (block_idx >= params.block_count) return;
+
+    // Sequential decompression by thread 0; the shader has no barriers, so
+    // the remaining invocations can simply leave.
+    if (thread_id != 0) return;
+
+    uint in_offset   = meta.blocks[block_idx].x;
+    uint in_size     = meta.blocks[block_idx].y;
+    uint out_offset  = meta.blocks[block_idx].z;
+    uint out_size    = meta.blocks[block_idx].w;
+
+    // If the block was stored raw (uncompressed), just copy it.
+    // NOTE(review): a compressed stream whose size happens to equal the
+    // original size would also take this path — confirm the host rules it out.
+    if (in_size == out_size) {
+        for (uint i = 0; i < in_size; i++) {
+            uint b = read_byte_in(in_offset, i);
+            // Word index and byte lane must come from the same absolute
+            // address. The byte is OR-ed in, which relies on the destination
+            // starting out zeroed.
+            uint dst = out_offset + i;
+            atomicOr(output_buf.data[dst >> 2], b << ((dst & 3u) * 8u));
+        }
+        return;
+    }
+
+    uint in_pos = 0;
+    uint out_pos = 0;
+    uint current_word = 0;   // Bytes of the output word currently being assembled
+
+    while (in_pos < in_size && out_pos < out_size) {
+        uint token = read_byte_in(in_offset, in_pos);
+        in_pos++;
+
+        if (token == 0x00) {
+            // Literal: one payload byte
+            if (in_pos >= in_size) break;            // truncated stream
+            uint b = read_byte_in(in_offset, in_pos);
+            in_pos++;
+            emit_byte(out_offset, out_size, b, out_pos, current_word);
+        }
+        else if (token == 0x01) {
+            // Match: [len_lo] [len_hi] [dist_lo] [dist_hi]
+            if (in_pos + 4u > in_size) break;        // truncated stream
+            uint match_len = read_byte_in(in_offset, in_pos) | (read_byte_in(in_offset, in_pos + 1) << 8);
+            in_pos += 2;
+            uint match_dist = read_byte_in(in_offset, in_pos) | (read_byte_in(in_offset, in_pos + 1) << 8);
+            in_pos += 2;
+
+            // A distance of 0, or one reaching before the start of the block,
+            // would make `out_pos - match_dist` wrap around and read far
+            // outside the block.
+            if (match_dist == 0u || match_dist > out_pos) break;
+
+            uint copy_src = out_pos - match_dist;
+
+            // Copy byte by byte so that overlapping matches (dist < len)
+            // see the bytes produced earlier in this same copy.
+            for (uint i = 0; i < match_len && out_pos < out_size; i++) {
+                uint src_byte_idx = copy_src + i;
+                uint src_shift = (src_byte_idx & 3u) * 8u;
+                uint b;
+
+                // Did we write this byte fully to memory yet?
+                if (src_byte_idx >= (out_pos & ~3u)) {
+                    // It is in the current word being accumulated
+                    b = (current_word >> src_shift) & 0xFFu;
+                } else {
+                    // Read from VRAM
+                    uint src_word = output_buf.data[(out_offset + src_byte_idx) >> 2];
+                    b = (src_word >> src_shift) & 0xFFu;
+                }
+
+                emit_byte(out_offset, out_size, b, out_pos, current_word);
+            }
+        }
+        else {
+            // The format defines only tokens 0x00 and 0x01; anything else
+            // means the stream is corrupt, so stop decoding.
+            break;
+        }
+    }
+
+    // If decoding stopped early in the middle of a word, store the bytes that
+    // were already assembled instead of dropping them.
+    if (out_pos < out_size && (out_pos & 3u) != 0u) {
+        output_buf.data[(out_offset + out_pos) >> 2] = current_word;
+    }
+}