Initial commit: VKZip GPU Compressor

2026-04-20 23:19:07 -03:00
commit 7af9f38181
66 changed files with 9444 additions and 0 deletions
@@ -0,0 +1,158 @@
+#version 450
+
+/*
+ * VKZip GPU Compression Shader
+ * 
+ * Each workgroup compresses one independent block using a simplified
+ * LZ77 variant optimized for GPU parallelism:
+ * 
+ * Algorithm:
+ * 1. Each thread scans a portion of the block for matches using hash chains
+ * 2. Matches are encoded as (distance, length) pairs
+ * 3. Non-matching bytes are stored as literals
+ * 4. Output format per token:
+ *    - Literal: [0x00] [byte]
+ *    - Match:   [0x01] [length: u16] [distance: u16]
+ *
+ * This is a simplified approach focusing on parallel match finding.
+ * The compression ratio won't match gzip/zstd, but the speed 
+ * advantage from GPU parallelism makes up for it on large files.
+ */
+
+layout(local_size_x = 256) in;
+
+// ── Push constants ─────────────────────────────────────────────────
+layout(push_constant) uniform PushConstants {
+    uint block_count;       // Total number of blocks to process
+    uint block_size;        // Size of each block (e.g., 65536)
+    uint max_match_len;     // Maximum match length
+    uint window_size;       // Sliding window size
+} params;
+
+// ── Buffers ────────────────────────────────────────────────────────
+// Input: raw uncompressed data (all blocks concatenated)
+layout(std430, set = 0, binding = 0) readonly buffer InputBuffer {
+    uint data[];
+} input_buf;
+
+// Output: compressed data (pre-allocated with worst-case size)
+layout(std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
+    uint data[];
+} output_buf;
+
+// Block metadata: [block_idx] = { input_offset, input_size, output_offset, output_size }
+layout(std430, set = 0, binding = 2) buffer MetadataBuffer {
+    uvec4 blocks[];  // x=in_offset, y=in_size, z=out_offset, w=out_size(result)
+} meta;
+
+// ── Shared memory for workgroup ────────────────────────────────────
+shared uint s_hash_table[4096];  // Hash table for match finding
+shared uint s_output_pos;        // Current output position (atomic)
+
+// ── Helper: read a byte from packed uint buffer ────────────────────
+uint read_byte(uint base_offset, uint byte_idx) {
+    uint word_idx = (base_offset + byte_idx) >> 2;
+    uint byte_pos = (base_offset + byte_idx) & 3;
+    return (input_buf.data[word_idx] >> (byte_pos * 8)) & 0xFF;
+}
+
+// ── Helper: write a byte to packed uint buffer ─────────────────────
+void write_byte(uint base_offset, uint byte_idx, uint value) {
+    uint word_idx = (base_offset + byte_idx) >> 2;
+    uint byte_pos = (base_offset + byte_idx) & 3;
+    atomicOr(output_buf.data[word_idx], (value & 0xFF) << (byte_pos * 8));
+}
+
+// ── Hash function for string matching ──────────────────────────────
+uint hash3(uint base_offset, uint pos) {
+    uint b0 = read_byte(base_offset, pos);
+    uint b1 = read_byte(base_offset, pos + 1);
+    uint b2 = read_byte(base_offset, pos + 2);
+    return ((b0 << 16) ^ (b1 << 8) ^ b2) & 0xFFF;
+}
+
+void main() {
+    uint block_idx = gl_WorkGroupID.x;
+    uint thread_id = gl_LocalInvocationID.x;
+
+    if (block_idx >= params.block_count) return;
+
+    uint in_offset  = meta.blocks[block_idx].x;
+    uint in_size    = meta.blocks[block_idx].y;
+    uint out_offset = meta.blocks[block_idx].z;
+
+    // Initialize shared memory
+    if (thread_id < 256) {
+        for (uint i = thread_id; i < 4096; i += 256) {
+            s_hash_table[i] = 0xFFFFFFFF;
+        }
+    }
+    if (thread_id == 0) {
+        s_output_pos = 0;
+    }
+    barrier();
+    memoryBarrierShared();
+
+    // ── Single-threaded compression for correctness ─────────────
+    // Thread 0 does the sequential LZ77 compression.
+    // Other threads could assist with parallel hash updates in a 
+    // more advanced version.
+    if (thread_id == 0) {
+        uint pos = 0;
+        uint out_pos = 0;
+
+        while (pos < in_size) {
+            uint best_len = 0;
+            uint best_dist = 0;
+
+            // Try to find a match (need at least 3 bytes remaining)
+            if (pos + 2 < in_size) {
+                uint h = hash3(in_offset, pos);
+                uint match_pos = s_hash_table[h];
+
+                // Scan hash chain for matches
+                if (match_pos != 0xFFFFFFFF && pos > match_pos) {
+                    uint dist = pos - match_pos;
+                    if (dist <= params.window_size && dist > 0) {
+                        // Count matching bytes
+                        uint len = 0;
+                        uint max_len = min(params.max_match_len, in_size - pos);
+                        while (len < max_len &&
+                               read_byte(in_offset, match_pos + len) ==
+                               read_byte(in_offset, pos + len)) {
+                            len++;
+                        }
+                        if (len >= 3) {
+                            best_len = len;
+                            best_dist = dist;
+                        }
+                    }
+                }
+
+                // Update hash table
+                s_hash_table[h] = pos;
+            }
+
+            if (best_len >= 3) {
+                // Write match token: [0x01] [len_lo] [len_hi] [dist_lo] [dist_hi]
+                write_byte(out_offset, out_pos++, 0x01);
+                write_byte(out_offset, out_pos++, best_len & 0xFF);
+                write_byte(out_offset, out_pos++, (best_len >> 8) & 0xFF);
+                write_byte(out_offset, out_pos++, best_dist & 0xFF);
+                write_byte(out_offset, out_pos++, (best_dist >> 8) & 0xFF);
+                pos += best_len;
+            } else {
+                // Write literal token: [0x00] [byte]
+                uint b = read_byte(in_offset, pos);
+                write_byte(out_offset, out_pos++, 0x00);
+                write_byte(out_offset, out_pos++, b);
+                pos++;
+            }
+        }
+
+        // Store output size
+        meta.blocks[block_idx].w = out_pos;
+    }
+
+    barrier();
+}