#version 450 /* * VKZip GPU Compression Shader * * Each workgroup compresses one independent block using a simplified * LZ77 variant optimized for GPU parallelism: * * Algorithm: * 1. Each thread scans a portion of the block for matches using hash chains * 2. Matches are encoded as (distance, length) pairs * 3. Non-matching bytes are stored as literals * 4. Output format per token: * - Literal: [0x00] [byte] * - Match: [0x01] [length: u16] [distance: u16] * * This is a simplified approach focusing on parallel match finding. * The compression ratio won't match gzip/zstd, but the speed * advantage from GPU parallelism makes up for it on large files. */ layout(local_size_x = 256) in; // ── Push constants ───────────────────────────────────────────────── layout(push_constant) uniform PushConstants { uint block_count; // Total number of blocks to process uint block_size; // Size of each block (e.g., 65536) uint max_match_len; // Maximum match length uint window_size; // Sliding window size } params; // ── Buffers ──────────────────────────────────────────────────────── // Input: raw uncompressed data (all blocks concatenated) layout(std430, set = 0, binding = 0) readonly buffer InputBuffer { uint data[]; } input_buf; // Output: compressed data (pre-allocated with worst-case size) layout(std430, set = 0, binding = 1) writeonly buffer OutputBuffer { uint data[]; } output_buf; // Block metadata: [block_idx] = { input_offset, input_size, output_offset, output_size } layout(std430, set = 0, binding = 2) buffer MetadataBuffer { uvec4 blocks[]; // x=in_offset, y=in_size, z=out_offset, w=out_size(result) } meta; // ── Shared memory for workgroup ──────────────────────────────────── shared uint s_hash_table[4096]; // Hash table for match finding shared uint s_output_pos; // Current output position (atomic) // ── Helper: read a byte from packed uint buffer ──────────────────── uint read_byte(uint base_offset, uint byte_idx) { uint word_idx = (base_offset + byte_idx) >> 2; uint byte_pos = (base_offset + byte_idx) & 3; return (input_buf.data[word_idx] >> (byte_pos * 8)) & 0xFF; } // ── Helper: write a byte to packed uint buffer ───────────────────── void write_byte(uint base_offset, uint byte_idx, uint value) { uint word_idx = (base_offset + byte_idx) >> 2; uint byte_pos = (base_offset + byte_idx) & 3; atomicOr(output_buf.data[word_idx], (value & 0xFF) << (byte_pos * 8)); } // ── Hash function for string matching ────────────────────────────── uint hash3(uint base_offset, uint pos) { uint b0 = read_byte(base_offset, pos); uint b1 = read_byte(base_offset, pos + 1); uint b2 = read_byte(base_offset, pos + 2); return ((b0 << 16) ^ (b1 << 8) ^ b2) & 0xFFF; } void main() { uint block_idx = gl_WorkGroupID.x; uint thread_id = gl_LocalInvocationID.x; if (block_idx >= params.block_count) return; uint in_offset = meta.blocks[block_idx].x; uint in_size = meta.blocks[block_idx].y; uint out_offset = meta.blocks[block_idx].z; // Initialize shared memory if (thread_id < 256) { for (uint i = thread_id; i < 4096; i += 256) { s_hash_table[i] = 0xFFFFFFFF; } } if (thread_id == 0) { s_output_pos = 0; } barrier(); memoryBarrierShared(); // ── Single-threaded compression for correctness ───────────── // Thread 0 does the sequential LZ77 compression. // Other threads could assist with parallel hash updates in a // more advanced version. if (thread_id == 0) { uint pos = 0; uint out_pos = 0; while (pos < in_size) { uint best_len = 0; uint best_dist = 0; // Try to find a match (need at least 3 bytes remaining) if (pos + 2 < in_size) { uint h = hash3(in_offset, pos); uint match_pos = s_hash_table[h]; // Scan hash chain for matches if (match_pos != 0xFFFFFFFF && pos > match_pos) { uint dist = pos - match_pos; if (dist <= params.window_size && dist > 0) { // Count matching bytes uint len = 0; uint max_len = min(params.max_match_len, in_size - pos); while (len < max_len && read_byte(in_offset, match_pos + len) == read_byte(in_offset, pos + len)) { len++; } if (len >= 3) { best_len = len; best_dist = dist; } } } // Update hash table s_hash_table[h] = pos; } if (best_len >= 3) { // Write match token: [0x01] [len_lo] [len_hi] [dist_lo] [dist_hi] write_byte(out_offset, out_pos++, 0x01); write_byte(out_offset, out_pos++, best_len & 0xFF); write_byte(out_offset, out_pos++, (best_len >> 8) & 0xFF); write_byte(out_offset, out_pos++, best_dist & 0xFF); write_byte(out_offset, out_pos++, (best_dist >> 8) & 0xFF); pos += best_len; } else { // Write literal token: [0x00] [byte] uint b = read_byte(in_offset, pos); write_byte(out_offset, out_pos++, 0x00); write_byte(out_offset, out_pos++, b); pos++; } } // Store output size meta.blocks[block_idx].w = out_pos; } barrier(); }