#version 450

/*
 * VKZip GPU Decompression Shader
 *
 * Decompresses blocks compressed by the compression shader.
 * Each workgroup decompresses one block.
 *
 * Token format (matching compress.comp):
 *   - Literal: [0x00] [byte]               → emit 1 byte
 *   - Match:   [0x01] [len:u16] [dist:u16] → copy `len` bytes from `pos-dist`
 *
 * Both u16 fields are read low byte first.
 *
 * A block whose compressed size equals its original size is treated as
 * stored raw and is copied through unchanged.
 *
 * Decoding of a block stops early (leaving the remainder of its output
 * untouched) when the token stream is malformed: a token whose payload
 * would extend past the end of the compressed block, a match that points
 * before the start of the block's output, or an unknown token tag.
 */

layout(local_size_x = 256) in;

// Token tags of the compressed stream.
const uint TOKEN_LITERAL = 0x00u;
const uint TOKEN_MATCH   = 0x01u;

// ── Push constants ─────────────────────────────────────────────────
layout(push_constant) uniform PushConstants {
    uint block_count;   // number of entries in meta.blocks to process
    uint block_size;    // not read by this shader
    uint _pad1;
    uint _pad2;
} params;

// ── Buffers ────────────────────────────────────────────────────────

// Input: compressed data (all blocks concatenated)
layout(std430, set = 0, binding = 0) readonly buffer InputBuffer {
    uint data[];
} input_buf;

// Output: decompressed data (read-write: needs read for match back-references)
layout(std430, set = 0, binding = 1) buffer OutputBuffer {
    uint data[];
} output_buf;

// Block metadata: [block_idx] = { in_offset, in_size(compressed), out_offset, out_size(original) }
layout(std430, set = 0, binding = 2) buffer MetadataBuffer {
    uvec4 blocks[];
} meta;

// ── Helper: read a byte from packed uint buffer ────────────────────
// Returns byte number (base_offset + byte_idx) of the input buffer, where
// byte 0 of each uint is its least significant 8 bits.
uint read_byte_in(uint base_offset, uint byte_idx) {
    uint addr = base_offset + byte_idx;
    return (input_buf.data[addr >> 2] >> ((addr & 3u) * 8u)) & 0xFFu;
}

// ── Helper: read a 16-bit value (low byte first) from the input ────
uint read_u16_in(uint base_offset, uint byte_idx) {
    return read_byte_in(base_offset, byte_idx)
         | (read_byte_in(base_offset, byte_idx + 1u) << 8);
}

// ── Helper: append one decoded byte to the block's output ──────────
// Bytes are accumulated in `current_word` and stored as a whole uint once
// the word is complete or the last byte of the block has been produced.
// The caller is responsible for advancing out_pos afterwards.
//
// NOTE(review): the lane is derived from out_pos while the word index is
// derived from out_offset + out_pos, so this path is only correct when
// out_offset is a multiple of 4 — confirm the host guarantees that.
void emit_byte(uint out_offset, uint out_pos, uint out_size, uint b,
               inout uint current_word) {
    uint lane = out_pos & 3u;
    current_word |= b << (lane * 8u);

    if (lane == 3u || out_pos == out_size - 1u) {
        output_buf.data[(out_offset + out_pos) >> 2] = current_word;
        current_word = 0u;
    }
}

// ── Raw block copy, spread across the whole workgroup ──────────────
// Every invocation copies the bytes i = thread_id, thread_id + 256, ...
// Each byte is OR-ed atomically into its destination word so invocations
// that share a word do not clobber each other's lanes.
//
// NOTE(review): OR-ing only yields the right bytes if the destination
// words start out as zero — confirm the host clears the output buffer.
void copy_raw_block(uint in_offset, uint out_offset, uint size, uint thread_id) {
    for (uint i = thread_id; i < size; i += gl_WorkGroupSize.x) {
        uint b = read_byte_in(in_offset, i);

        // Word index and lane must both come from the absolute destination
        // address, otherwise a non-word-aligned out_offset puts the byte in
        // the wrong lane.
        uint dst = out_offset + i;
        atomicOr(output_buf.data[dst >> 2], b << ((dst & 3u) * 8u));
    }
}

// ── Sequential token decoder (run by a single invocation) ──────────
void decode_block(uint in_offset, uint in_size, uint out_offset, uint out_size) {
    uint in_pos = 0u;
    uint out_pos = 0u;
    uint current_word = 0u;   // output bytes not yet stored to output_buf

    while (in_pos < in_size && out_pos < out_size) {
        uint token = read_byte_in(in_offset, in_pos);
        in_pos++;

        if (token == TOKEN_LITERAL) {
            // The literal's payload byte must still lie inside this block.
            if (in_pos >= in_size) break;

            uint b = read_byte_in(in_offset, in_pos);
            in_pos++;

            emit_byte(out_offset, out_pos, out_size, b, current_word);
            out_pos++;
        } else if (token == TOKEN_MATCH) {
            // The 4-byte match header must still lie inside this block.
            // (in_pos <= in_size here, so the subtraction cannot wrap.)
            if (in_size - in_pos < 4u) break;

            uint match_len  = read_u16_in(in_offset, in_pos);
            uint match_dist = read_u16_in(in_offset, in_pos + 2u);
            in_pos += 4u;

            // A distance larger than what has been produced so far would
            // wrap copy_src around and read outside this block's output.
            if (match_dist > out_pos) break;

            // NOTE(review): match_dist == 0 is still accepted and copies the
            // not-yet-written lane of current_word (i.e. zero bytes);
            // confirm whether compress.comp can ever emit it.
            uint copy_src = out_pos - match_dist;

            for (uint i = 0u; i < match_len && out_pos < out_size; i++) {
                uint src_byte_idx = copy_src + i;
                uint src_word;

                if (src_byte_idx >= (out_pos & ~3u)) {
                    // Source byte is in the word still being accumulated.
                    src_word = current_word;
                } else {
                    // Source byte was already stored to the output buffer.
                    src_word = output_buf.data[(out_offset + src_byte_idx) >> 2];
                }

                uint b = (src_word >> ((src_byte_idx & 3u) * 8u)) & 0xFFu;

                emit_byte(out_offset, out_pos, out_size, b, current_word);
                out_pos++;
            }
        } else {
            // Unknown tag: the stream cannot be interpreted any further.
            break;
        }
    }

    // If decoding stopped before the block was complete, bytes gathered in
    // current_word have not been stored yet; write them out so they are
    // not lost.
    if (out_pos < out_size && (out_pos & 3u) != 0u) {
        output_buf.data[(out_offset + out_pos) >> 2] = current_word;
    }
}

void main() {
    uint block_idx = gl_WorkGroupID.x;
    uint thread_id = gl_LocalInvocationID.x;

    if (block_idx >= params.block_count) return;

    uvec4 block = meta.blocks[block_idx];
    uint in_offset  = block.x;
    uint in_size    = block.y;
    uint out_offset = block.z;
    uint out_size   = block.w;

    // If the block was stored raw (uncompressed), just copy it.
    if (in_size == out_size) {
        copy_raw_block(in_offset, out_offset, in_size, thread_id);
        return;
    }

    // Token decoding is inherently sequential: only invocation 0 runs it.
    if (thread_id != 0u) return;

    decode_block(in_offset, in_size, out_offset, out_size);
}