// vkzip/shaders/decompress.comp

#version 450

/*
 * VKZip GPU Decompression Shader
 *
 * Decompresses blocks compressed by the compression shader.
 * Each workgroup decompresses one block.
 *
 * Token format (matching compress.comp):
 *   - Literal: [0x00] [byte]           → emit 1 byte
 *   - Match:   [0x01] [len:u16 LE] [dist:u16 LE]  → copy `len` bytes from `pos-dist`
 *              (both u16 fields are stored low byte first)
 */

// Workgroup size: 256 invocations along X. main() maps one workgroup to one
// block (gl_WorkGroupID.x) and gates the decoding work on local invocation 0.
layout(local_size_x = 256) in;

// ── Push constants ─────────────────────────────────────────────────
// Per-dispatch parameters supplied by the host.
layout(push_constant) uniform PushConstants {
    // Number of valid entries in meta.blocks; main() exits early for any
    // workgroup whose gl_WorkGroupID.x is >= this value.
    uint block_count;
    // Not referenced by the code in this shader.
    // NOTE(review): presumably the uncompressed block size in bytes — confirm against the host code.
    uint block_size;
    // Not referenced by the code in this shader.
    // NOTE(review): presumably padding to a 16-byte push-constant range — confirm.
    uint _pad1;
    uint _pad2;
} params;

// ── Buffers ────────────────────────────────────────────────────────
// Input: compressed data (all blocks concatenated).
// Exposed as 32-bit words; read_byte_in() extracts individual bytes, taking
// byte k of a word from bits [8k, 8k+7] (lowest-order byte first).
layout(std430, set = 0, binding = 0) readonly buffer InputBuffer {
    uint data[];
} input_buf;

// Output: decompressed data (read-write: needs read for match back-references).
// Exposed as 32-bit words; main() packs output bytes into words with the same
// lowest-order-byte-first layout used when reading the input.
layout(std430, set = 0, binding = 1) buffer OutputBuffer {
    uint data[];
} output_buf;

// Block metadata: [block_idx] = { in_offset, in_size(compressed), out_offset, out_size(original) }
// As consumed by main():
//   .x = in_offset  — byte offset of the block's data inside input_buf
//   .y = in_size    — number of input bytes belonging to the block
//   .z = out_offset — byte offset of the block's data inside output_buf
//   .w = out_size   — number of output bytes the block produces
// main() treats a block with in_size == out_size as stored raw (uncompressed).
// The visible shader code only reads this buffer, although it is not declared readonly.
layout(std430, set = 0, binding = 2) buffer MetadataBuffer {
    uvec4 blocks[];
} meta;

// ── Helper: read a byte from packed uint buffer ────────────────────
// Fetches a single byte from the compressed input buffer.
//   base_offset — byte offset inside input_buf at which the block starts
//   byte_idx    — byte index relative to base_offset
// Returns the byte in the low 8 bits of the result. Within each 32-bit word
// the byte at the lowest address occupies the least-significant bits.
uint read_byte_in(uint base_offset, uint byte_idx) {
    uint abs_byte = base_offset + byte_idx;
    uint packed   = input_buf.data[abs_byte / 4u];
    uint shift    = (abs_byte % 4u) * 8u;
    return (packed >> shift) & 0xFFu;
}

// Appends one decoded byte to the current block's output.
//   out_offset   — byte offset of the block inside output_buf
//   out_size     — total number of bytes the block produces
//   b            — byte value to append (must already be masked to 8 bits)
//   out_pos      — block-relative write position; advanced by one
//   current_word — 32-bit word being assembled; stored to output_buf and reset
//                  when its last byte lane is filled or the block's final byte
//                  has been written
// NOTE: the byte lane is derived from the block-relative position while the
// word index uses the absolute position, so out_offset is assumed to be a
// multiple of 4 — confirm the host guarantees this.
void emit_output_byte(uint out_offset, uint out_size, uint b,
                      inout uint out_pos, inout uint current_word) {
    uint lane = out_pos & 3u;
    current_word |= b << (lane * 8u);

    if (lane == 3u || out_pos == out_size - 1u) {
        output_buf.data[(out_offset + out_pos) >> 2] = current_word;
        current_word = 0u;
    }
    out_pos++;
}

// Entry point: decompresses the block selected by gl_WorkGroupID.x.
//
// The token stream is decoded sequentially by local invocation 0; every other
// invocation of the workgroup returns immediately.
//
// Blocks whose compressed size equals their original size are treated as raw
// and copied through unchanged. Otherwise the token stream is decoded:
//   0x00 <byte>                      literal
//   0x01 <len lo,hi> <dist lo,hi>    copy `len` bytes starting `dist` bytes
//                                    behind the current output position
// Decoding stops at the first sign of a malformed stream (truncated token,
// unknown token, or a match distance that is zero or reaches before the start
// of the block); bytes decoded up to that point are still written out.
void main() {
    uint block_idx = gl_WorkGroupID.x;
    uint thread_id = gl_LocalInvocationID.x;

    if (block_idx >= params.block_count) return;

    // Each token depends on the output produced by earlier ones, so a single
    // invocation does the work.
    if (thread_id != 0u) return;

    uvec4 block_info = meta.blocks[block_idx];
    uint in_offset   = block_info.x;
    uint in_size     = block_info.y;
    uint out_offset  = block_info.z;
    uint out_size    = block_info.w;

    uint out_pos = 0u;
    uint current_word = 0u;

    // If the block was stored raw (uncompressed), just copy it. Whole words
    // are assembled and written with plain stores, so the result does not
    // depend on the previous contents of the output buffer.
    if (in_size == out_size) {
        for (uint i = 0u; i < in_size; i++) {
            uint b = read_byte_in(in_offset, i);
            emit_output_byte(out_offset, out_size, b, out_pos, current_word);
        }
        return;
    }

    uint in_pos = 0u;

    while (in_pos < in_size && out_pos < out_size) {
        uint token = read_byte_in(in_offset, in_pos);
        in_pos++;

        if (token == 0x00u) {
            // Literal: one payload byte follows the token.
            if (in_pos >= in_size) break;  // truncated: payload byte missing

            uint b = read_byte_in(in_offset, in_pos);
            in_pos++;
            emit_output_byte(out_offset, out_size, b, out_pos, current_word);
        }
        else if (token == 0x01u) {
            // Match: 16-bit length then 16-bit distance, both low byte first.
            // in_pos <= in_size holds here, so the subtraction cannot wrap.
            if (in_size - in_pos < 4u) break;  // truncated: header incomplete

            uint match_len = read_byte_in(in_offset, in_pos) | (read_byte_in(in_offset, in_pos + 1u) << 8);
            in_pos += 2u;
            uint match_dist = read_byte_in(in_offset, in_pos) | (read_byte_in(in_offset, in_pos + 1u) << 8);
            in_pos += 2u;

            // A distance of 0 refers to a byte that has not been produced yet,
            // and a distance beyond out_pos would wrap around below zero.
            if (match_dist == 0u || match_dist > out_pos) break;

            uint copy_src = out_pos - match_dist;

            // Copy byte by byte: source and destination may overlap when
            // match_dist < match_len, so later bytes can depend on earlier
            // bytes of this same copy.
            for (uint i = 0u; i < match_len && out_pos < out_size; i++) {
                uint src_byte_idx = copy_src + i;
                uint src_word;

                if (src_byte_idx >= (out_pos & ~3u)) {
                    // Source lies in the word still being assembled, which
                    // has not been stored to the output buffer yet.
                    src_word = current_word;
                } else {
                    // Source word was already stored; read it back.
                    src_word = output_buf.data[(out_offset + src_byte_idx) >> 2];
                }

                uint b = (src_word >> ((src_byte_idx & 3u) * 8u)) & 0xFFu;
                emit_output_byte(out_offset, out_size, b, out_pos, current_word);
            }
        }
        else {
            // Unknown token: the stream is corrupt or out of sync.
            break;
        }
    }

    // If decoding stopped before the block was complete, a partially filled
    // word may still be pending; store it so the bytes decoded so far are kept.
    if (out_pos < out_size && (out_pos & 3u) != 0u) {
        output_buf.data[(out_offset + out_pos) >> 2] = current_word;
    }
}