#version 450

/*
 * VKZip GPU Decompression Shader
 *
 * Decompresses blocks compressed by the compression shader.
 * Each workgroup decompresses one block.
 *
 * Token format (matching compress.comp):
 *   - Literal: [0x00] [byte]                → emit 1 byte
 *   - Match:   [0x01] [len:u16] [dist:u16]  → copy `len` bytes from `pos - dist`
 *     (both u16 fields are stored low byte first)
 *
 * A block whose compressed size equals its original size is treated as
 * stored raw and is copied through without token decoding.
 */

// Workgroup shape: 256 invocations along X, one along Y and Z.
// main() maps one workgroup to one block via gl_WorkGroupID.x.
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

// ── Push constants ─────────────────────────────────────────────────
// Per-dispatch parameters supplied by the host.
layout(push_constant) uniform PushConstants {
    uint block_count;   // workgroups whose index is >= this value exit immediately
    uint block_size;    // not read anywhere in this shader
    uint _pad1;         // not read; presumably padding (confirm against host struct)
    uint _pad2;         // not read; presumably padding (confirm against host struct)
} params;

// ── Buffers ────────────────────────────────────────────────────────
// Input: compressed data (all blocks concatenated).
// Bytes are packed four to a uint, lowest-order byte first; read_byte_in()
// is the accessor that unpacks them.
layout(std430, set = 0, binding = 0) readonly buffer InputBuffer {
    uint data[];
} input_buf;

// Output: decompressed data, packed four bytes to a uint like the input.
// Left read-write because match tokens copy from bytes that were already
// written to this buffer.
layout(std430, set = 0, binding = 1) buffer OutputBuffer {
    uint data[];
} output_buf;

// Block metadata, one uvec4 per block, indexed by block number:
//   .x = in_offset   byte offset of the block's data in InputBuffer
//   .y = in_size     size in bytes of that data (compressed)
//   .z = out_offset  byte offset of the block's data in OutputBuffer
//   .w = out_size    size in bytes of the block once decompressed (original)
// The shader only ever reads this buffer, so it is declared readonly,
// matching InputBuffer.
layout(std430, set = 0, binding = 2) readonly buffer MetadataBuffer {
    uvec4 blocks[];
} meta;

// ── Helper: read a byte from packed uint buffer ────────────────────
// Returns the byte located `byte_idx` bytes past `base_offset` in the input
// buffer. The buffer is an array of uints, so the byte address is split into
// a word index (address / 4) and a lane within that word (address % 4),
// with lane 0 occupying the lowest-order bits.
uint read_byte_in(uint base_offset, uint byte_idx) {
    uint addr  = base_offset + byte_idx;
    uint shift = (addr & 3u) << 3;   // lane * 8 bits
    return (input_buf.data[addr >> 2] >> shift) & 0xFFu;
}

// Reads a 16-bit value stored low byte first at `byte_idx` bytes past
// `base_offset` in the input buffer.
uint read_u16_in(uint base_offset, uint byte_idx) {
    return read_byte_in(base_offset, byte_idx)
         | (read_byte_in(base_offset, byte_idx + 1u) << 8);
}

// Fetches an already-decoded byte of the current block for a back-reference.
// Bytes belonging to the word that is still being assembled have not been
// stored yet, so they are taken from `pending_word`; older bytes are read
// back from the output buffer.
uint read_back_byte(uint out_offset, uint src_idx, uint out_pos, uint pending_word) {
    uint shift = (src_idx & 3u) * 8u;
    uint word  = (src_idx >= (out_pos & ~3u))
               ? pending_word
               : output_buf.data[(out_offset + src_idx) >> 2];
    return (word >> shift) & 0xFFu;
}

// Appends one decoded byte to the block's output and advances `out_pos`.
// Bytes are gathered in `pending_word` and stored as a whole uint once the
// word is complete or the block's final byte has been produced.
// NOTE(review): the lane is derived from out_pos alone while the word index
// uses out_offset + out_pos, so this is only correct when out_offset is a
// multiple of 4 — confirm the host guarantees that.
void emit_byte(uint out_offset, uint out_size, uint value,
               inout uint out_pos, inout uint pending_word) {
    uint lane = out_pos & 3u;
    pending_word |= value << (lane * 8u);

    if (lane == 3u || out_pos == out_size - 1u) {
        output_buf.data[(out_offset + out_pos) >> 2] = pending_word;
        pending_word = 0u;
    }
    out_pos++;
}

// Entry point: workgroup N decodes block N. Decoding is sequential, so only
// invocation 0 of each workgroup does any work; the others exit at once.
void main() {
    uint block_idx = gl_WorkGroupID.x;
    if (block_idx >= params.block_count) return;
    if (gl_LocalInvocationID.x != 0u) return;

    // Fetch the block descriptor once instead of once per field.
    uvec4 desc       = meta.blocks[block_idx];
    uint  in_offset  = desc.x;
    uint  in_size    = desc.y;
    uint  out_offset = desc.z;
    uint  out_size   = desc.w;

    // Equal sizes mark a block that was stored raw: copy it byte for byte.
    // NOTE(review): a compressed stream whose length happened to equal
    // out_size would also take this path; presumably the compressor never
    // emits one — confirm against compress.comp.
    if (in_size == out_size) {
        for (uint i = 0u; i < in_size; i++) {
            uint dst = out_offset + i;
            uint b   = read_byte_in(in_offset, i);
            // Word index and lane both come from the same absolute byte
            // address, so they agree for any out_offset. OR-ing can only set
            // bits: the destination bytes must be zero before the dispatch.
            atomicOr(output_buf.data[dst >> 2], b << ((dst & 3u) * 8u));
        }
        return;
    }

    uint in_pos       = 0u;
    uint out_pos      = 0u;
    uint pending_word = 0u;

    while (in_pos < in_size && out_pos < out_size) {
        uint token = read_byte_in(in_offset, in_pos);
        in_pos++;
        uint remaining = in_size - in_pos;   // payload bytes still available

        if (token == 0x00u) {
            // Literal: one payload byte. Stop rather than read past the
            // block's input if the stream is truncated.
            if (remaining < 1u) break;
            uint b = read_byte_in(in_offset, in_pos);
            in_pos++;
            emit_byte(out_offset, out_size, b, out_pos, pending_word);
        } else if (token == 0x01u) {
            // Match: u16 length followed by u16 distance.
            if (remaining < 4u) break;
            uint match_len  = read_u16_in(in_offset, in_pos);
            uint match_dist = read_u16_in(in_offset, in_pos + 2u);
            in_pos += 4u;

            // The distance must point at a byte this block has already
            // produced; otherwise out_pos - match_dist would wrap around.
            if (match_dist == 0u || match_dist > out_pos) break;

            // Copy byte by byte so a match may overlap its own output.
            for (uint i = 0u; i < match_len && out_pos < out_size; i++) {
                uint b = read_back_byte(out_offset, out_pos - match_dist,
                                        out_pos, pending_word);
                emit_byte(out_offset, out_size, b, out_pos, pending_word);
            }
        }
        // Any other token value is skipped; decoding resumes at the next byte.
    }

    // If decoding stopped before the block was complete, store the bytes that
    // are still waiting in the partially assembled word so they are not lost.
    if (out_pos < out_size && (out_pos & 3u) != 0u) {
        output_buf.data[(out_offset + out_pos) >> 2] = pending_word;
    }
}