Skip to content

Commit

Permalink
several alternative implementations
Browse files Browse the repository at this point in the history
  • Loading branch information
kif committed Aug 23, 2023
1 parent 52658d9 commit d6b7983
Show file tree
Hide file tree
Showing 3 changed files with 1,584 additions and 203 deletions.
227 changes: 24 additions & 203 deletions src/silx/resources/opencl/codec/lz4_compression.cl
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@

// This is used in tests to simplify the signature of those test kernels.
#ifndef WORKGROUP_SIZE
#define WORKGROUP_SIZE 64
#define WORKGROUP_SIZE 1024
#endif
//segment size should be buffer_size/4
#ifndef SEGMENT_SIZE
#define SEGMENT_SIZE 256
#define SEGMENT_SIZE 512
#endif

#ifndef BUFFER_SIZE
#define BUFFER_SIZE 1024
#define BUFFER_SIZE 16384
#endif
#ifndef MIN_MATCH
#define MIN_MATCH 4
Expand Down Expand Up @@ -138,11 +138,11 @@ inline int compact_segments(local volatile short4 *segments,

/* This function scans the input data searching for litterals and matches. return the end-of-scan position.
*/
inline int scan4match( local uchar *buffer, // buffer with input data in it, as large as possible, limited by shared memory space.
int start,
int stop,
local short *match_buffer, // size of the wg is enough
volatile local int* cnt // size 1 is enough, idx0: largest index value found
inline int scan4match(local uchar *buffer, // buffer with input data in it, as large as possible, limited by shared memory space.
int start,
int stop,
local short *match_buffer, // size of the wg is enough
volatile local int* cnt // size 1 is enough, idx0: largest index value found
){

int wg = get_local_size(0);// workgroup size
Expand All @@ -151,6 +151,7 @@ inline int scan4match( local uchar *buffer, // buffer with input data in
cnt[0] = 0;

// memset match_buffer
// if (tid==0)printf("workgroup size is %d\n",WORKGROUP_SIZE);
match_buffer[tid] = -1;
barrier(CLK_LOCAL_MEM_FENCE);
int i; // position index
Expand Down Expand Up @@ -353,66 +354,6 @@ inline int write_segment(global uchar *input_buffer, // buffer with input uncomp
return start_cmp;
}

/*
* Perform the actual compression by copying
*
* return the end-position in the output stream
*/
inline int write_lz4(local uchar *buffer,
local volatile short4 *segments, // size of the workgroup
int nb_segments,
int start_cmp,
global uchar *output_buffer,
int stop, //output buffer max size
int continuation // set to 0 to indicate this is the last segment
)
{
for (int i=0; i<nb_segments; i++){
short4 segment = segments[i];
if ((segment.s1==0) && (segment.s2==0)){// this was the last segment
break;
}
//write token
int token_idx = start_cmp++;
int rem;
int litter = segment.s1;
int match = segment.s2;
if (litter >= 15){
segment.s1 = 15;
rem = litter - 15;
while (rem>=255){
output_buffer[start_cmp++] = 255;
rem -= 255;
}
output_buffer[start_cmp++] = rem;
}
if (match >= 19){
segment.s2 = 19;
}
output_buffer[token_idx] = build_token((int4)(segment.s0, segment.s1, segment.s2, segment.s3));

//copy litteral. This is collaborative.
start_cmp = copy_local(output_buffer, start_cmp,
buffer, segment.s0, litter);

if ((continuation)||(i+1<nb_segments)){ // last block has no offset, nor match
//write offset, here always 1 in 16 bits little endian !
output_buffer[start_cmp++] = 1;
output_buffer[start_cmp++] = 0;

//write match overflow
if (segment.s2>=19){
rem = segment.s2-19;
while (rem>=255){
output_buffer[start_cmp++] = 255;
rem -= 255;
}
output_buffer[start_cmp++] = rem;
}
}
}//loop over segments
return start_cmp;
}

// calculate the length of a segment in compressed form
inline int len_segment(int4 segment){
Expand Down Expand Up @@ -582,48 +523,7 @@ inline int2 concatenate_segments(
return (int2) (shared_idx[0], shared_idx[1]);
} // end concatenate_segments

/* Main kernel for lz4 compression
*/
kernel void lz4_cmp( global uchar *input_buffer,
int input_size,
global uchar *output_buffer,
int output_size,
global uchar *output_ptr, // Length of all output from different wg
global int *running_grp, // counter with the number of wg still running
local uchar *buffer,
int buffer_size,
local short *match_buffer, // size of the buffer
local volatile short4 *segments // contains: start of segment (uncompressed), number of litterals, number of match (offset is enforced to 1) and start of segment (compressed)
){
int tid = get_local_id(0); // thread id
int gid = get_group_id(0); // group id
int wg = get_local_size(0);// workgroup size

//copy input data to buffer
int actual_buffer_size = min(buffer_size, input_size - ((gid+1) * buffer_size));
int start_block = gid * buffer_size;
for (int i=tid; i<actual_buffer_size; i+=wg){
buffer[i] = input_buffer[start_block+i];
}
local int cnt[2]; // small counters

/// divide the work in parts, one wg has enough threads
int start = 0;
// while (start<actual_buffer_size){
//scan for matching
// int next_start = scan4match(buffer, start, actual_buffer_size, match_buffer);
// extract from matching the sequence


// start = next_start;
// }





}

// test kernel to ensure `sort_odd_even` works
kernel void test_sort(global short4 *buffer,
int start,
Expand Down Expand Up @@ -662,7 +562,7 @@ kernel void test_scan4match(
start, stop,
lmatch,
cnt);
if ((tid==0) && (gid==0))printf("scanned up to %d\n", res);
// if ((tid==0) && (gid==0))printf("scanned up to %d\n", res);
//copy back
if (tid<stop-start){
match[tid] = lmatch[tid];
Expand All @@ -676,7 +576,7 @@ kernel void test_segmentation(global uchar *buffer,
int start, //index where scan should start
int stop,
global int *nbsegment,
global short4 *segments // size of the workgroup
global int4 *segments // size of the workgroup
){
local volatile int cnt[2];
local volatile int seg[1];
Expand All @@ -696,11 +596,11 @@ kernel void test_segmentation(global uchar *buffer,
start, stop,
lmatch,
cnt);
if ((tid==0) && (gid==0))printf("scanned up to %d\n", res);
// if ((tid==0) && (gid==0))printf("scanned up to %d\n", res);
int res2 = segmentation(start, stop, res, lmatch, lsegments, seg);
nbsegment[0] = res2;
if (tid<res2){
segments[tid] = lsegments[tid];
segments[tid] = convert_int4(lsegments[tid]);
}
}

Expand Down Expand Up @@ -779,90 +679,6 @@ kernel void test_multi(global uchar *buffer,
}
}

// kernel to test multiple scan4match+segmentation+write WG<64 buffer<1024.
kernel void test_write(global uchar *buffer,
int start, //index where scan should start
int stop,
global int *nbsegment,
global short4 *segments, // size of the workgroup
global uchar *output, // output buffer
global int *output_size // output buffer size, max in input, actual value in output
){
local volatile int seg[2]; // #0:number of segments in local mem, #1 in global mem
local volatile int cnt[1]; // end position of the scan
local volatile short4 lsegments[SEGMENT_SIZE];
local uchar lbuffer[BUFFER_SIZE];
local short lmatch[WORKGROUP_SIZE];


int tid = get_local_id(0); // thread id
int gid = get_group_id(0); // group id
int wg = get_local_size(0);// workgroup size
int actual_buffer_size = min(BUFFER_SIZE, stop);
int watchdog = (stop-start+wg-1)/wg; //prevent code from running way !
int res, res2, out_ptr=0, max_out=output_size[0];

//copy input to local buffer
for (int i=tid; i<stop; i+=wg){
lbuffer[i] = buffer[i];
}
if (tid<2){
seg[tid] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);

while ((watchdog--) && (start+1<actual_buffer_size)){
//scan for matching
res = scan4match(lbuffer, start, stop, lmatch, cnt);
res2 = segmentation(start, stop, res, lmatch, lsegments, seg);
if ((tid==0) && (gid==0)){
for (int i=0; i<res2; i++){
short4 seg = lsegments[i];
}
}
// copy segments to global memory:
if (tid+1<res2){
segments[seg[1] + tid] = lsegments[tid];
}
// copy data to compressed buffer
if (res2>1)
out_ptr = write_lz4(lbuffer, lsegments,
res2-1, // -1? to keep the last for concatenation
out_ptr, output,max_out, 1);

barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0){
seg[1] += res2-1;
lsegments[0] = lsegments[res2-1];
seg[0] = 1;
// short4 seg = lsegments[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
//memset local segments above first one,
if (tid>1) lsegments[tid] = (short4)(0,0,0,0);
barrier(CLK_LOCAL_MEM_FENCE);
start = res;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0){
short4 segment = lsegments[0];
segment.s1 += segment.s2;
segment.s2 = 0;
lsegments[0] = segment;

segments[seg[1]++] = segment;
nbsegment[0] = seg[1];
printf("last segment %d %d %d %d\n", segment.s0, segment.s1, segment.s2, segment.s3);
}
// write last segment

out_ptr = write_lz4(lbuffer, lsegments,
1, out_ptr, output, max_out, 0);

output_size[0] = out_ptr;


}

// kernel to test the function `concatenate_segments`, run on only one workgroup
kernel void test_concatenate_segments(
Expand All @@ -889,6 +705,8 @@ kernel void test_concatenate_segments(
// segment description: s0: position in input buffer s1: number of litterals, s2: number of match, s3: position in output buffer
kernel void LZ4_cmp_stage1(global uchar *buffer,
int input_size,
local uchar *lbuffer, // local buffer of size block_size for caching buffer.
int block_size, // size of the block
global int2 *segment_ptr, // size = number of workgroup launched, contains start and stop position
global int4 *segments, // size of the block-size (i.e. 1-8k !wg) / 4 * number of workgroup
int final_compaction, // set to 0 to prevent the final compaction. allows the analysis of intermediate results
Expand All @@ -898,7 +716,7 @@ kernel void LZ4_cmp_stage1(global uchar *buffer,
local volatile int seg[2]; // #0:number of segments in local mem, #1 in global mem
local volatile int cnt[2]; // end position of the scan
local volatile short4 lsegments[SEGMENT_SIZE];
local uchar lbuffer[BUFFER_SIZE];
// local uchar lbuffer[BUFFER_SIZE];
local short lmatch[WORKGROUP_SIZE];
local volatile int4 last_segment[1];

Expand All @@ -908,22 +726,25 @@ kernel void LZ4_cmp_stage1(global uchar *buffer,
int wg = get_local_size(0);// workgroup size
int ng = get_num_groups(0);// number of groups

// if (BUFFER_SIZE<block_size){
// if (get_global_id(0)==0)
// printf("block_size (%d) > BUFFER_SIZE (%d): Aborting!!!\n",block_size, BUFFER_SIZE);
// }

int output_block_size = 0;
int output_idx = output_block_size*gid;
int2 seg_ptr = segment_ptr[gid];
int segment_idx = seg_ptr.s0;
int segment_max = seg_ptr.s1;
// if (tid==0)printf("gid %d writes segments in range %d-%d\n", gid, segment_idx, segment_max);
int local_start = 0;
int global_start = BUFFER_SIZE*gid;
int local_stop = min(BUFFER_SIZE, input_size - global_start);
int global_start = block_size * gid;
int local_stop = min(block_size, input_size - global_start);
if (local_stop<=0){
if (tid==0)printf("gid %d local_stop: %d \n",gid, local_stop);
return;
}

// int actual_buffer_size = min(BUFFER_SIZE, local_stop) ;


int watchdog = (local_stop + wg-1)/wg; //prevent code from running way !
int res, res2, out_ptr=0, max_out=output_size[0];

Expand Down
Loading

0 comments on commit d6b7983

Please sign in to comment.