From d6b79831a107be56c3e0d0d8b189b84d2d310800 Mon Sep 17 00:00:00 2001 From: Jerome Kieffer Date: Wed, 23 Aug 2023 17:26:23 +0200 Subject: [PATCH] several alternative implementations --- .../resources/opencl/codec/lz4_compression.cl | 227 +---- .../opencl/codec/lz4_compression_nocache.cl | 776 +++++++++++++++++ .../codec/lz4_compression_nocache_int32.cl | 784 ++++++++++++++++++ 3 files changed, 1584 insertions(+), 203 deletions(-) create mode 100644 src/silx/resources/opencl/codec/lz4_compression_nocache.cl create mode 100644 src/silx/resources/opencl/codec/lz4_compression_nocache_int32.cl diff --git a/src/silx/resources/opencl/codec/lz4_compression.cl b/src/silx/resources/opencl/codec/lz4_compression.cl index 4af8bf619e..1f12984461 100644 --- a/src/silx/resources/opencl/codec/lz4_compression.cl +++ b/src/silx/resources/opencl/codec/lz4_compression.cl @@ -30,15 +30,15 @@ // This is used in tests to simplify the signature of those test kernels. #ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#define WORKGROUP_SIZE 1024 #endif //segment size should be buffer_size/4 #ifndef SEGMENT_SIZE -#define SEGMENT_SIZE 256 +#define SEGMENT_SIZE 512 #endif #ifndef BUFFER_SIZE -#define BUFFER_SIZE 1024 +#define BUFFER_SIZE 16384 #endif #ifndef MIN_MATCH #define MIN_MATCH 4 @@ -138,11 +138,11 @@ inline int compact_segments(local volatile short4 *segments, /* This function scans the input data searching for litterals and matches. return the end-of-scan position. */ -inline int scan4match( local uchar *buffer, // buffer with input data in it, as large as possible, limited by shared memory space. - int start, - int stop, - local short *match_buffer, // size of the wg is enough - volatile local int* cnt // size 1 is enough, idx0: largest index value found +inline int scan4match(local uchar *buffer, // buffer with input data in it, as large as possible, limited by shared memory space. + int start, + int stop, + local short *match_buffer, // size of the wg is enough + volatile local int* cnt // size 1 is enough, idx0: largest index value found ){ int wg = get_local_size(0);// workgroup size @@ -151,6 +151,7 @@ inline int scan4match( local uchar *buffer, // buffer with input data in cnt[0] = 0; // memset match_buffer +// if (tid==0)printf("workgroup size is %d\n",WORKGROUP_SIZE); match_buffer[tid] = -1; barrier(CLK_LOCAL_MEM_FENCE); int i; // position index @@ -353,66 +354,6 @@ inline int write_segment(global uchar *input_buffer, // buffer with input uncomp return start_cmp; } -/* - * Perform the actual compression by copying - * - * return the end-position in the output stream - */ -inline int write_lz4(local uchar *buffer, - local volatile short4 *segments, // size of the workgroup - int nb_segments, - int start_cmp, - global uchar *output_buffer, - int stop, //output buffer max size - int continuation // set to 0 to indicate this is the last segment - ) -{ - for (int i=0; i= 15){ - segment.s1 = 15; - rem = litter - 15; - while (rem>=255){ - output_buffer[start_cmp++] = 255; - rem -= 255; - } - output_buffer[start_cmp++] = rem; - } - if (match >= 19){ - segment.s2 = 19; - } - output_buffer[token_idx] = build_token((int4)(segment.s0, segment.s1, segment.s2, segment.s3)); - - //copy litteral. This is collaborative. 
- start_cmp = copy_local(output_buffer, start_cmp, - buffer, segment.s0, litter); - - if ((continuation)||(i+1=19){ - rem = segment.s2-19; - while (rem>=255){ - output_buffer[start_cmp++] = 255; - rem -= 255; - } - output_buffer[start_cmp++] = rem; - } - } - }//loop over segments - return start_cmp; -} // calculate the length of a segment in compressed form inline int len_segment(int4 segment){ @@ -582,48 +523,7 @@ inline int2 concatenate_segments( return (int2) (shared_idx[0], shared_idx[1]); } // end concatenate_segments -/* Main kernel for lz4 compression - */ -kernel void lz4_cmp( global uchar *input_buffer, - int input_size, - global uchar *output_buffer, - int output_size, - global uchar *output_ptr, // Length of all output from different wg - global int *running_grp, // counter with the number of wg still running - local uchar *buffer, - int buffer_size, - local short *match_buffer, // size of the buffer - local volatile short4 *segments // contains: start of segment (uncompressed), number of litterals, number of match (offset is enforced to 1) and start of segment (compressed) - ){ - int tid = get_local_id(0); // thread id - int gid = get_group_id(0); // group id - int wg = get_local_size(0);// workgroup size - //copy input data to buffer - int actual_buffer_size = min(buffer_size, input_size - ((gid+1) * buffer_size)); - int start_block = gid * buffer_size; - for (int i=tid; i1) - out_ptr = write_lz4(lbuffer, lsegments, - res2-1, // -1? to keep the last for concatenation - out_ptr, output,max_out, 1); - - barrier(CLK_LOCAL_MEM_FENCE); - if (tid == 0){ - seg[1] += res2-1; - lsegments[0] = lsegments[res2-1]; - seg[0] = 1; -// short4 seg = lsegments[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - //memset local segments above first one, - if (tid>1) lsegments[tid] = (short4)(0,0,0,0); - barrier(CLK_LOCAL_MEM_FENCE); - start = res; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid == 0){ - short4 segment = lsegments[0]; - segment.s1 += segment.s2; - segment.s2 = 0; - lsegments[0] = segment; - - segments[seg[1]++] = segment; - nbsegment[0] = seg[1]; - printf("last segment %d %d %d %d\n", segment.s0, segment.s1, segment.s2, segment.s3); - } - // write last segment - - out_ptr = write_lz4(lbuffer, lsegments, - 1, out_ptr, output, max_out, 0); - - output_size[0] = out_ptr; - - -} // kernel to test the function `concatenate_segments`, run on only one workgroup kernel void test_concatenate_segments( @@ -889,6 +705,8 @@ kernel void test_concatenate_segments( // segment description: s0: position in input buffer s1: number of litterals, s2: number of match, s3: position in output buffer kernel void LZ4_cmp_stage1(global uchar *buffer, int input_size, + local uchar *lbuffer, // local buffer of size block_size for caching buffer. + int block_size, // size of the block global int2 *segment_ptr, // size = number of workgroup launched, contains start and stop position global int4 *segments, // size of the block-size (i.e. 1-8k !wg) / 4 * number of workgroup int final_compaction, // set to 0 to prevent the final compaction. 
allows the analysis of intermediate results @@ -898,7 +716,7 @@ kernel void LZ4_cmp_stage1(global uchar *buffer, local volatile int seg[2]; // #0:number of segments in local mem, #1 in global mem local volatile int cnt[2]; // end position of the scan local volatile short4 lsegments[SEGMENT_SIZE]; - local uchar lbuffer[BUFFER_SIZE]; +// local uchar lbuffer[BUFFER_SIZE]; local short lmatch[WORKGROUP_SIZE]; local volatile int4 last_segment[1]; @@ -908,6 +726,11 @@ kernel void LZ4_cmp_stage1(global uchar *buffer, int wg = get_local_size(0);// workgroup size int ng = get_num_groups(0);// number of groups +// if (BUFFER_SIZE BUFFER_SIZE (%d): Aborting!!!\n",block_size, BUFFER_SIZE); +// } + int output_block_size = 0; int output_idx = output_block_size*gid; int2 seg_ptr = segment_ptr[gid]; @@ -915,15 +738,13 @@ kernel void LZ4_cmp_stage1(global uchar *buffer, int segment_max = seg_ptr.s1; // if (tid==0)printf("gid %d writes segments in range %d-%d\n", gid, segment_idx, segment_max); int local_start = 0; - int global_start = BUFFER_SIZE*gid; - int local_stop = min(BUFFER_SIZE, input_size - global_start); + int global_start = block_size * gid; + int local_stop = min(block_size, input_size - global_start); if (local_stop<=0){ if (tid==0)printf("gid %d local_stop: %d \n",gid, local_stop); return; } - -// int actual_buffer_size = min(BUFFER_SIZE, local_stop) ; - + int watchdog = (local_stop + wg-1)/wg; //prevent code from running way ! int res, res2, out_ptr=0, max_out=output_size[0]; diff --git a/src/silx/resources/opencl/codec/lz4_compression_nocache.cl b/src/silx/resources/opencl/codec/lz4_compression_nocache.cl new file mode 100644 index 0000000000..7b95da3df2 --- /dev/null +++ b/src/silx/resources/opencl/codec/lz4_compression_nocache.cl @@ -0,0 +1,776 @@ +/* + * Project: SILX: Bitshuffle LZ4 compressor + * + * Copyright (C) 2023 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// This is used in tests to simplify the signature of those test kernels. 
+#ifndef WORKGROUP_SIZE +#define WORKGROUP_SIZE 128 +#endif + +//segment size should be buffer_size/4 +#ifndef SEGMENT_SIZE +#define SEGMENT_SIZE 256 +#endif + +#ifndef BUFFER_SIZE +#define BUFFER_SIZE 16384 +#endif +#ifndef MIN_MATCH +#define MIN_MATCH 4 +#endif + + + +// short compare and swap function used in sort_odd_even +inline short8 _order_short4(short4 a, short4 b){ + return (a.s0=w) && (tid0) && (pid=MIN_MATCH)){ + segments[atomic_inc(cnt)] = (short4)(pid+1, 0, here, 1); + } else +// if ((here==0) && (there>0) && (tid>5) && match_buffer[tid-5]>4){ + if ((here==0) && (there>0) && (tid>=MIN_MATCH) && match_buffer[tid-MIN_MATCH]>=MIN_MATCH){ + segments[atomic_inc(cnt)] = (short4)(pid+1, 0, 0, 0); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (cnt[0] == 1){ + // nothing occured in considered + if (tid == 0){ + // nothing occured, just complete former segment + short4 seg=segments[0]; + if (seg.s2 == 0){ //there was no match, just complete the former segment + seg.s1 += end-start; + segments[0] = seg; + } + else{ // noting occured, but former segment has already some match ! + if (tid==0){ + segments[atomic_inc(cnt)] = (short4)(start, end-start, 0, 0); + } + } + } + } + else{ + // sort segments + sort_odd_even(0, cnt[0], segments); + //add end position as a litteral + if (tid==0){ + segments[cnt[0]] = (short4)(end, 0, 0, 0); + atomic_inc(cnt); + } + barrier(CLK_LOCAL_MEM_FENCE); + cnt[0] = compact_segments(segments, cnt); + } + barrier(CLK_LOCAL_MEM_FENCE); + return cnt[0]; +} + + +// Build token, concatenation of a litteral and a match +inline uchar build_token(int4 segment){ + int lit = segment.s1; + int mat = segment.s2; + int token = ((lit & 15)<<4)|((mat-4)&15); + return token; +} + + +// copy collaborative, return the position in output stream. +inline int copy_global(global uchar* dest, + const int dest_position, + global uchar* source, + const int src_position, + const int length){ + for (int i=get_local_id(0); i=input_size) || (start_cmp>=output_size)){// this segment read/write outsize boundaries + return -1; + } + + if (last_segment){ + litter += match; + match = 0; + segment.s1 = litter; + segment.s2 = match; +// if(tid==0)printf("last segment %d %d %d %d\n", segment.s0, segment.s1, segment.s2, segment.s3); + } + + //write token + int token_idx = start_cmp++; + if (litter >= 15){ + segment.s1 = 15; + rem = litter - 15; + while (rem>=255){ + output_buffer[start_cmp++] = 255; + rem -= 255; + } + output_buffer[start_cmp++] = rem; + } + if (match >= 19){ + segment.s2 = 19; + } + output_buffer[token_idx] = build_token(segment); + + //copy litteral. This is collaborative. + start_cmp = copy_global(output_buffer, start_cmp, + input_buffer, start_dec, litter); + + if (!last_segment){ // last block has no offset, nor match + //write offset, here always 1 in 16 bits little endian ! 
+ output_buffer[start_cmp++] = 1; + output_buffer[start_cmp++] = 0; + + //write match overflow + if (match>=19){ + rem = match-19; + while (rem>=255){ + output_buffer[start_cmp++] = 255; + rem -= 255; + } + output_buffer[start_cmp++] = rem; + } + } + return start_cmp; +} + + +// calculate the length of a segment in compressed form +inline int len_segment(int4 segment){ + int lit = segment.s1; + int mat = segment.s2-4; + int size = 3+lit; + if (lit>=15){ + size++; + lit -= 15; + } + while (lit>255){ + size++; + lit-=255; + } + if (mat>=15){ + size++; + mat -= 15; + } + while (mat>255){ + size++; + mat-=255; + } + return size; +} + + +/* store several local segments into the global memory starting at position. + * return the position in the output stream + */ +inline int store_segments(local volatile short4 *local_segments, + int nb_segments, + global int4 *global_segments, + int max_idx, // last position achievable in global segment array + int global_idx, + int input_stream_idx, + int output_stream_idx, + int block_size, // size of the block under analysis + int last, // set to true to concatenate the match and the litteral for last block + local volatile int* cnt //size=1 is eough + ){ + cnt[0] = output_stream_idx; + barrier(CLK_LOCAL_MEM_FENCE); + if (global_idx!=max_idx){ + //this is serial for conviniance ! + if (get_local_id(0)==0){ + for (int i=0; i %d\n", get_group_id(0), global_idx, max_idx, segment.s0, block_size); + segment.s1 = block_size - segment.s0; + segment.s2 = 0; + } + // manage last segment in block + if (last){ + segment.s1+=segment.s2; + segment.s2 = 0; + } + segment.s0 += input_stream_idx; + segment.s3 = output_stream_idx; + + output_stream_idx += len_segment(segment); + global_segments[global_idx++]=segment; + if (emergency) break; + } + cnt[0] = output_stream_idx; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + return cnt[0]; +} + + +/* concatenate all segments (stored in global memory) in such a way that they are adjacent. + * This function is to be called by the latest workgroup running. + * + * Returns the number of segments and the number of bytes to be written. + * + * There are tons of synchro since data are read and written from same buffer. + */ +inline int2 concatenate_segments( + global int2 *segment_ptr, // size = number of workgroup launched, contains start and stop position + global int4 *segments, // size of the block-size (i.e. 1-8k !wg) / 4 * number of workgroup + global int *output_size, // output buffer size, max in input, actual value in output + local volatile int *shared_idx, // shared indexes with segment offset(0), output_idx(1) + local volatile int4 *last_segment // shared memory with the last segment to share between threads + ){ + + int tid = get_local_id(0); // thread id + int gid = get_group_id(0); // group id + int wg = get_local_size(0);// workgroup size + int ng = get_num_groups(0);// number of groups + +// if (tid==0) printf("gid %d, running concat_segments \n", gid); + int4 segment; + barrier(CLK_GLOBAL_MEM_FENCE); + if (tid==0){ + shared_idx[0] = segment_ptr[0].s1; + shared_idx[1] = output_size[0]; + segment = segments[max(0, shared_idx[0]-1)]; + if ((segment.s0>0) && (segment.s2==0) && (ng>1)){ + last_segment[0] = segment; + shared_idx[0] -= 1; + } + else{ + last_segment[0] = (int4)(0,0,0,0); + } + } + barrier(CLK_LOCAL_MEM_FENCE); +// if (tid==0) printf("groups range from 1 to %d. 
segment_idx=%d, output_ptr=%d\n",ng, shared_idx[0], shared_idx[1]); + for (int grp=1; grp0) && (last_segment[0].s2==0)){ + segment = segments[seg_ptr.s0]; + segment.s0 = last_segment[0].s0; + segment.s1 = segment.s0+segment.s1-last_segment[0].s0; + shared_idx[1] += len_segment(segment)-len_segment(last_segment[0]); + last_segment[0] = (int4)(0,0,0,0); + segments[seg_ptr.s0] = segment; + } + barrier(CLK_LOCAL_MEM_FENCE); + for (int i=low; i0) && (segment.s2==0) && (grp+11) lsegments[tid] = (short4)(0,0,0,0); + barrier(CLK_LOCAL_MEM_FENCE); + local_start = res; + } + barrier(CLK_LOCAL_MEM_FENCE); +// if (tid==0)printf("gid %d store final segments\n",gid); + output_idx = store_segments(lsegments, 1, // last segment is treated here + segments, segment_max, segment_idx, global_start, output_idx, global_stop, gid+1==ng, cnt); + output_size[gid] = output_idx; + seg_ptr.s1 = ++segment_idx; + segment_ptr[gid] = seg_ptr; + + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); + // last group running performs the cumsum and compaction of indices + if (tid==0){ + cnt[0] = (atomic_dec(wgcnt)==1); + } + barrier(CLK_LOCAL_MEM_FENCE); + if (cnt[0] && final_compaction){ + int2 end_ptr = concatenate_segments(segment_ptr, // size = number of workgroup launched, contains start and stop position + segments, // size of the block-size (i.e. 1-8k !wg) / 4 * number of workgroup + output_size, // output buffer size, max in input, actual value in output + cnt, // index of segment offset, shared + last_segment // shared memory with the last segment to share between threads + ); + } +} + + +// kernel launched with one segment per workgroup. If the segment has large litterals, having many threads per group is interesting. + +kernel void LZ4_cmp_stage2(global uchar *input_buffer, // bufffer with data to be compressed + int input_size, // size of the daa to be compressed + global int2 *segment_ptr, // size = 1 (often more) contains start and stop position of segment + global int4 *segments, // size defined by segment_ptr + global uchar *output_buffer,// destination buffer for compressed data + global int *output_size, // output buffer size, pre-filled (not updated if prefix_header), size should be at least 2 + int prefix_header // if set, put in header the input buffer size (increases the output_size[0] by 4) +){ + int gid = get_group_id(0); + int tid = get_local_id(0); + int wg = get_local_size(0); + int r_size = output_size[0]; + int2 segment_range = segment_ptr[0]; + if ((gid=segment_range.s1)) // out of range segment, should not occure ! 
+ return; + int4 segment = segments[gid]; + if (prefix_header!=0){ + segment.s3 += 4; + if ((gid == 0) && (tid==0)){//write + output_buffer[0] = input_size & 0xFF; + output_buffer[1] = (input_size>>8) & 0xFF; + output_buffer[2] = (input_size>>16) & 0xFF; + output_buffer[3] = (input_size>>24) & 0xFF; + } + } + + if (gid+1==segment_range.s1){//last segment + r_size = write_segment(input_buffer, // buffer with input uncompressed data + input_size, // size of the data to be compressed + segment, // segment to be compressed + output_buffer, // destination buffer for compressed data + r_size, // + 1); + if (tid==0) output_size[1] = r_size; + } + else{ + write_segment(input_buffer, // buffer with input uncompressed data + input_size, // size of the data to be compressed + segment, // segment to be compressed + output_buffer, // destination buffer for compressed data + r_size, // + 0); + } +} diff --git a/src/silx/resources/opencl/codec/lz4_compression_nocache_int32.cl b/src/silx/resources/opencl/codec/lz4_compression_nocache_int32.cl new file mode 100644 index 0000000000..75ba2b93f2 --- /dev/null +++ b/src/silx/resources/opencl/codec/lz4_compression_nocache_int32.cl @@ -0,0 +1,784 @@ +/* + * Project: SILX: Bitshuffle LZ4 compressor + * + * Copyright (C) 2023 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + + +// This is used in tests to simplify the signature of those test kernels. 
+#ifndef WORKGROUP_SIZE +#define WORKGROUP_SIZE 128 +#endif + +#ifndef SEGMENT_SIZE +#define SEGMENT_SIZE 128 +#endif + +#ifndef BUFFER_SIZE +#define BUFFER_SIZE 16384 +#endif + +#ifndef MIN_MATCH +#define MIN_MATCH 4 +#endif + + + +// short compare and swap function used in sort_odd_even_int4 +inline int8 _order_int4(int4 a, int4 b){ + return (a.s0=w) && (tid0) && (pid=MIN_MATCH)){ + segments[atomic_inc(cnt)] = (int4)(pid+1, 0, here, 1); + } else +// if ((here==0) && (there>0) && (tid>5) && match_buffer[tid-5]>4){ + if ((here==0) && (there>0) && (tid>=MIN_MATCH) && match_buffer[tid-MIN_MATCH]>=MIN_MATCH){ + segments[atomic_inc(cnt)] = (int4)(pid+1, 0, 0, 0); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (cnt[0] == 1){ + // nothing occured in considered + if (tid == 0){ + // nothing occured, just complete former segment + int4 seg=segments[0]; + if (seg.s2 == 0){ //there was no match, just complete the former segment + seg.s1 += end-start; + segments[0] = seg; + } + else{ // noting occured, but former segment has already some match ! + if (tid==0){ + segments[atomic_inc(cnt)] = (int4)(start, end-start, 0, 0); + } + } + } + } + else{ + // sort segments + sort_odd_even_int4(0, cnt[0], segments); + //add end position as a litteral + if (tid==0){ + segments[cnt[0]] = (int4)(end, 0, 0, 0); + atomic_inc(cnt); + } + barrier(CLK_LOCAL_MEM_FENCE); + cnt[0] = compact_segments_int4(segments, cnt); + } + barrier(CLK_LOCAL_MEM_FENCE); + return cnt[0]; +} + + +// Build token, concatenation of a litteral and a match +inline uchar build_token(int4 segment){ + int lit = segment.s1; + int mat = segment.s2; + int token = ((lit & 15)<<4)|((mat-4)&15); + return token; +} + + +// copy collaborative, return the position in output stream. +inline int copy_global(global uchar* dest, + const int dest_position, + global uchar* source, + const int src_position, + const int length){ + for (int i=get_local_id(0); i=input_size) || (start_cmp>=output_size)){// this segment read/write outsize boundaries + return -1; + } + + if (last_segment){ + litter += match; + match = 0; + segment.s1 = litter; + segment.s2 = match; +// if(tid==0)printf("last segment %d %d %d %d\n", segment.s0, segment.s1, segment.s2, segment.s3); + } + + //write token + int token_idx = start_cmp++; + if (litter >= 15){ + segment.s1 = 15; + rem = litter - 15; + while (rem>=255){ + output_buffer[start_cmp++] = 255; + rem -= 255; + } + output_buffer[start_cmp++] = rem; + } + if (match >= 19){ + segment.s2 = 19; + } + output_buffer[token_idx] = build_token(segment); + + //copy litteral. This is collaborative. + start_cmp = copy_global(output_buffer, start_cmp, + input_buffer, start_dec, litter); + + if (!last_segment){ // last block has no offset, nor match + //write offset, here always 1 in 16 bits little endian ! + output_buffer[start_cmp++] = 1; + output_buffer[start_cmp++] = 0; + + //write match overflow + if (match>=19){ + rem = match-19; + while (rem>=255){ + output_buffer[start_cmp++] = 255; + rem -= 255; + } + output_buffer[start_cmp++] = rem; + } + } + return start_cmp; +} + + +// calculate the length of a segment in compressed form +inline int len_segment(int4 segment){ + int lit = segment.s1; + int mat = segment.s2-4; + int size = 3+lit; + if (lit>=15){ + size++; + lit -= 15; + } + while (lit>255){ + size++; + lit-=255; + } + if (mat>=15){ + size++; + mat -= 15; + } + while (mat>255){ + size++; + mat-=255; + } + return size; +} + + +/* store several local segments into the global memory starting at position. 
+ * return the position in the output stream + */ +inline int store_segments_int4(local volatile int4 *local_segments, + int nb_segments, + global int4 *global_segments, + int max_idx, // last position achievable in global segment array + int global_idx, + int input_stream_idx, + int output_stream_idx, + int block_size, // size of the block under analysis + int last, // set to true to concatenate the match and the litteral for last block + local volatile int* cnt //size=1 is eough + ){ + cnt[0] = output_stream_idx; + barrier(CLK_LOCAL_MEM_FENCE); + if (global_idx!=max_idx){ + //this is serial for conviniance ! + if (get_local_id(0)==0){ + for (int i=0; i %d\n", (int)get_group_id(0), global_idx, max_idx, segment.s0, block_size); + segment.s1 = block_size - segment.s0; + segment.s2 = 0; + } + // manage last segment in block + if (last){ + segment.s1+=segment.s2; + segment.s2 = 0; + } + segment.s0 += input_stream_idx; + segment.s3 = output_stream_idx; + + output_stream_idx += len_segment(segment); + global_segments[global_idx++]=segment; + if (emergency) break; + } + cnt[0] = output_stream_idx; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + return cnt[0]; +} + + +/* concatenate all segments (stored in global memory) in such a way that they are adjacent. + * This function is to be called by the latest workgroup running. + * + * Returns the number of segments and the number of bytes to be written. + * + * There are tons of synchro since data are read and written from same buffer. + */ +inline int2 concatenate_segments( + global int2 *segment_ptr, // size = number of workgroup launched, contains start and stop position + global int4 *segments, // size of the block-size (i.e. 1-8k !wg) / 4 * number of workgroup + global int *output_size, // output buffer size, max in input, actual value in output + local volatile int *shared_idx, // shared indexes with segment offset(0), output_idx(1) + local volatile int4 *last_segment // shared memory with the last segment to share between threads + ){ + + int tid = get_local_id(0); // thread id + int gid = get_group_id(0); // group id + int wg = get_local_size(0);// workgroup size + int ng = get_num_groups(0);// number of groups + +// if (tid==0) printf("gid %d, running concat_segments \n", gid); + int4 segment; + barrier(CLK_GLOBAL_MEM_FENCE); + if (tid==0){ + shared_idx[0] = segment_ptr[0].s1; + shared_idx[1] = output_size[0]; + segment = segments[max(0, shared_idx[0]-1)]; + if ((segment.s0>0) && (segment.s2==0) && (ng>1)){ + last_segment[0] = segment; + shared_idx[0] -= 1; + } + else{ + last_segment[0] = (int4)(0,0,0,0); + } + } + barrier(CLK_LOCAL_MEM_FENCE); +// if (tid==0) printf("groups range from 1 to %d. 
segment_idx=%d, output_ptr=%d\n",ng, shared_idx[0], shared_idx[1]); + for (int grp=1; grp0) && (last_segment[0].s2==0)){ + segment = segments[seg_ptr.s0]; + segment.s0 = last_segment[0].s0; + segment.s1 = segment.s0+segment.s1-last_segment[0].s0; + shared_idx[1] += len_segment(segment)-len_segment(last_segment[0]); + last_segment[0] = (int4)(0,0,0,0); + segments[seg_ptr.s0] = segment; + } + barrier(CLK_LOCAL_MEM_FENCE); + for (int i=low; i0) && (segment.s2==0) && (grp+11) lsegments[tid] = (int4)(0,0,0,0); + barrier(CLK_LOCAL_MEM_FENCE); + local_start = res; + } + barrier(CLK_LOCAL_MEM_FENCE); +// if (tid==0)printf("gid %d store final segments\n",gid); + output_idx = store_segments_int4(lsegments, 1, // last segment is treated here + segments, segment_max, segment_idx, global_start, output_idx, global_stop, gid+1==ng, cnt); + output_size[gid] = output_idx; + seg_ptr.s1 = ++segment_idx; + segment_ptr[gid] = seg_ptr; + + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); + // last group running performs the cumsum and compaction of indices + if (tid==0){ + cnt[0] = (atomic_dec(wgcnt)==1); + } + barrier(CLK_LOCAL_MEM_FENCE); + if (cnt[0] && final_compaction){ + int2 end_ptr = concatenate_segments(segment_ptr, // size = number of workgroup launched, contains start and stop position + segments, // size of the block-size (i.e. 1-8k !wg) / 4 * number of workgroup + output_size, // output buffer size, max in input, actual value in output + cnt, // index of segment offset, shared + last_segment // shared memory with the last segment to share between threads + ); + } +} + + +// kernel launched with one segment per workgroup. If the segment has large litterals, having many threads per group is interesting. + +kernel void LZ4_cmp_stage2(global uchar *input_buffer, // bufffer with data to be compressed + int input_size, // size of the daa to be compressed + global int2 *segment_ptr, // size = 1 (often more) contains start and stop position of segment + global int4 *segments, // size defined by segment_ptr + global uchar *output_buffer,// destination buffer for compressed data + global int *output_size, // output buffer size, pre-filled (not updated if prefix_header), size should be at least 2 + int prefix_header // if set, put in header the input buffer size (increases the output_size[0] by 4) +){ + int gid = get_group_id(0); + int tid = get_local_id(0); + int wg = get_local_size(0); + int r_size = output_size[0]; + int2 segment_range = segment_ptr[0]; + if ((gid=segment_range.s1)) // out of range segment, should not occure ! + return; + int4 segment = segments[gid]; + if (prefix_header!=0){ + segment.s3 += 4; + if ((gid == 0) && (tid==0)){//write + output_buffer[0] = input_size & 0xFF; + output_buffer[1] = (input_size>>8) & 0xFF; + output_buffer[2] = (input_size>>16) & 0xFF; + output_buffer[3] = (input_size>>24) & 0xFF; + } + } + + if (gid+1==segment_range.s1){//last segment + r_size = write_segment(input_buffer, // buffer with input uncompressed data + input_size, // size of the data to be compressed + segment, // segment to be compressed + output_buffer, // destination buffer for compressed data + r_size, // + 1); + if (tid==0) output_size[1] = r_size; + } + else{ + write_segment(input_buffer, // buffer with input uncompressed data + input_size, // size of the data to be compressed + segment, // segment to be compressed + output_buffer, // destination buffer for compressed data + r_size, // + 0); + } +}
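

All three kernel sources emit the same byte layout through their write_segment() helpers: a token byte (high nibble = literal count, low nibble = match count minus 4, both saturating at 15), 255-run extension bytes, the literals themselves, a 16-bit little-endian offset that these kernels hard-wire to 1, and finally the match-length extension. As a reading aid, here is a serial host-side C sketch of that encoding. The function name encode_segment and its argument list are illustrative only and are not part of the patch; it assumes match >= MIN_MATCH (4) for non-final segments, and encodes the last sequence of a block with literals only, following the usual LZ4 convention rather than transcribing the kernel's token arithmetic for that corner case.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative helper (not in the patch): encode one (literals, match) segment
 * taken from src at src_pos into dst at dst_pos, with the offset fixed to 1 as
 * in the kernels.  Returns the next free position in dst.  For the last
 * segment the match is folded into the literals and no offset is written. */
static size_t encode_segment(const uint8_t *src, size_t src_pos,
                             size_t literals, size_t match,
                             int last_segment,
                             uint8_t *dst, size_t dst_pos)
{
    if (last_segment) {
        literals += match;                    /* last sequence: literals only   */
        match = 0;
    }
    size_t token_idx = dst_pos++;             /* token slot, filled right away  */
    uint8_t lit_nib = (uint8_t)(literals < 15 ? literals : 15);
    uint8_t mat_nib = last_segment ? 0
                    : (uint8_t)((match - 4) < 15 ? (match - 4) : 15);
    dst[token_idx] = (uint8_t)((lit_nib << 4) | mat_nib);

    if (literals >= 15) {                     /* literal-length extension bytes */
        size_t rem = literals - 15;
        while (rem >= 255) { dst[dst_pos++] = 255; rem -= 255; }
        dst[dst_pos++] = (uint8_t)rem;
    }
    memcpy(dst + dst_pos, src + src_pos, literals);
    dst_pos += literals;

    if (!last_segment) {
        dst[dst_pos++] = 1;                   /* offset = 1, little endian      */
        dst[dst_pos++] = 0;
        if (match >= 19) {                    /* 19 = 15 + MIN_MATCH            */
            size_t rem = match - 19;
            while (rem >= 255) { dst[dst_pos++] = 255; rem -= 255; }
            dst[dst_pos++] = (uint8_t)rem;
        }
    }
    return dst_pos;
}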
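
The segment descriptors collected in local memory are ordered by their start position (the .s0 component, compared in _order_short4 / _order_int4) with sort_odd_even() resp. sort_odd_even_int4(), which by name and structure appear to be barrier-synchronised odd-even transposition sorts. The serial sketch below shows the same transposition scheme on a plain int key array; it is a reading aid only, the device version distributes the compare-and-swap pairs over the workgroup threads and separates the phases with local barriers.

#include <stddef.h>

/* Serial odd-even transposition sort on int keys, standing in for the segment
 * descriptors that the kernels order by their .s0 (start) component. */
static void odd_even_sort(int *key, size_t n)
{
    int swapped = 1;
    while (swapped) {
        swapped = 0;
        for (size_t phase = 0; phase < 2; phase++) {   /* even pairs, then odd pairs */
            for (size_t i = phase; i + 1 < n; i += 2) {
                if (key[i] > key[i + 1]) {             /* compare-and-swap neighbours */
                    int tmp = key[i];
                    key[i] = key[i + 1];
                    key[i + 1] = tmp;
                    swapped = 1;
                }
            }
        }
    }
}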
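
When prefix_header is non-zero, LZ4_cmp_stage2 prepends the uncompressed size as a 4-byte little-endian header and shifts every segment's output position by 4 accordingly. A consumer of the compressed block would read it back as sketched below; the helper name is illustrative and not an API of the patch.

#include <stdint.h>

/* Illustrative: read the optional little-endian size prefix written by
 * LZ4_cmp_stage2 when prefix_header != 0. */
static uint32_t read_prefix_header(const uint8_t *compressed)
{
    return (uint32_t)compressed[0]
         | ((uint32_t)compressed[1] << 8)
         | ((uint32_t)compressed[2] << 16)
         | ((uint32_t)compressed[3] << 24);
}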