diff --git a/meson.build b/meson.build index fa98e58d01..f3848b76a3 100644 --- a/meson.build +++ b/meson.build @@ -130,6 +130,7 @@ taisei_c_args = [ '-fno-math-errno', '-fno-signaling-nans', '-fno-trapping-math', + '-mfp16-format=ieee', ] deprecation_warnings = get_option('deprecation_warnings') @@ -313,6 +314,128 @@ config.set('TAISEI_BUILDCONF_HAVE_ATTR_MALLOC_WITH_ARGS', cc.compiles( args : ['-Wattributes', '-Werror'] )) +config.set('TAISEI_BUILDCONF_F16_CVT_TYPE', false) +config.set('TAISEI_BUILDCONF_F16_STORAGE_TYPE', 'uint16_t') +config.set('TAISEI_BUILDCONF_F16_SIMD_TYPE', false) +config.set('TAISEI_BUILDCONF_F16_RT_ABI_TYPE', false) +config.set('TAISEI_BUILDCONF_F16_RT_FUNC_H2F', false) +config.set('TAISEI_BUILDCONF_F16_RT_FUNC_F2H', false) + +float16_cvt_types = { + # cvt : storage + '_Float16' : '_Float16', + '__fp16' : 'uint16_t', +} +float16_simd_types = ['_Float16'] +float16_rt_abi = 'none' +float16_rt_abi_typemap = { + 'native' : '_Float16', + 'integer' : 'uint16_t', +} +float16_rt_abi_choices = ['auto', 'none'] + float16_rt_abi_typemap.keys() +float16_rt_funcs = [ + [ '__extendhfsf2', '__truncsfhf2' ], + [ '__gnu_h2f_ieee', '__gnu_f2h_ieee' ], +] + +float16_rt_abi = meson.get_external_property('float16_rt_abi', 'auto') +if float16_rt_abi not in float16_rt_abi_choices + error('float16_rt_abi must be one of @0@'.format(', '.join(float16_rt_abi_choices))) +endif + +float16_have_native_conversion = false +float16_have_rtlib_conversion = false + +foreach cvt_type, storage_type : float16_cvt_types + if cc.sizeof(cvt_type) == 2 + config.set('TAISEI_BUILDCONF_F16_CVT_TYPE', cvt_type) + config.set('TAISEI_BUILDCONF_F16_STORAGE_TYPE', storage_type) + + if cvt_type in float16_simd_types + config.set('TAISEI_BUILDCONF_F16_SIMD_TYPE', cvt_type) + endif + + float16_rt_abi = 'none' + float16_have_native_conversion = true + break + endif +endforeach + +if float16_rt_abi != 'none' + foreach funcs : float16_rt_funcs + func_h2f = funcs[0] + func_f2h = funcs[1] + + if 
not cc.has_function(func_h2f) or not cc.has_function(func_f2h) + continue + endif + + if float16_rt_abi == 'auto' # probe which argument/return type the runtime functions actually use + float16_rt_abi = 'none' + + foreach abiname, abitype : float16_rt_abi_typemap + r = cc.run(f''' + #include <assert.h> + #include <stdint.h> + + #define NOINLINE __attribute__((noinline)) + + typedef @abitype@ f16_abi_t; + float @func_h2f@(f16_abi_t); + f16_abi_t @func_f2h@(float); + + NOINLINE float f16_to_f32(uint16_t x) { + union { + f16_abi_t _f16abi; + uint16_t _uint; + } u = { ._uint = x }; + return @func_h2f@(u._f16abi); + } + + NOINLINE uint16_t f32_to_f16(float x) { + union { + f16_abi_t _f16abi; + uint16_t _uint; + } u = { ._f16abi = @func_f2h@(x) }; + return u._uint; + } + + int main(int argc, char **argv) { + volatile float src = 420.69f; + const float expected = 420.75f; + volatile uint16_t half = f32_to_f16(src); + volatile float roundtrip = f16_to_f32(half); + assert(roundtrip == expected); + return 0; + } + ''', name : f'Test for @abiname@ float16 ABI') + + if r.compiled() and r.returncode() == 0 + float16_rt_abi = abiname + break + endif + endforeach + endif + + if float16_rt_abi == 'none' # every probed ABI failed the round-trip test; stop looking + break + endif + + abi_type = float16_rt_abi_typemap[float16_rt_abi] + config.set('TAISEI_BUILDCONF_F16_STORAGE_TYPE', abi_type) + config.set('TAISEI_BUILDCONF_F16_RT_ABI_TYPE', abi_type) + config.set('TAISEI_BUILDCONF_F16_RT_FUNC_H2F', func_h2f) + config.set('TAISEI_BUILDCONF_F16_RT_FUNC_F2H', func_f2h) + + if abi_type in float16_simd_types + config.set('TAISEI_BUILDCONF_F16_SIMD_TYPE', abi_type) + endif + + float16_have_rtlib_conversion = true + break + endforeach +endif + prefer_relpath_systems = [ 'windows', ] diff --git a/src/renderer/api.h b/src/renderer/api.h index c77422efe5..e56e439e71 100644 --- a/src/renderer/api.h +++ b/src/renderer/api.h @@ -229,6 +229,7 @@ typedef enum Primitive { typedef enum VertexAttribType { VA_FLOAT, + VA_HALF, VA_BYTE, VA_UBYTE, VA_SHORT, @@ -482,15 +483,10 @@ typedef struct SpriteParamsBuffer { typedef struct SpriteInstanceAttribs { 
mat4 mv_transform; mat4 tex_transform; - - union { - FloatRect texrect; - vec4 texrect_vec4; - }; - - Color rgba; - FloatExtent sprite_size; - ShaderCustomParams custom; + FloatRect texrect; + float16_storage_t sprite_size[2]; + float16_storage_t rgba[4]; + float16_storage_t custom[4]; // offsetof(end_of_fields) == size without padding. char end_of_fields; diff --git a/src/renderer/common/sprite_batch.c b/src/renderer/common/sprite_batch.c index 3827094777..e637353b85 100644 --- a/src/renderer/common/sprite_batch.c +++ b/src/renderer/common/sprite_batch.c @@ -64,7 +64,7 @@ void _r_sprite_batch_init(void) { size_t sz_vert = sizeof(GenericModelVertex); size_t sz_attr = SIZEOF_SPRITE_ATTRIBS; - #define VERTEX_OFS(attr) offsetof(GenericModelVertex, attr) + #define VERTEX_OFS(attr) offsetof(GenericModelVertex, attr) #define INSTANCE_OFS(attr) offsetof(SpriteInstanceAttribs, attr) VertexAttribFormat fmt[] = { @@ -83,10 +83,10 @@ void _r_sprite_batch_init(void) { { { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(tex_transform[1]), 1 }, { { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(tex_transform[2]), 1 }, { { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(tex_transform[3]), 1 }, - { { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(rgba), 1 }, + { { 4, VA_HALF, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(rgba), 1 }, { { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(texrect), 1 }, - { { 2, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(sprite_size), 1 }, - { { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(custom), 1 }, + { { 2, VA_HALF, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(sprite_size), 1 }, + { { 4, VA_HALF, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(custom), 1 }, }; #undef VERTEX_OFS @@ -213,12 +213,9 @@ static void _r_sprite_batch_compute_attribs( glm_translate(attribs.mv_transform, (vec3) { ofs.x / imgdims.w, ofs.y / imgdims.h }); } - if(params->color == NULL) { - // XXX: should 
we use r_color_current here? - attribs.rgba = *RGBA(1, 1, 1, 1); - } else { - attribs.rgba = *params->color; - } + // XXX: should we default to r_color_current here? + const Color *color = params->color ?: RGBA(1, 1, 1, 1); + f32v4_to_f16v4(attribs.rgba, color->rgba); attribs.texrect = spr->tex_area; @@ -232,12 +229,12 @@ static void _r_sprite_batch_compute_attribs( attribs.texrect.h *= -1; } - attribs.sprite_size = spr->extent; + f32v2_to_f16v2(attribs.sprite_size, spr->extent.as_array); if(params->shader_params == NULL) { - memset(&attribs.custom, 0, sizeof(attribs.custom)); + memset(attribs.custom, 0, sizeof(attribs.custom)); } else { - attribs.custom = *params->shader_params; + f32v4_to_f16v4(attribs.custom, params->shader_params->vector); } *out_attribs = attribs; diff --git a/src/renderer/gl33/vertex_array.c b/src/renderer/gl33/vertex_array.c index ad3397b7ac..75b5f288ed 100644 --- a/src/renderer/gl33/vertex_array.c +++ b/src/renderer/gl33/vertex_array.c @@ -16,6 +16,7 @@ static GLenum va_type_to_gl_type[] = { [VA_FLOAT] = GL_FLOAT, + [VA_HALF] = GL_HALF_FLOAT, [VA_BYTE] = GL_BYTE, [VA_UBYTE] = GL_UNSIGNED_BYTE, [VA_SHORT] = GL_SHORT, diff --git a/src/resource/font.c b/src/resource/font.c index cb1211c5e1..a6394ff7eb 100644 --- a/src/resource/font.c +++ b/src/resource/font.c @@ -1047,21 +1047,14 @@ static double _text_ucs4_draw(Font *font, const uint32_t *ucs4text, const TextPa text_ucs4_bbox(font, ucs4text, 0, &bbox); - Color color; - - if(params->color == NULL) { - // XXX: sprite batch code defaults this to RGB(1, 1, 1) - color = *r_color_current(); - } else { - color = *params->color; - } - - ShaderCustomParams shader_params; + SpriteInstanceAttribs init_attribs = {}; + // XXX: sprite batch code defaults this to RGBA(1, 1, 1, 1) + f32v4_to_f16v4(init_attribs.rgba, (params->color ?: r_color_current())->rgba); if(params->shader_params == NULL) { - memset(&shader_params, 0, sizeof(shader_params)); + memset(init_attribs.custom, 0, sizeof(init_attribs.custom)); 
} else { - shader_params = *params->shader_params; + f32v4_to_f16v4(init_attribs.custom, params->shader_params->vector); } mat4 mat_texture; @@ -1131,9 +1124,7 @@ static double _text_ucs4_draw(Font *font, const uint32_t *ucs4text, const TextPa Sprite *spr = &glyph->sprite; set_batch_texture(&batch_state_params, spr->tex); - SpriteInstanceAttribs attribs; - attribs.rgba = color; - attribs.custom = shader_params; + SpriteInstanceAttribs attribs = init_attribs; float g_x = x + glyph->metrics.bearing_x + spr->w * 0.5; float g_y = y - glyph->metrics.bearing_y + spr->h * 0.5 - font->metrics.descent; @@ -1147,8 +1138,7 @@ static double _text_ucs4_draw(Font *font, const uint32_t *ucs4text, const TextPa attribs.texrect = spr->tex_area; // NOTE: Glyphs have their sprite w/h unadjusted for scale. - attribs.sprite_size.w = spr->w * iscale; - attribs.sprite_size.h = spr->h * iscale; + f32v2_to_f16v2(attribs.sprite_size, (float[2]) { spr->w * iscale, spr->h * iscale }); if(params->glyph_callback.func != NULL) { params->glyph_callback.func(font, uchar, &attribs, params->glyph_callback.userdata); diff --git a/src/stagedraw.c b/src/stagedraw.c index 19aea1d3f8..1d6d872d37 100644 --- a/src/stagedraw.c +++ b/src/stagedraw.c @@ -1092,7 +1092,7 @@ static int draw_numeric_callback(Font *font, charcode_t charcode, SpriteInstance st->color1 = st->color2; } - spr_attribs->rgba = *st->color1; + f32v4_to_f16v4(spr_attribs->rgba, st->color1->rgba); return 0; } diff --git a/src/util.h b/src/util.h index 1a04b05d97..4919f050c8 100644 --- a/src/util.h +++ b/src/util.h @@ -21,6 +21,7 @@ #include "util/miscmath.h" // #include "util/pngcruft.h" #include "util/stringops.h" +#include "util/float16.h" // FIXME: might not be the best place for these #include "log.h" diff --git a/src/util/float16.h b/src/util/float16.h new file mode 100644 index 0000000000..d1f7f610e6 --- /dev/null +++ b/src/util/float16.h @@ -0,0 +1,148 @@ +/* + * This software is licensed under the terms of the MIT License. 
+ * See COPYING for further information. + * --- + * Copyright (c) 2011-2019, Lukas Weber . + * Copyright (c) 2012-2019, Andrei Alexeyev . +*/ + +#pragma once +#include "taisei.h" + +/* + * NOTE: This is a storage-only format. You must not directly initialize it or perform math + * operations on it. + */ +typedef struct float16_storage { + TAISEI_BUILDCONF_F16_STORAGE_TYPE _storage; +} float16_storage_t; + +#if defined(TAISEI_BUILDCONF_F16_CVT_TYPE) + +// Compiler has native support for float16 conversions through a special type. +// Note that it might not be possible to return that type from functions or pass it as arguments +// directly. + +typedef TAISEI_BUILDCONF_F16_CVT_TYPE float16_cvt_t; + +union f16_cvt { + float16_cvt_t as_cvt; + float16_storage_t as_storage; +}; + +attr_const +INLINE float16_storage_t f32_to_f16(float x) { + assert(isfinite(x)); + return ((union f16_cvt) { .as_cvt = x }).as_storage; +} + +attr_const +INLINE float f16_to_f32(float16_storage_t x) { + return ((union f16_cvt) { .as_storage = x }).as_cvt; +} + +#elif \ + defined(TAISEI_BUILDCONF_F16_RT_ABI_TYPE) && \ + defined(TAISEI_BUILDCONF_F16_RT_FUNC_F2H) && \ + defined(TAISEI_BUILDCONF_F16_RT_FUNC_H2F) + +// Conversion functions are available as part of the runtime library + +typedef TAISEI_BUILDCONF_F16_RT_ABI_TYPE float16_rtabi_t; + +float TAISEI_BUILDCONF_F16_RT_FUNC_H2F(float16_rtabi_t); +float16_rtabi_t TAISEI_BUILDCONF_F16_RT_FUNC_F2H(float); + +union f16_rtabi_cvt { + float16_rtabi_t as_rtabi; + float16_storage_t as_storage; +}; + +attr_const +INLINE float16_storage_t f32_to_f16(float x) { + assert(isfinite(x)); + return ((union f16_rtabi_cvt) { + .as_rtabi = TAISEI_BUILDCONF_F16_RT_FUNC_F2H(x) + }).as_storage; +} + +attr_const +INLINE float f16_to_f32(float16_storage_t x) { + return TAISEI_BUILDCONF_F16_RT_FUNC_H2F( + ((union f16_rtabi_cvt) { .as_storage = x }).as_rtabi + ); +} + +#else + +// Resort to vendored fallbacks + +float16_storage_t f32_to_f16(float x) attr_const; +float 
f16_to_f32(float16_storage_t x) attr_const; + +#endif + +// Vector operations + +#define F16_DEFINE_VECTOR_CONVERSION_SCALAR(vecsize) \ + INLINE void f32v##vecsize##_to_f16v##vecsize(float16_storage_t dst[vecsize], const float src[vecsize]) { \ + for(int i = 0; i < vecsize; ++i) { \ + dst[i] = f32_to_f16(src[i]); \ + } \ + } \ + \ + INLINE void f16v##vecsize##_to_f32v##vecsize(float dst[vecsize], const float16_storage_t src[vecsize]) { \ + for(int i = 0; i < vecsize; ++i) { \ + dst[i] = f16_to_f32(src[i]); \ + } \ + } + +#ifdef TAISEI_BUILDCONF_F16_SIMD_TYPE + +typedef TAISEI_BUILDCONF_F16_SIMD_TYPE f16_simd_t; + +// NOTE: Sadly GCC 12 still can't vectorize this, but clang can. Copies are sized by the array (vecsize elements), never by sizeof(vector): the vector type may be larger than the array (clang pads 3-lane vectors to 4 lanes). + +#define F16_DEFINE_VECTOR_CONVERSION(vecsize) \ + typedef float f32v##vecsize##simd __attribute__((vector_size(vecsize * sizeof(float)))); \ + typedef f16_simd_t f16v##vecsize##simd __attribute__((vector_size(vecsize * sizeof(f16_simd_t)))); \ + \ + INLINE void f32v##vecsize##_to_f16v##vecsize(float16_storage_t dst[vecsize], const float src[vecsize]) { \ + f32v##vecsize##simd v32_simd = { 0 }; /* zero any padding lane */ \ + memcpy(&v32_simd, src, sizeof(float[vecsize])); \ + auto v16_simd = __builtin_convertvector(v32_simd, f16v##vecsize##simd); \ + memcpy(dst, &v16_simd, sizeof(float16_storage_t[vecsize])); \ + } \ + \ + INLINE void f16v##vecsize##_to_f32v##vecsize(float dst[vecsize], const float16_storage_t src[vecsize]) { \ + f16v##vecsize##simd v16_simd = { 0 }; /* zero any padding lane */ \ + memcpy(&v16_simd, src, sizeof(float16_storage_t[vecsize])); \ + auto v32_simd = __builtin_convertvector(v16_simd, f32v##vecsize##simd); \ + memcpy(dst, &v32_simd, sizeof(float[vecsize])); \ + } + +#else + +#define F16_DEFINE_VECTOR_CONVERSION(vecsize) \ + F16_DEFINE_VECTOR_CONVERSION_SCALAR(vecsize) + +#endif + +/* + * Defines functions: + * + * void f16vX_to_f32vX(float dst[X], const float16_storage_t src[X]); + * void f32vX_to_f16vX(float16_storage_t dst[X], const float src[X]); + * + * Where X is the vector size. 
+ */ + +F16_DEFINE_VECTOR_CONVERSION(4) + +#ifdef __clang__ + F16_DEFINE_VECTOR_CONVERSION(3) +#else + F16_DEFINE_VECTOR_CONVERSION_SCALAR(3) +#endif + +F16_DEFINE_VECTOR_CONVERSION(2) diff --git a/src/util/float16_fallback.c b/src/util/float16_fallback.c new file mode 100644 index 0000000000..cc7e155725 --- /dev/null +++ b/src/util/float16_fallback.c @@ -0,0 +1,40 @@ +/* + * This software is licensed under the terms of the MIT License. + * See COPYING for further information. + * --- + * Copyright (c) 2011-2019, Lukas Weber . + * Copyright (c) 2012-2019, Andrei Alexeyev . +*/ + +#include "taisei.h" + +#include "float16.h" + +// Evil bit hackery stolen from stack overflow: https://stackoverflow.com/a/60047308 + +float16_storage_t f32_to_f16(float x) { + assert(isfinite(x)); + // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits + uint32_t b = UNION_CAST(float, uint32_t, x) + 0x00001000; // round-to-nearest-even: add last bit after truncated mantissa + uint32_t e = (b & 0x7F800000) >> 23; // exponent + uint32_t m = (b & 0x007FFFFF); // mantissa; in line below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding + return UNION_CAST(uint16_t, float16_storage_t, + // sign : normalized : denormalized : saturate + (b & 0x80000000) >> 16 | (e > 112) * ((((e - 112) << 10) & 0x7C00) | m >> 13) | + ((e < 113) & (e > 101)) * ((((0x007FF000 + m) >> (125 - e)) + 1) >> 1) | + (e > 143) * 0x7FFF + ); +} + +float f16_to_f32(float16_storage_t f16) { + // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits + uint16_t x = UNION_CAST(float16_storage_t, uint16_t, f16); + uint32_t e = (x & 0x7C00) >> 10; // exponent + uint32_t m = (x & 0x03FF) << 13; // mantissa + uint32_t v = UNION_CAST(float, uint32_t, m) >> 23; // evil log2 bit hack to count leading zeros in denormalized format + return 
UNION_CAST(uint32_t, float, + // sign : normalized : denormalized + (x & 0x8000) << 16 | (e != 0) * ((e + 112) << 23 | m) | + ((e == 0) & (m != 0)) * ((v - 37) << 23 | ((m << (150 - v)) & 0x007FE000)) + ); +} diff --git a/src/util/meson.build b/src/util/meson.build index 6fe8aa53c9..df0fd3bb0f 100644 --- a/src/util/meson.build +++ b/src/util/meson.build @@ -43,3 +43,7 @@ if dep_gamemode.found() else util_src += files('gamemode_stub.c') endif + +if not (float16_have_native_conversion or float16_have_rtlib_conversion) + util_src += files('float16_fallback.c') +endif