From 4301cc2cddd65c51049b7c399578547900ad3237 Mon Sep 17 00:00:00 2001
From: Andrei Alexeyev <akari@taisei-project.org>
Date: Fri, 13 Jan 2023 10:36:35 +0100
Subject: [PATCH 1/4] build,util/float16: add conversion routines between
 float32 and float16

First tries to use the compiler's native float16 type, such as _Float16
or __fp16. This is the best case, because these conversions can compile
to single asm instructions on supported architectures; on other targets
the compiler itself falls back to calling a generic routine from the
runtime library (libgcc or compiler-rt).

Failing that, we'll try to call the rt library routines directly. This
is tricky, because there are no headers for those, and we must get the
ABI right. We must compile and run some code to detect the ABI, which is
not always possible when cross-compiling. For that reason, you can set
the `float16_rt_abi` property to either `native`, `integer`, or `none`
in the cross file.

The `native` ABI assumes that float16s are passed as the compiler's
_Float16 type (e.g. in xmm registers on x86_64). This is currently not
very useful, since on compilers that have _Float16 we'll simply use the
intrinsic conversions.

The `integer` ABI assumes that float16s are passed as uint16_t.

`none` means that no attempt to use the runtime library routines will be
made, even if they exist.

If we can use neither intrinsic conversions nor the runtime library,
we'll fall back to questionable bit hackery. This method currently does
not support infinities and NaNs.
---
 meson.build                 | 111 ++++++++++++++++++++++++++++++++++++
 src/util.h                  |   1 +
 src/util/float16.h          |  83 +++++++++++++++++++++++++++
 src/util/float16_fallback.c |  40 +++++++++++++
 src/util/meson.build        |   4 ++
 5 files changed, 239 insertions(+)
 create mode 100644 src/util/float16.h
 create mode 100644 src/util/float16_fallback.c

diff --git a/meson.build b/meson.build
index fa98e58d01..aa3804b4b5 100644
--- a/meson.build
+++ b/meson.build
@@ -130,6 +130,7 @@ taisei_c_args = [
     '-fno-math-errno',
     '-fno-signaling-nans',
     '-fno-trapping-math',
+    '-mfp16-format=ieee',
 ]
 
 deprecation_warnings = get_option('deprecation_warnings')
@@ -313,6 +314,116 @@ config.set('TAISEI_BUILDCONF_HAVE_ATTR_MALLOC_WITH_ARGS', cc.compiles(
     args : ['-Wattributes', '-Werror']
 ))
 
+config.set('TAISEI_BUILDCONF_F16_CVT_TYPE', false)
+config.set('TAISEI_BUILDCONF_F16_STORAGE_TYPE', 'uint16_t')
+config.set('TAISEI_BUILDCONF_F16_RT_ABI_TYPE', false)
+config.set('TAISEI_BUILDCONF_F16_RT_FUNC_H2F', false)
+config.set('TAISEI_BUILDCONF_F16_RT_FUNC_F2H', false)
+
+float16_cvt_types = {
+    #      cvt : storage
+    '_Float16' : '_Float16',
+    '__fp16'   : 'uint16_t',
+}
+float16_rt_abi = 'none'
+float16_rt_abi_typemap = {
+    'native'  : '_Float16',
+    'integer' : 'uint16_t',
+}
+float16_rt_abi_choices = ['auto', 'none'] + float16_rt_abi_typemap.keys()
+float16_rt_funcs = [
+    [ '__extendhfsf2', '__truncsfhf2' ],
+    [ '__gnu_h2f_ieee', '__gnu_f2h_ieee' ],
+]
+
+float16_rt_abi = meson.get_external_property('float16_rt_abi', 'auto')
+if float16_rt_abi not in float16_rt_abi_choices
+    error('float16_rt_abi must be one of @0@'.format(', '.join(float16_rt_abi_choices)))
+endif
+
+float16_have_native_conversion = false
+float16_have_rtlib_conversion = false
+
+foreach cvt_type, storage_type : float16_cvt_types
+    if cc.sizeof(cvt_type) == 2
+        config.set('TAISEI_BUILDCONF_F16_CVT_TYPE', cvt_type)
+        config.set('TAISEI_BUILDCONF_F16_STORAGE_TYPE', storage_type)
+        float16_rt_abi = 'none'
+        float16_have_native_conversion = true
+        break
+    endif
+endforeach
+
+if float16_rt_abi != 'none'
+    foreach funcs : float16_rt_funcs
+        func_h2f = funcs[0]
+        func_f2h = funcs[1]
+
+        if not cc.has_function(func_h2f) or not cc.has_function(func_f2h)
+            continue
+        endif
+
+        if float16_rt_abi == 'auto'
+            float16_rt_abi = 'none'
+
+            foreach abiname, abitype : float16_rt_abi_typemap
+                r = cc.run(f'''
+                    #include <assert.h>
+                    #include <stdint.h>
+
+                    #define NOINLINE __attribute__((noinline))
+
+                    typedef @abitype@ f16_abi_t;
+                    float @func_h2f@(f16_abi_t);
+                    f16_abi_t @func_f2h@(float);
+
+                    NOINLINE float f16_to_f32(uint16_t x) {
+                        union {
+                            f16_abi_t _f16abi;
+                            uint16_t _uint;
+                        } u = { ._uint =  x };
+                        return @func_h2f@(u._f16abi);
+                    }
+
+                    NOINLINE uint16_t f32_to_f16(float  x) {
+                        union {
+                            f16_abi_t _f16abi;
+                            uint16_t _uint;
+                        } u = { ._f16abi =  @func_f2h@(x) };
+                        return u._uint;
+                    }
+
+                    int main(int argc, char **argv) {
+                        volatile float src = 420.69f;
+                        const float expected = 420.75f;
+                        volatile uint16_t half = f32_to_f16(src);
+                        volatile float roundtrip = f16_to_f32(half);
+                        assert(roundtrip == expected);
+                        return 0;
+                    }
+                ''', name : f'Test for @abiname@ float16 ABI')
+
+                if r.compiled() and r.returncode() == 0
+                    float16_rt_abi = abiname
+                    break
+                endif
+            endforeach
+        endif
+
+        if float16_rt_abi == 'none'
+            break
+        endif
+
+        abi_type = float16_rt_abi_typemap[float16_rt_abi]
+        config.set('TAISEI_BUILDCONF_F16_STORAGE_TYPE', abi_type)
+        config.set('TAISEI_BUILDCONF_F16_RT_ABI_TYPE', abi_type)
+        config.set('TAISEI_BUILDCONF_F16_RT_FUNC_H2F', func_h2f)
+        config.set('TAISEI_BUILDCONF_F16_RT_FUNC_F2H', func_f2h)
+        float16_have_rtlib_conversion = true
+        break
+    endforeach
+endif
+
 prefer_relpath_systems = [
     'windows',
 ]
diff --git a/src/util.h b/src/util.h
index 1a04b05d97..4919f050c8 100644
--- a/src/util.h
+++ b/src/util.h
@@ -21,6 +21,7 @@
 #include "util/miscmath.h"
 // #include "util/pngcruft.h"
 #include "util/stringops.h"
+#include "util/float16.h"
 
 // FIXME: might not be the best place for these
 #include "log.h"
diff --git a/src/util/float16.h b/src/util/float16.h
new file mode 100644
index 0000000000..a387d274c6
--- /dev/null
+++ b/src/util/float16.h
@@ -0,0 +1,83 @@
+/*
+ * This software is licensed under the terms of the MIT License.
+ * See COPYING for further information.
+ * ---
+ * Copyright (c) 2011-2019, Lukas Weber <laochailan@web.de>.
+ * Copyright (c) 2012-2019, Andrei Alexeyev <akari@taisei-project.org>.
+*/
+
+#pragma once
+#include "taisei.h"
+
+/*
+ * NOTE: This is a storage-only format. You must not directly initialize it or perform math
+ * operations on it.
+ */
+typedef struct float16_storage {
+	TAISEI_BUILDCONF_F16_STORAGE_TYPE _storage;
+} float16_storage_t;
+
+#if defined(TAISEI_BUILDCONF_F16_CVT_TYPE)
+
+// Compiler has native support for float16 conversions through a special type.
+// Note that it might not be possible to return that type from functions or pass it as arguments
+// directly.
+
+typedef TAISEI_BUILDCONF_F16_CVT_TYPE float16_cvt_t;
+
+union f16_cvt {
+	float16_cvt_t as_cvt;
+	float16_storage_t as_storage;
+};
+
+attr_const
+INLINE float16_storage_t f32_to_f16(float x) {
+	assert(isfinite(x));
+	return ((union f16_cvt) { .as_cvt = x }).as_storage;
+}
+
+attr_const
+INLINE float f16_to_f32(float16_storage_t x) {
+	return ((union f16_cvt) { .as_storage = x }).as_cvt;
+}
+
+#elif \
+	defined(TAISEI_BUILDCONF_F16_RT_ABI_TYPE) && \
+	defined(TAISEI_BUILDCONF_F16_RT_FUNC_F2H)	 && \
+	defined(TAISEI_BUILDCONF_F16_RT_FUNC_H2F)
+
+// Conversion functions are available as part of the runtime library
+
+typedef TAISEI_BUILDCONF_F16_RT_ABI_TYPE float16_rtabi_t;
+
+float TAISEI_BUILDCONF_F16_RT_FUNC_H2F(float16_rtabi_t);
+float16_rtabi_t TAISEI_BUILDCONF_F16_RT_FUNC_F2H(float);
+
+union f16_rtabi_cvt {
+	float16_rtabi_t as_rtabi;
+	float16_storage_t as_storage;
+};
+
+attr_const
+INLINE float16_storage_t f32_to_f16(float x) {
+	assert(isfinite(x));
+	return ((union f16_rtabi_cvt) {
+		.as_rtabi = TAISEI_BUILDCONF_F16_RT_FUNC_F2H(x)
+	}).as_storage;
+}
+
+attr_const
+INLINE float f16_to_f32(float16_storage_t x) {
+	return TAISEI_BUILDCONF_F16_RT_FUNC_H2F(
+		((union f16_rtabi_cvt) { .as_storage = x }).as_rtabi
+	);
+}
+
+#else
+
+// Resort to vendored fallbacks
+
+float f16_to_f32(float16_storage_t x) attr_const;
+float16_storage_t f32_to_f16(float x) attr_const;
+
+#endif
diff --git a/src/util/float16_fallback.c b/src/util/float16_fallback.c
new file mode 100644
index 0000000000..cc7e155725
--- /dev/null
+++ b/src/util/float16_fallback.c
@@ -0,0 +1,40 @@
+/*
+ * This software is licensed under the terms of the MIT License.
+ * See COPYING for further information.
+ * ---
+ * Copyright (c) 2011-2019, Lukas Weber <laochailan@web.de>.
+ * Copyright (c) 2012-2019, Andrei Alexeyev <akari@taisei-project.org>.
+*/
+
+#include "taisei.h"
+
+#include "float16.h"
+
+// Evil bit hackery stolen from stack overflow: https://stackoverflow.com/a/60047308
+
+float16_storage_t f32_to_f16(float x) {
+	assert(isfinite(x));
+	// Half-float layout 1-5-10, bias 15, max +-131008.0, min normal +-6.1035156E-5, min subnormal +-5.9604645E-8, 3.311 digits. Exponent 31 is used as an ordinary exponent here, so this scheme has no Inf/NaN encodings.
+	uint32_t b = UNION_CAST(float, uint32_t, x) + 0x00001000; // rounding: add half of the last kept mantissa bit before truncation. NOTE(review): ties look like they round away from zero, not to even - confirm
+	uint32_t e = (b & 0x7F800000) >> 23; // exponent
+	uint32_t m = (b & 0x007FFFFF); // mantissa; in line below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding
+	return UNION_CAST(uint16_t, float16_storage_t,
+		// sign : normalized : denormalized : saturate. NOTE(review): exponent-31 results and the 0x7FFF saturation value read as Inf/NaN to strict IEEE-754 binary16 consumers
+		(b & 0x80000000) >> 16 | (e > 112) * ((((e - 112) << 10) & 0x7C00) | m >> 13) |
+		((e < 113) & (e > 101)) * ((((0x007FF000 + m) >> (125 - e)) + 1) >> 1) |
+		(e > 143) * 0x7FFF
+	);
+}
+
+float f16_to_f32(float16_storage_t f16) {
+	// Decodes the 1-5-10, bias 15 half-float layout (no infinity). NOTE(review): exponent 31 is expanded like any other exponent, so Inf/NaN bit patterns come back as finite values up to +-131008.0
+	uint16_t x = UNION_CAST(float16_storage_t, uint16_t, f16);
+	uint32_t e = (x & 0x7C00) >> 10; // exponent
+	uint32_t m = (x & 0x03FF) << 13; // mantissa
+	uint32_t v = UNION_CAST(float, uint32_t, m) >> 23; // evil log2 bit hack to count leading zeros in denormalized format
+	return UNION_CAST(uint32_t, float,
+		// sign : normalized : denormalized
+		(x & 0x8000) << 16 | (e != 0) * ((e + 112) << 23 | m) |
+		((e == 0) & (m != 0)) * ((v - 37) << 23 | ((m << (150 - v)) & 0x007FE000))
+	);
+}
diff --git a/src/util/meson.build b/src/util/meson.build
index 6fe8aa53c9..df0fd3bb0f 100644
--- a/src/util/meson.build
+++ b/src/util/meson.build
@@ -43,3 +43,7 @@ if dep_gamemode.found()
 else
     util_src += files('gamemode_stub.c')
 endif
+
+if not (float16_have_native_conversion or float16_have_rtlib_conversion)
+    util_src += files('float16_fallback.c')
+endif

From 03b27ffaf87685101ae8f67cc65952c7ba12498d Mon Sep 17 00:00:00 2001
From: Andrei Alexeyev <akari@taisei-project.org>
Date: Sat, 14 Jan 2023 14:28:45 +0100
Subject: [PATCH 2/4] util/float16: add vector/array conversion functions

These can be optimized into SIMD operations in some cases
---
 meson.build        | 12 +++++++++
 src/util/float16.h | 67 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/meson.build b/meson.build
index aa3804b4b5..f3848b76a3 100644
--- a/meson.build
+++ b/meson.build
@@ -316,6 +316,7 @@ config.set('TAISEI_BUILDCONF_HAVE_ATTR_MALLOC_WITH_ARGS', cc.compiles(
 
 config.set('TAISEI_BUILDCONF_F16_CVT_TYPE', false)
 config.set('TAISEI_BUILDCONF_F16_STORAGE_TYPE', 'uint16_t')
+config.set('TAISEI_BUILDCONF_F16_SIMD_TYPE', false)
 config.set('TAISEI_BUILDCONF_F16_RT_ABI_TYPE', false)
 config.set('TAISEI_BUILDCONF_F16_RT_FUNC_H2F', false)
 config.set('TAISEI_BUILDCONF_F16_RT_FUNC_F2H', false)
@@ -325,6 +326,7 @@ float16_cvt_types = {
     '_Float16' : '_Float16',
     '__fp16'   : 'uint16_t',
 }
+float16_simd_types = ['_Float16']
 float16_rt_abi = 'none'
 float16_rt_abi_typemap = {
     'native'  : '_Float16',
@@ -348,6 +350,11 @@ foreach cvt_type, storage_type : float16_cvt_types
     if cc.sizeof(cvt_type) == 2
         config.set('TAISEI_BUILDCONF_F16_CVT_TYPE', cvt_type)
         config.set('TAISEI_BUILDCONF_F16_STORAGE_TYPE', storage_type)
+
+        if cvt_type in float16_simd_types
+            config.set('TAISEI_BUILDCONF_F16_SIMD_TYPE', cvt_type)
+        endif
+
         float16_rt_abi = 'none'
         float16_have_native_conversion = true
         break
@@ -419,6 +426,11 @@ if float16_rt_abi != 'none'
         config.set('TAISEI_BUILDCONF_F16_RT_ABI_TYPE', abi_type)
         config.set('TAISEI_BUILDCONF_F16_RT_FUNC_H2F', func_h2f)
         config.set('TAISEI_BUILDCONF_F16_RT_FUNC_F2H', func_f2h)
+
+        if abi_type in float16_simd_types
+            config.set('TAISEI_BUILDCONF_F16_SIMD_TYPE', abi_type)
+        endif
+
         float16_have_rtlib_conversion = true
         break
     endforeach
diff --git a/src/util/float16.h b/src/util/float16.h
index a387d274c6..d1f7f610e6 100644
--- a/src/util/float16.h
+++ b/src/util/float16.h
@@ -77,7 +77,72 @@ INLINE float f16_to_f32(float16_storage_t x) {
 
 // Resort to vendored fallbacks
 
-float f16_to_f32(float16_storage_t x) attr_const;
 float16_storage_t f32_to_f16(float x) attr_const;
+float f16_to_f32(float16_storage_t x) attr_const;
+
+#endif
+
+// Vector operations
+
+#define F16_DEFINE_VECTOR_CONVERSION_SCALAR(vecsize) \
+	INLINE void f32v##vecsize##_to_f16v##vecsize(float16_storage_t dst[vecsize], const float src[vecsize]) { \
+		for(int i = 0; i < vecsize; ++i) { \
+			dst[i] = f32_to_f16(src[i]); \
+		} \
+	} \
+	\
+	INLINE void f16v##vecsize##_to_f32v##vecsize(float dst[vecsize], const float16_storage_t src[vecsize]) { \
+		for(int i = 0; i < vecsize; ++i) { \
+			dst[i] = f16_to_f32(src[i]); \
+		} \
+	}
+
+#ifdef TAISEI_BUILDCONF_F16_SIMD_TYPE
+
+typedef TAISEI_BUILDCONF_F16_SIMD_TYPE f16_simd_t;
+
+// NOTE: Sadly GCC 12 still can't vectorize this, but clang can.
+
+#define F16_DEFINE_VECTOR_CONVERSION(vecsize) \
+	typedef float      f32v##vecsize##simd __attribute__((vector_size(vecsize * sizeof(float)))); \
+	typedef f16_simd_t f16v##vecsize##simd __attribute__((vector_size(vecsize * sizeof(f16_simd_t)))); \
+	/* Copy exactly vecsize elements: sizeof() of a vector type may be padded up (e.g. 3 lanes stored as 4). */ \
+	INLINE void f32v##vecsize##_to_f16v##vecsize(float16_storage_t dst[vecsize], const float src[vecsize]) { \
+		f32v##vecsize##simd v32_simd = { 0 }; \
+		memcpy(&v32_simd, src, (vecsize) * sizeof(*src)); \
+		auto v16_simd = __builtin_convertvector(v32_simd, f16v##vecsize##simd); \
+		memcpy(dst, &v16_simd, (vecsize) * sizeof(*dst)); \
+	} \
+	/* Inverse of the above; same element-count-based copies, padding lanes are zeroed and never stored. */ \
+	INLINE void f16v##vecsize##_to_f32v##vecsize(float dst[vecsize], const float16_storage_t src[vecsize]) { \
+		f16v##vecsize##simd v16_simd = { 0 }; \
+		memcpy(&v16_simd, src, (vecsize) * sizeof(*src)); \
+		auto v32_simd = __builtin_convertvector(v16_simd, f32v##vecsize##simd); \
+		memcpy(dst, &v32_simd, (vecsize) * sizeof(*dst)); \
+	}
+
+#else
+
+#define F16_DEFINE_VECTOR_CONVERSION(vecsize) \
+	F16_DEFINE_VECTOR_CONVERSION_SCALAR(vecsize)
 
 #endif
+
+/*
+ * Defines functions:
+ *
+ *		void f16vX_to_f32vX(float dst[X], const float16_storage_t src[X]);
+ *		void f32vX_to_f16vX(float16_storage_t dst[X], const float src[X]);
+ *
+ * Where X is the vector size.
+ */
+
+F16_DEFINE_VECTOR_CONVERSION(4)
+
+#ifdef __clang__
+	F16_DEFINE_VECTOR_CONVERSION(3)
+#else
+	F16_DEFINE_VECTOR_CONVERSION_SCALAR(3)
+#endif
+
+F16_DEFINE_VECTOR_CONVERSION(2)

From 4df62c053b810c8f46ef36ba12add7b70a946487 Mon Sep 17 00:00:00 2001
From: Andrei Alexeyev <akari@taisei-project.org>
Date: Fri, 13 Jan 2023 11:03:20 +0100
Subject: [PATCH 3/4] renderer,gl33: support float16 data in vertex arrays

---
 src/renderer/api.h               | 1 +
 src/renderer/gl33/vertex_array.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/renderer/api.h b/src/renderer/api.h
index c77422efe5..fa75a4bf6f 100644
--- a/src/renderer/api.h
+++ b/src/renderer/api.h
@@ -229,6 +229,7 @@ typedef enum Primitive {
 
 typedef enum VertexAttribType {
 	VA_FLOAT,
+	VA_HALF,
 	VA_BYTE,
 	VA_UBYTE,
 	VA_SHORT,
diff --git a/src/renderer/gl33/vertex_array.c b/src/renderer/gl33/vertex_array.c
index ad3397b7ac..75b5f288ed 100644
--- a/src/renderer/gl33/vertex_array.c
+++ b/src/renderer/gl33/vertex_array.c
@@ -16,6 +16,7 @@
 
 static GLenum va_type_to_gl_type[] = {
 	[VA_FLOAT]  = GL_FLOAT,
+	[VA_HALF]   = GL_HALF_FLOAT,
 	[VA_BYTE]   = GL_BYTE,
 	[VA_UBYTE]  = GL_UNSIGNED_BYTE,
 	[VA_SHORT]  = GL_SHORT,

From e3ea71e2feb7ecaa2a19323bca5a03f862815ab8 Mon Sep 17 00:00:00 2001
From: Andrei Alexeyev <akari@taisei-project.org>
Date: Fri, 13 Jan 2023 11:03:52 +0100
Subject: [PATCH 4/4] renderer: use float16 for sprite color, size, and custom
 params

---
 src/renderer/api.h                 | 13 ++++---------
 src/renderer/common/sprite_batch.c | 23 ++++++++++-------------
 src/resource/font.c                | 24 +++++++-----------------
 src/stagedraw.c                    |  2 +-
 4 files changed, 22 insertions(+), 40 deletions(-)

diff --git a/src/renderer/api.h b/src/renderer/api.h
index fa75a4bf6f..e56e439e71 100644
--- a/src/renderer/api.h
+++ b/src/renderer/api.h
@@ -483,15 +483,10 @@ typedef struct SpriteParamsBuffer {
 typedef struct SpriteInstanceAttribs {
 	mat4 mv_transform;
 	mat4 tex_transform;
-
-	union {
-		FloatRect texrect;
-		vec4 texrect_vec4;
-	};
-
-	Color rgba;
-	FloatExtent sprite_size;
-	ShaderCustomParams custom;
+	FloatRect texrect;
+	float16_storage_t sprite_size[2];
+	float16_storage_t rgba[4];
+	float16_storage_t custom[4];
 
 	// offsetof(end_of_fields) == size without padding.
 	char end_of_fields;
diff --git a/src/renderer/common/sprite_batch.c b/src/renderer/common/sprite_batch.c
index 3827094777..e637353b85 100644
--- a/src/renderer/common/sprite_batch.c
+++ b/src/renderer/common/sprite_batch.c
@@ -64,7 +64,7 @@ void _r_sprite_batch_init(void) {
 	size_t sz_vert = sizeof(GenericModelVertex);
 	size_t sz_attr = SIZEOF_SPRITE_ATTRIBS;
 
-	#define VERTEX_OFS(attr)   offsetof(GenericModelVertex,  attr)
+	#define VERTEX_OFS(attr)   offsetof(GenericModelVertex,    attr)
 	#define INSTANCE_OFS(attr) offsetof(SpriteInstanceAttribs, attr)
 
 	VertexAttribFormat fmt[] = {
@@ -83,10 +83,10 @@ void _r_sprite_batch_init(void) {
 		{ { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(tex_transform[1]), 1 },
 		{ { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(tex_transform[2]), 1 },
 		{ { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(tex_transform[3]), 1 },
-		{ { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(rgba),             1 },
+		{ { 4, VA_HALF,  VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(rgba),             1 },
 		{ { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(texrect),          1 },
-		{ { 2, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(sprite_size),      1 },
-		{ { 4, VA_FLOAT, VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(custom),           1 },
+		{ { 2, VA_HALF,  VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(sprite_size),      1 },
+		{ { 4, VA_HALF,  VA_CONVERT_FLOAT, 1 }, sz_attr, INSTANCE_OFS(custom),           1 },
 	};
 
 	#undef VERTEX_OFS
@@ -213,12 +213,9 @@ static void _r_sprite_batch_compute_attribs(
 		glm_translate(attribs.mv_transform, (vec3) { ofs.x / imgdims.w, ofs.y / imgdims.h });
 	}
 
-	if(params->color == NULL) {
-		// XXX: should we use r_color_current here?
-		attribs.rgba = *RGBA(1, 1, 1, 1);
-	} else {
-		attribs.rgba = *params->color;
-	}
+	// XXX: should we default to r_color_current here?
+	const Color *color = params->color ?: RGBA(1, 1, 1, 1);
+	f32v4_to_f16v4(attribs.rgba, color->rgba);
 
 	attribs.texrect = spr->tex_area;
 
@@ -232,12 +229,12 @@ static void _r_sprite_batch_compute_attribs(
 		attribs.texrect.h *= -1;
 	}
 
-	attribs.sprite_size = spr->extent;
+	f32v2_to_f16v2(attribs.sprite_size, spr->extent.as_array);
 
 	if(params->shader_params == NULL) {
-		memset(&attribs.custom, 0, sizeof(attribs.custom));
+		memset(attribs.custom, 0, sizeof(attribs.custom));
 	} else {
-		attribs.custom = *params->shader_params;
+		f32v4_to_f16v4(attribs.custom, params->shader_params->vector);
 	}
 
 	*out_attribs = attribs;
diff --git a/src/resource/font.c b/src/resource/font.c
index cb1211c5e1..a6394ff7eb 100644
--- a/src/resource/font.c
+++ b/src/resource/font.c
@@ -1047,21 +1047,14 @@ static double _text_ucs4_draw(Font *font, const uint32_t *ucs4text, const TextPa
 
 	text_ucs4_bbox(font, ucs4text, 0, &bbox);
 
-	Color color;
-
-	if(params->color == NULL) {
-		// XXX: sprite batch code defaults this to RGB(1, 1, 1)
-		color = *r_color_current();
-	} else {
-		color = *params->color;
-	}
-
-	ShaderCustomParams shader_params;
+	SpriteInstanceAttribs init_attribs = {};
+	// XXX: sprite batch code defaults this to RGBA(1, 1, 1, 1)
+	f32v4_to_f16v4(init_attribs.rgba, (params->color ?: r_color_current())->rgba);
 
 	if(params->shader_params == NULL) {
-		memset(&shader_params, 0, sizeof(shader_params));
+		memset(init_attribs.custom, 0, sizeof(init_attribs.custom));
 	} else {
-		shader_params = *params->shader_params;
+		f32v4_to_f16v4(init_attribs.custom, params->shader_params->vector);
 	}
 
 	mat4 mat_texture;
@@ -1131,9 +1124,7 @@ static double _text_ucs4_draw(Font *font, const uint32_t *ucs4text, const TextPa
 			Sprite *spr = &glyph->sprite;
 			set_batch_texture(&batch_state_params, spr->tex);
 
-			SpriteInstanceAttribs attribs;
-			attribs.rgba = color;
-			attribs.custom = shader_params;
+			SpriteInstanceAttribs attribs = init_attribs;
 
 			float g_x = x + glyph->metrics.bearing_x + spr->w * 0.5;
 			float g_y = y - glyph->metrics.bearing_y + spr->h * 0.5 - font->metrics.descent;
@@ -1147,8 +1138,7 @@ static double _text_ucs4_draw(Font *font, const uint32_t *ucs4text, const TextPa
 			attribs.texrect = spr->tex_area;
 
 			// NOTE: Glyphs have their sprite w/h unadjusted for scale.
-			attribs.sprite_size.w = spr->w * iscale;
-			attribs.sprite_size.h = spr->h * iscale;
+			f32v2_to_f16v2(attribs.sprite_size, (float[2]) { spr->w * iscale, spr->h * iscale });
 
 			if(params->glyph_callback.func != NULL) {
 				params->glyph_callback.func(font, uchar, &attribs, params->glyph_callback.userdata);
diff --git a/src/stagedraw.c b/src/stagedraw.c
index 19aea1d3f8..1d6d872d37 100644
--- a/src/stagedraw.c
+++ b/src/stagedraw.c
@@ -1092,7 +1092,7 @@ static int draw_numeric_callback(Font *font, charcode_t charcode, SpriteInstance
 		st->color1 = st->color2;
 	}
 
-	spr_attribs->rgba = *st->color1;
+	f32v4_to_f16v4(spr_attribs->rgba, st->color1->rgba);
 	return 0;
 }