Skip to content

Commit

Permalink
Add a fast path for ASCII strings
Browse files Browse the repository at this point in the history
This optimization is based on a few assumptions:

  - Most strings are ASCII only.
  - Most strings had their coderange scanned already.

If the above is true, then by checking the string coderange, we can
use a much more streamlined function to encode ASCII strings.

Before:

```
== Encoding twitter.json (466906 bytes)
ruby 3.4.0preview2 (2024-10-07 master 32c733f57b) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
                json   140.000 i/100ms
                  oj   230.000 i/100ms
           rapidjson   108.000 i/100ms
Calculating -------------------------------------
                json      1.464k (± 1.4%) i/s  (682.83 μs/i) -      7.420k in   5.067573s
                  oj      2.338k (± 1.5%) i/s  (427.64 μs/i) -     11.730k in   5.017336s
           rapidjson      1.075k (± 1.6%) i/s  (930.40 μs/i) -      5.400k in   5.025469s

Comparison:
                json:     1464.5 i/s
                  oj:     2338.4 i/s - 1.60x  faster
           rapidjson:     1074.8 i/s - 1.36x  slower

```

After:

```
== Encoding twitter.json (466906 bytes)
ruby 3.4.0preview2 (2024-10-07 master 32c733f57b) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
                json   189.000 i/100ms
                  oj   228.000 i/100ms
           rapidjson   108.000 i/100ms
Calculating -------------------------------------
                json      1.903k (± 1.2%) i/s  (525.55 μs/i) -      9.639k in   5.066521s
                  oj      2.306k (± 1.3%) i/s  (433.71 μs/i) -     11.628k in   5.044096s
           rapidjson      1.069k (± 2.4%) i/s  (935.38 μs/i) -      5.400k in   5.053794s

Comparison:
                json:     1902.8 i/s
                  oj:     2305.7 i/s - 1.21x  faster
           rapidjson:     1069.1 i/s - 1.78x  slower
```
  • Loading branch information
byroot committed Oct 17, 2024
1 parent 92a2d54 commit 2aefa41
Show file tree
Hide file tree
Showing 2 changed files with 185 additions and 31 deletions.
215 changes: 185 additions & 30 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,13 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe)
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe)
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

const char *in_utf8_str = RSTRING_PTR(in_string);
unsigned long in_utf8_len = RSTRING_LEN(in_string);
bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string);

unsigned long beg = 0, pos;

Expand All @@ -42,46 +41,183 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_
bool should_escape;

/* UTF-8 decoding */
if (in_is_ascii_only) {
ch = in_utf8_str[pos];
ch_len = 1;
} else {
short i;
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
else
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
if ((pos+ch_len) > in_utf8_len)
rb_raise(rb_path2class("JSON::GeneratorError"),
"partial character in source, but hit end");
for (i = 1; i < ch_len; i++) {
if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
short i;
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
else {
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
}

for (i = 1; i < ch_len; i++) {
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
}

/* JSON policy */
should_escape =
(ch < 0x20) ||
(ch == '"') ||
(ch == '\\') ||
(out_script_safe && (ch == '/')) ||
(out_script_safe && (ch == 0x2028)) ||
(out_script_safe && (ch == 0x2029));

/* JSON encoding */
if (should_escape) {
if (pos > beg) {
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
}

beg = pos + ch_len;
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default:
if (ch <= 0xFFFF) {
scratch[2] = hexdig[ch >> 12];
scratch[3] = hexdig[(ch >> 8) & 0xf];
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
} else {
uint16_t hi, lo;
ch -= 0x10000;
hi = 0xD800 + (uint16_t)(ch >> 10);
lo = 0xDC00 + (uint16_t)(ch & 0x3FF);

scratch[2] = hexdig[hi >> 12];
scratch[3] = hexdig[(hi >> 8) & 0xf];
scratch[4] = hexdig[(hi >> 4) & 0xf];
scratch[5] = hexdig[hi & 0xf];

scratch[8] = hexdig[lo >> 12];
scratch[9] = hexdig[(lo >> 8) & 0xf];
scratch[10] = hexdig[(lo >> 4) & 0xf];
scratch[11] = hexdig[lo & 0xf];

fbuffer_append(out_buffer, scratch, 12);
}
}
if (ch > 0x10FFFF)
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}

pos += ch_len;
}

if (beg < in_utf8_len) {
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
}

RB_GC_GUARD(in_string);
}

static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, bool out_script_safe)
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

const char *ptr = RSTRING_PTR(str);
unsigned long len = RSTRING_LEN(str);

unsigned long beg = 0, pos;

for (pos = 0; pos < len;) {
unsigned char ch = ptr[pos];
bool should_escape;

/* JSON policy */
should_escape =
(ch < 0x20) ||
(ch == '"') ||
(ch == '\\') ||
(out_ascii_only && (ch > 0x7F)) ||
(out_script_safe && (ch == '/'));

/* JSON encoding */
if (should_escape) {
if (pos > beg) {
fbuffer_append(out_buffer, &ptr[beg], pos - beg);
}

beg = pos + 1;
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default:
scratch[2] = hexdig[ch >> 12];
scratch[3] = hexdig[(ch >> 8) & 0xf];
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
}
}

pos++;
}

if (beg < len) {
fbuffer_append(out_buffer, &ptr[beg], len - beg);
}

RB_GC_GUARD(str);
}

static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe)
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

const char *in_utf8_str = RSTRING_PTR(in_string);
unsigned long in_utf8_len = RSTRING_LEN(in_string);

unsigned long beg = 0, pos;

for (pos = 0; pos < in_utf8_len;) {
uint32_t ch;
short ch_len;
bool should_escape;

/* UTF-8 decoding */
short i;
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
else {
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
}

for (i = 1; i < ch_len; i++) {
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
}

/* JSON policy */
should_escape =
(ch < 0x20) ||
(ch == '"') ||
(ch == '\\') ||
(ch > 0x7F) ||
(out_script_safe && (ch == '/')) ||
(out_script_safe && (ch == 0x2028)) ||
(out_script_safe && (ch == 0x2029));

/* JSON encoding */
if (should_escape) {
if (pos > beg)
if (pos > beg) {
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
}

beg = pos + ch_len;
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
Expand Down Expand Up @@ -122,8 +258,11 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_

pos += ch_len;
}
if (beg < in_utf8_len)

if (beg < in_utf8_len) {
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
}

RB_GC_GUARD(in_string);
}

Expand Down Expand Up @@ -570,11 +709,27 @@ static int enc_utf8_compatible_p(int enc_idx)

static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_State *state, VALUE obj)
{
fbuffer_append_char(buffer, '"');
if (!enc_utf8_compatible_p(RB_ENCODING_GET(obj))) {
obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
}
convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe);

fbuffer_append_char(buffer, '"');

switch(rb_enc_str_coderange(obj)) {
case ENC_CODERANGE_7BIT:
convert_ASCII_to_JSON(buffer, obj, state->script_safe);
break;
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(state->ascii_only)) {
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe);
} else {
convert_UTF8_to_JSON(buffer, obj, state->script_safe);
}
break;
default:
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
break;
}
fbuffer_append_char(buffer, '"');
}

Expand Down
1 change: 0 additions & 1 deletion ext/json/ext/generator/generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ typedef unsigned char _Bool;
#endif
#endif

static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe);
static char *fstrndup(const char *ptr, unsigned long len);

/* ruby api and some helpers */
Expand Down

0 comments on commit 2aefa41

Please sign in to comment.