Skip to content

Commit

Permalink
Improve libunicode and libregexp headers (bellard#288)
Browse files Browse the repository at this point in the history
- move all `lre_xxx` functions to libunicode
- use flags table `lre_ctype_bits` instead of bitmaps
- simplify `lre_is_space`, `lre_js_is_ident_first` and `lre_js_is_ident_next`
- simplify `simple_next_token`, handle UTF-8 correctly
- simplify `is_let`, remove dead code
  • Loading branch information
chqrlie authored May 5, 2024
1 parent 1402478 commit 7a2c6f4
Show file tree
Hide file tree
Showing 6 changed files with 243 additions and 134 deletions.
29 changes: 2 additions & 27 deletions libregexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#include "cutils.h"
#include "libregexp.h"
#include "libunicode.h"

/*
TODO:
Expand Down Expand Up @@ -141,32 +142,6 @@ static const uint16_t char_range_s[] = {
0xFEFF, 0xFEFF + 1,
};

BOOL lre_is_space(int c)
{
int i, n, low, high;
n = (countof(char_range_s) - 1) / 2;
for(i = 0; i < n; i++) {
low = char_range_s[2 * i + 1];
if (c < low)
return FALSE;
high = char_range_s[2 * i + 2];
if (c < high)
return TRUE;
}
return FALSE;
}

uint32_t const lre_id_start_table_ascii[4] = {
/* $ A-Z _ a-z */
0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
};

uint32_t const lre_id_continue_table_ascii[4] = {
/* $ 0-9 A-Z _ a-z */
0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE
};


static const uint16_t char_range_w[] = {
4,
0x0030, 0x0039 + 1,
Expand All @@ -186,7 +161,7 @@ typedef enum {
CHAR_RANGE_W,
} CharRangeEnum;

static const uint16_t *char_range_table[] = {
static const uint16_t * const char_range_table[] = {
char_range_d,
char_range_s,
char_range_w,
Expand Down
43 changes: 3 additions & 40 deletions libregexp.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@
#define LIBREGEXP_H

#include <stddef.h>

#include "libunicode.h"

#define LRE_BOOL int /* for documentation purposes */
#include <stdint.h>

#define LRE_FLAG_GLOBAL (1 << 0)
#define LRE_FLAG_IGNORECASE (1 << 1)
Expand All @@ -50,43 +47,9 @@ int lre_exec(uint8_t **capture,
int cbuf_type, void *opaque);

int lre_parse_escape(const uint8_t **pp, int allow_utf16);
LRE_BOOL lre_is_space(int c);

/* must be provided by the user */
LRE_BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size);
/* must be provided by the user, return non zero if overflow */
int lre_check_stack_overflow(void *opaque, size_t alloca_size);
void *lre_realloc(void *opaque, void *ptr, size_t size);

/* JS identifier test */
extern uint32_t const lre_id_start_table_ascii[4];
extern uint32_t const lre_id_continue_table_ascii[4];

static inline int lre_js_is_ident_first(int c)
{
if ((uint32_t)c < 128) {
return (lre_id_start_table_ascii[c >> 5] >> (c & 31)) & 1;
} else {
#ifdef CONFIG_ALL_UNICODE
return lre_is_id_start(c);
#else
return !lre_is_space(c);
#endif
}
}

static inline int lre_js_is_ident_next(int c)
{
if ((uint32_t)c < 128) {
return (lre_id_continue_table_ascii[c >> 5] >> (c & 31)) & 1;
} else {
/* ZWNJ and ZWJ are accepted in identifiers */
#ifdef CONFIG_ALL_UNICODE
return lre_is_id_continue(c) || c == 0x200C || c == 0x200D;
#else
return !lre_is_space(c) || c == 0x200C || c == 0x200D;
#endif
}
}

#undef LRE_BOOL

#endif /* LIBREGEXP_H */
94 changes: 94 additions & 0 deletions libunicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1814,3 +1814,97 @@ int unicode_prop(CharRange *cr, const char *prop_name)
}

#endif /* CONFIG_ALL_UNICODE */

/*---- lre codepoint categorizing functions ----*/

#define S UNICODE_C_SPACE
#define D UNICODE_C_DIGIT
#define X UNICODE_C_XDIGIT
#define U UNICODE_C_UPPER
#define L UNICODE_C_LOWER
#define _ UNICODE_C_UNDER
#define d UNICODE_C_DOLLAR

uint8_t const lre_ctype_bits[256] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, S, S, S, S, S, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,

S, 0, 0, 0, d, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
X|D, X|D, 0, 0, 0, 0, 0, 0,

0, X|U, X|U, X|U, X|U, X|U, X|U, U,
U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U,
U, U, U, 0, 0, 0, 0, _,

0, X|L, X|L, X|L, X|L, X|L, X|L, L,
L, L, L, L, L, L, L, L,
L, L, L, L, L, L, L, L,
L, L, L, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,

S, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
};

#undef S
#undef D
#undef X
#undef U
#undef L
#undef _
#undef d

/* code point ranges for Zs,Zl or Zp property */
static const uint16_t char_range_s[] = {
10,
0x0009, 0x000D + 1,
0x0020, 0x0020 + 1,
0x00A0, 0x00A0 + 1,
0x1680, 0x1680 + 1,
0x2000, 0x200A + 1,
/* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
/* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
0x2028, 0x2029 + 1,
0x202F, 0x202F + 1,
0x205F, 0x205F + 1,
0x3000, 0x3000 + 1,
/* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
0xFEFF, 0xFEFF + 1,
};

BOOL lre_is_space_non_ascii(uint32_t c)
{
size_t i, n;

n = countof(char_range_s);
for(i = 5; i < n; i += 2) {
uint32_t low = char_range_s[i];
uint32_t high = char_range_s[i + 1];
if (c < low)
return FALSE;
if (c < high)
return TRUE;
}
return FALSE;
}
103 changes: 79 additions & 24 deletions libunicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,13 @@
#ifndef LIBUNICODE_H
#define LIBUNICODE_H

#include <inttypes.h>

#define LRE_BOOL int /* for documentation purposes */
#include <stdint.h>

/* define it to include all the unicode tables (40KB larger) */
#define CONFIG_ALL_UNICODE

#define LRE_CC_RES_LEN_MAX 3

typedef enum {
UNICODE_NFC,
UNICODE_NFD,
UNICODE_NFKC,
UNICODE_NFKD,
} UnicodeNormalizationEnum;

int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
int lre_canonicalize(uint32_t c, LRE_BOOL is_unicode);
LRE_BOOL lre_is_cased(uint32_t c);
LRE_BOOL lre_is_case_ignorable(uint32_t c);

/* char ranges */

typedef struct {
Expand Down Expand Up @@ -102,26 +88,95 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,

int cr_invert(CharRange *cr);

int cr_regexp_canonicalize(CharRange *cr, LRE_BOOL is_unicode);

#ifdef CONFIG_ALL_UNICODE
int cr_regexp_canonicalize(CharRange *cr, int is_unicode);

LRE_BOOL lre_is_id_start(uint32_t c);
LRE_BOOL lre_is_id_continue(uint32_t c);
typedef enum {
UNICODE_NFC,
UNICODE_NFD,
UNICODE_NFKC,
UNICODE_NFKD,
} UnicodeNormalizationEnum;

int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
UnicodeNormalizationEnum n_type,
void *opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));

/* Unicode character range functions */

int unicode_script(CharRange *cr,
const char *script_name, LRE_BOOL is_ext);
int unicode_script(CharRange *cr, const char *script_name, int is_ext);
int unicode_general_category(CharRange *cr, const char *gc_name);
int unicode_prop(CharRange *cr, const char *prop_name);

#endif /* CONFIG_ALL_UNICODE */
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
int lre_canonicalize(uint32_t c, int is_unicode);

/* Code point type categories */
enum {
UNICODE_C_SPACE = (1 << 0),
UNICODE_C_DIGIT = (1 << 1),
UNICODE_C_UPPER = (1 << 2),
UNICODE_C_LOWER = (1 << 3),
UNICODE_C_UNDER = (1 << 4),
UNICODE_C_DOLLAR = (1 << 5),
UNICODE_C_XDIGIT = (1 << 6),
};
extern uint8_t const lre_ctype_bits[256];

/* zero or non-zero return value */
int lre_is_cased(uint32_t c);
int lre_is_case_ignorable(uint32_t c);
int lre_is_id_start(uint32_t c);
int lre_is_id_continue(uint32_t c);

static inline int lre_is_space_byte(uint8_t c) {
return lre_ctype_bits[c] & UNICODE_C_SPACE;
}

static inline int lre_is_id_start_byte(uint8_t c) {
return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
UNICODE_C_UNDER | UNICODE_C_DOLLAR);
}

#undef LRE_BOOL
static inline int lre_is_id_continue_byte(uint8_t c) {
return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
UNICODE_C_UNDER | UNICODE_C_DOLLAR |
UNICODE_C_DIGIT);
}

int lre_is_space_non_ascii(uint32_t c);

static inline int lre_is_space(uint32_t c) {
if (c < 256)
return lre_is_space_byte(c);
else
return lre_is_space_non_ascii(c);
}

static inline int lre_js_is_ident_first(uint32_t c) {
if (c < 128) {
return lre_is_id_start_byte(c);
} else {
#ifdef CONFIG_ALL_UNICODE
return lre_is_id_start(c);
#else
return !lre_is_space_non_ascii(c);
#endif
}
}

static inline int lre_js_is_ident_next(uint32_t c) {
if (c < 128) {
return lre_is_id_continue_byte(c);
} else {
/* ZWNJ and ZWJ are accepted in identifiers */
if (c >= 0x200C && c <= 0x200D)
return TRUE;
#ifdef CONFIG_ALL_UNICODE
return lre_is_id_continue(c);
#else
return !lre_is_space_non_ascii(c);
#endif
}
}

#endif /* LIBUNICODE_H */
Loading

0 comments on commit 7a2c6f4

Please sign in to comment.