Improve libunicode and libregexp headers (bellard#288)

- move all `lre_xxx` functions to libunicode
- use flags table `lre_ctype_bits` instead of bitmaps
- simplify `lre_is_space`, `lre_js_is_ident_first` and `lre_js_is_ident_next`
- simplify `simple_next_token`, handle UTF-8 correctly
- simplify `is_let`, remove dead code
chqrlie · May 5, 2024 · 7a2c6f4 · 7a2c6f4
1 parent 1402478
commit 7a2c6f4
Show file tree

Hide file tree

Showing 6 changed files with 243 additions and 134 deletions.
diff --git a/libregexp.c b/libregexp.c
@@ -30,6 +30,7 @@
 
 #include "cutils.h"
 #include "libregexp.h"
+#include "libunicode.h"
 
 /*
   TODO:
@@ -141,32 +142,6 @@ static const uint16_t char_range_s[] = {
     0xFEFF, 0xFEFF + 1,
 };
 
-BOOL lre_is_space(int c)
-{
-    int i, n, low, high;
-    n = (countof(char_range_s) - 1) / 2;
-    for(i = 0; i < n; i++) {
-        low = char_range_s[2 * i + 1];
-        if (c < low)
-            return FALSE;
-        high = char_range_s[2 * i + 2];
-        if (c < high)
-            return TRUE;
-    }
-    return FALSE;
-}
-
-uint32_t const lre_id_start_table_ascii[4] = {
-    /* $ A-Z _ a-z */
-    0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
-};
-
-uint32_t const lre_id_continue_table_ascii[4] = {
-    /* $ 0-9 A-Z _ a-z */
-    0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE
-};
-
-
 static const uint16_t char_range_w[] = {
     4,
     0x0030, 0x0039 + 1,
@@ -186,7 +161,7 @@ typedef enum {
     CHAR_RANGE_W,
 } CharRangeEnum;
 
-static const uint16_t *char_range_table[] = {
+static const uint16_t * const char_range_table[] = {
     char_range_d,
     char_range_s,
     char_range_w,

diff --git a/libregexp.h b/libregexp.h
@@ -25,10 +25,7 @@
 #define LIBREGEXP_H
 
 #include <stddef.h>
-
-#include "libunicode.h"
-
-#define LRE_BOOL  int       /* for documentation purposes */
+#include <stdint.h>
 
 #define LRE_FLAG_GLOBAL     (1 << 0)
 #define LRE_FLAG_IGNORECASE (1 << 1)
@@ -50,43 +47,9 @@ int lre_exec(uint8_t **capture,
              int cbuf_type, void *opaque);
 
 int lre_parse_escape(const uint8_t **pp, int allow_utf16);
-LRE_BOOL lre_is_space(int c);
 
-/* must be provided by the user */
-LRE_BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size);
+/* must be provided by the user, return non zero if overflow */
+int lre_check_stack_overflow(void *opaque, size_t alloca_size);
 void *lre_realloc(void *opaque, void *ptr, size_t size);
 
-/* JS identifier test */
-extern uint32_t const lre_id_start_table_ascii[4];
-extern uint32_t const lre_id_continue_table_ascii[4];
-
-static inline int lre_js_is_ident_first(int c)
-{
-    if ((uint32_t)c < 128) {
-        return (lre_id_start_table_ascii[c >> 5] >> (c & 31)) & 1;
-    } else {
-#ifdef CONFIG_ALL_UNICODE
-        return lre_is_id_start(c);
-#else
-        return !lre_is_space(c);
-#endif
-    }
-}
-
-static inline int lre_js_is_ident_next(int c)
-{
-    if ((uint32_t)c < 128) {
-        return (lre_id_continue_table_ascii[c >> 5] >> (c & 31)) & 1;
-    } else {
-        /* ZWNJ and ZWJ are accepted in identifiers */
-#ifdef CONFIG_ALL_UNICODE
-        return lre_is_id_continue(c) || c == 0x200C || c == 0x200D;
-#else
-        return !lre_is_space(c) || c == 0x200C || c == 0x200D;
-#endif
-    }
-}
-
-#undef LRE_BOOL
-
 #endif /* LIBREGEXP_H */
diff --git a/libunicode.c b/libunicode.c
@@ -1814,3 +1814,97 @@ int unicode_prop(CharRange *cr, const char *prop_name)
 }
 
 #endif /* CONFIG_ALL_UNICODE */
+
+/*---- lre codepoint categorizing functions ----*/
+
+#define S  UNICODE_C_SPACE
+#define D  UNICODE_C_DIGIT
+#define X  UNICODE_C_XDIGIT
+#define U  UNICODE_C_UPPER
+#define L  UNICODE_C_LOWER
+#define _  UNICODE_C_UNDER
+#define d  UNICODE_C_DOLLAR
+
+uint8_t const lre_ctype_bits[256] = {
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, S, S, S, S, S, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    S, 0, 0, 0, d, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
+    X|D, X|D, 0, 0, 0, 0, 0, 0,
+
+    0, X|U, X|U, X|U, X|U, X|U, X|U, U,
+    U, U, U, U, U, U, U, U,
+    U, U, U, U, U, U, U, U,
+    U, U, U, 0, 0, 0, 0, _,
+
+    0, X|L, X|L, X|L, X|L, X|L, X|L, L,
+    L, L, L, L, L, L, L, L,
+    L, L, L, L, L, L, L, L,
+    L, L, L, 0, 0, 0, 0, 0,
+
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    S, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+#undef S
+#undef D
+#undef X
+#undef U
+#undef L
+#undef _
+#undef d
+
+/* code point ranges for JS white space: Zs, Zl, Zp, plus TAB..CR and U+FEFF */
+static const uint16_t char_range_s[] = {
+    10,
+    0x0009, 0x000D + 1,
+    0x0020, 0x0020 + 1,
+    0x00A0, 0x00A0 + 1,
+    0x1680, 0x1680 + 1,
+    0x2000, 0x200A + 1,
+    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
+    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
+    0x2028, 0x2029 + 1,
+    0x202F, 0x202F + 1,
+    0x205F, 0x205F + 1,
+    0x3000, 0x3000 + 1,
+    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
+    0xFEFF, 0xFEFF + 1,
+};
+
+BOOL lre_is_space_non_ascii(uint32_t c)
+{
+    size_t i, n;
+
+    n = countof(char_range_s);
+    for(i = 5; i < n; i += 2) {
+        uint32_t low = char_range_s[i];
+        uint32_t high = char_range_s[i + 1];
+        if (c < low)
+            return FALSE;
+        if (c < high)
+            return TRUE;
+    }
+    return FALSE;
+}
diff --git a/libunicode.h b/libunicode.h
@@ -24,27 +24,13 @@
 #ifndef LIBUNICODE_H
 #define LIBUNICODE_H
 
-#include <inttypes.h>
-
-#define LRE_BOOL  int       /* for documentation purposes */
+#include <stdint.h>
 
 /* define it to include all the unicode tables (40KB larger) */
 #define CONFIG_ALL_UNICODE
 
 #define LRE_CC_RES_LEN_MAX 3
 
-typedef enum {
-    UNICODE_NFC,
-    UNICODE_NFD,
-    UNICODE_NFKC,
-    UNICODE_NFKD,
-} UnicodeNormalizationEnum;
-
-int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
-int lre_canonicalize(uint32_t c, LRE_BOOL is_unicode);
-LRE_BOOL lre_is_cased(uint32_t c);
-LRE_BOOL lre_is_case_ignorable(uint32_t c);
-
 /* char ranges */
 
 typedef struct {
@@ -102,26 +88,95 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
 
 int cr_invert(CharRange *cr);
 
-int cr_regexp_canonicalize(CharRange *cr, LRE_BOOL is_unicode);
-
-#ifdef CONFIG_ALL_UNICODE
+int cr_regexp_canonicalize(CharRange *cr, int is_unicode);
 
-LRE_BOOL lre_is_id_start(uint32_t c);
-LRE_BOOL lre_is_id_continue(uint32_t c);
+typedef enum {
+    UNICODE_NFC,
+    UNICODE_NFD,
+    UNICODE_NFKC,
+    UNICODE_NFKD,
+} UnicodeNormalizationEnum;
 
 int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
                       UnicodeNormalizationEnum n_type,
                       void *opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
 
 /* Unicode character range functions */
 
-int unicode_script(CharRange *cr,
-                   const char *script_name, LRE_BOOL is_ext);
+int unicode_script(CharRange *cr, const char *script_name, int is_ext);
 int unicode_general_category(CharRange *cr, const char *gc_name);
 int unicode_prop(CharRange *cr, const char *prop_name);
 
-#endif /* CONFIG_ALL_UNICODE */
+int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
+int lre_canonicalize(uint32_t c, int is_unicode);
+
+/* Code point type categories */
+enum {
+    UNICODE_C_SPACE  = (1 << 0),
+    UNICODE_C_DIGIT  = (1 << 1),
+    UNICODE_C_UPPER  = (1 << 2),
+    UNICODE_C_LOWER  = (1 << 3),
+    UNICODE_C_UNDER  = (1 << 4),
+    UNICODE_C_DOLLAR = (1 << 5),
+    UNICODE_C_XDIGIT = (1 << 6),
+};
+extern uint8_t const lre_ctype_bits[256];
+
+/* zero or non-zero return value */
+int lre_is_cased(uint32_t c);
+int lre_is_case_ignorable(uint32_t c);
+int lre_is_id_start(uint32_t c);
+int lre_is_id_continue(uint32_t c);
+
+static inline int lre_is_space_byte(uint8_t c) {
+    return lre_ctype_bits[c] & UNICODE_C_SPACE;
+}
+
+static inline int lre_is_id_start_byte(uint8_t c) {
+    return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
+                                UNICODE_C_UNDER | UNICODE_C_DOLLAR);
+}
 
-#undef LRE_BOOL
+static inline int lre_is_id_continue_byte(uint8_t c) {
+    return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
+                                UNICODE_C_UNDER | UNICODE_C_DOLLAR |
+                                UNICODE_C_DIGIT);
+}
+
+int lre_is_space_non_ascii(uint32_t c);
+
+static inline int lre_is_space(uint32_t c) {
+    if (c < 256)
+        return lre_is_space_byte(c);
+    else
+        return lre_is_space_non_ascii(c);
+}
+
+static inline int lre_js_is_ident_first(uint32_t c) {
+    if (c < 128) {
+        return lre_is_id_start_byte(c);
+    } else {
+#ifdef CONFIG_ALL_UNICODE
+        return lre_is_id_start(c);
+#else
+        return !lre_is_space_non_ascii(c);
+#endif
+    }
+}
+
+static inline int lre_js_is_ident_next(uint32_t c) {
+    if (c < 128) {
+        return lre_is_id_continue_byte(c);
+    } else {
+        /* ZWNJ and ZWJ are accepted in identifiers */
+        if (c >= 0x200C && c <= 0x200D)
+            return TRUE;
+#ifdef CONFIG_ALL_UNICODE
+        return lre_is_id_continue(c);
+#else
+        return !lre_is_space_non_ascii(c);
+#endif
+    }
+}
 
 #endif /* LIBUNICODE_H */