My working unpac space for OCaml projects in development
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

regexp: removed alloca() in lre_exec() - added specific opcodes for \s and \S to have a smaller bytecode - optimized \b and \B

+94 -69
+2
vendor/git/quickjs-c/libregexp-opcode.h
··· 31 31 DEF(char32_i, 5) 32 32 DEF(dot, 1) 33 33 DEF(any, 1) /* same as dot but match any character including line terminator */ 34 + DEF(space, 1) 35 + DEF(not_space, 1) /* must come after */ 34 36 DEF(line_start, 1) 35 37 DEF(line_start_m, 1) 36 38 DEF(line_end, 1)
+76 -60
vendor/git/quickjs-c/libregexp.c
··· 34 34 35 35 /* 36 36 TODO: 37 - 37 + - remove REOP_char_i and REOP_range_i by precomputing the case folding. 38 + - add specific opcodes for simple unicode property tests so that the 39 + generated bytecode is smaller. 38 40 - Add a lock step execution mode (=linear time execution guaranteed) 39 41 when the regular expression is "simple" i.e. no backreference nor 40 42 complicated lookahead. The opcodes are designed for this execution ··· 1078 1080 goto default_escape; 1079 1081 if (cr_init_char_range(s, cr, c)) 1080 1082 return -1; 1081 - c = CLASS_RANGE_BASE; 1083 + c += CLASS_RANGE_BASE; 1082 1084 break; 1083 1085 case 'c': 1084 1086 c = *p; ··· 1584 1586 case REOP_char32_i: 1585 1587 case REOP_dot: 1586 1588 case REOP_any: 1589 + case REOP_space: 1590 + case REOP_not_space: 1587 1591 need_check_adv = FALSE; 1588 1592 break; 1589 1593 case REOP_line_start: ··· 2028 2032 case 'b': 2029 2033 case 'B': 2030 2034 if (p[1] != 'b') { 2031 - re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary); 2035 + re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_not_word_boundary_i : REOP_not_word_boundary); 2032 2036 } else { 2033 - re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary); 2037 + re_emit_op(s, s->ignore_case && s->is_unicode ? 
REOP_word_boundary_i : REOP_word_boundary); 2034 2038 } 2035 2039 p += 2; 2036 2040 break; ··· 2167 2171 if (is_backward_dir) 2168 2172 re_emit_op(s, REOP_prev); 2169 2173 if (c >= CLASS_RANGE_BASE) { 2170 - int ret; 2171 - ret = re_emit_string_list(s, cr); 2174 + int ret = 0; 2175 + /* optimize the common 'space' tests */ 2176 + if (c == (CLASS_RANGE_BASE + CHAR_RANGE_s)) { 2177 + re_emit_op(s, REOP_space); 2178 + } else if (c == (CLASS_RANGE_BASE + CHAR_RANGE_S)) { 2179 + re_emit_op(s, REOP_not_space); 2180 + } else { 2181 + ret = re_emit_string_list(s, cr); 2182 + } 2172 2183 re_string_list_free(cr); 2173 2184 if (ret) 2174 2185 return -1; ··· 2607 2618 return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS); 2608 2619 } 2609 2620 2610 - static BOOL is_word_char(uint32_t c) 2611 - { 2612 - return ((c >= '0' && c <= '9') || 2613 - (c >= 'a' && c <= 'z') || 2614 - (c >= 'A' && c <= 'Z') || 2615 - (c == '_')); 2616 - } 2617 - 2618 2621 #define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \ 2619 2622 do { \ 2620 2623 if (cbuf_type == 0) { \ ··· 2769 2772 2770 2773 /* return 1 if match, 0 if not match or < 0 if error. 
*/ 2771 2774 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, 2772 - uint8_t **regs, const uint8_t *pc, const uint8_t *cptr) 2775 + const uint8_t *pc, const uint8_t *cptr) 2773 2776 { 2774 2777 int opcode; 2775 2778 int cbuf_type; ··· 2809 2812 } 2810 2813 2811 2814 /* avoid saving the previous value if already saved */ 2812 - #define SAVE_REG(idx, value) \ 2815 + #define SAVE_CAPTURE_CHECK(idx, value) \ 2813 2816 { \ 2814 2817 StackElem *sp1; \ 2815 2818 sp1 = sp; \ 2816 2819 for(;;) { \ 2817 2820 if (sp1 > bp) { \ 2818 - if (sp1[-2].val == -(int)(idx + 1)) \ 2821 + if (sp1[-2].val == idx) \ 2819 2822 break; \ 2820 2823 sp1 -= 2; \ 2821 2824 } else { \ 2822 2825 CHECK_STACK_SPACE(2); \ 2823 - sp[0].val = -(int)(idx + 1); \ 2824 - sp[1].ptr = regs[idx]; \ 2826 + sp[0].val = idx; \ 2827 + sp[1].ptr = capture[idx]; \ 2825 2828 sp += 2; \ 2826 2829 break; \ 2827 2830 } \ 2828 2831 } \ 2829 - regs[idx] = (value); \ 2832 + capture[idx] = (value); \ 2830 2833 } 2831 2834 2832 2835 ··· 2851 2854 REExecStateEnum type; 2852 2855 if (bp == s->stack_buf) 2853 2856 return 0; 2854 - /* undo the modifications to capture[] and regs[] */ 2857 + /* undo the modifications to capture[] */ 2855 2858 while (sp > bp) { 2856 - intptr_t idx2 = sp[-2].val; 2857 - if (idx2 >= 0) 2858 - capture[idx2] = sp[-1].ptr; 2859 - else 2860 - regs[-idx2 - 1] = sp[-1].ptr; 2859 + capture[sp[-2].val] = sp[-1].ptr; 2861 2860 sp -= 2; 2862 2861 } 2863 2862 ··· 2910 2909 for(;;) { 2911 2910 REExecStateEnum type; 2912 2911 type = bp[-1].bp.type; 2913 - /* undo the modifications to capture[] and regs[] */ 2912 + /* undo the modifications to capture[] */ 2914 2913 while (sp > bp) { 2915 - intptr_t idx2 = sp[-2].val; 2916 - if (idx2 >= 0) 2917 - capture[idx2] = sp[-1].ptr; 2918 - else 2919 - regs[-idx2 - 1] = sp[-1].ptr; 2914 + capture[sp[-2].val] = sp[-1].ptr; 2920 2915 sp -= 2; 2921 2916 } 2922 2917 pc = sp[-3].ptr; ··· 3019 3014 goto no_match; 3020 3015 GET_CHAR(c, cptr, cbuf_end, 
cbuf_type); 3021 3016 break; 3017 + case REOP_space: 3018 + if (cptr == cbuf_end) 3019 + goto no_match; 3020 + GET_CHAR(c, cptr, cbuf_end, cbuf_type); 3021 + if (!lre_is_space(c)) 3022 + goto no_match; 3023 + break; 3024 + case REOP_not_space: 3025 + if (cptr == cbuf_end) 3026 + goto no_match; 3027 + GET_CHAR(c, cptr, cbuf_end, cbuf_type); 3028 + if (lre_is_space(c)) 3029 + goto no_match; 3030 + break; 3022 3031 case REOP_save_start: 3023 3032 case REOP_save_end: 3024 3033 val = *pc++; ··· 3044 3053 } 3045 3054 break; 3046 3055 case REOP_set_i32: 3047 - idx = pc[0]; 3056 + idx = 2 * s->capture_count + pc[0]; 3048 3057 val = get_u32(pc + 1); 3049 3058 pc += 5; 3050 - SAVE_REG(idx, (void *)(uintptr_t)val); 3059 + SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val); 3051 3060 break; 3052 3061 case REOP_loop: 3053 3062 { 3054 3063 uint32_t val2; 3055 - idx = pc[0]; 3064 + idx = 2 * s->capture_count + pc[0]; 3056 3065 val = get_u32(pc + 1); 3057 3066 pc += 5; 3058 3067 3059 - val2 = (uintptr_t)regs[idx] - 1; 3060 - SAVE_REG(idx, (void *)(uintptr_t)val2); 3068 + val2 = (uintptr_t)capture[idx] - 1; 3069 + SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2); 3061 3070 if (val2 != 0) { 3062 3071 pc += (int)val; 3063 3072 if (lre_poll_timeout(s)) ··· 3072 3081 { 3073 3082 const uint8_t *pc1; 3074 3083 uint32_t val2, limit; 3075 - idx = pc[0]; 3084 + idx = 2 * s->capture_count + pc[0]; 3076 3085 limit = get_u32(pc + 1); 3077 3086 val = get_u32(pc + 5); 3078 3087 pc += 9; 3079 3088 3080 3089 /* decrement the counter */ 3081 - val2 = (uintptr_t)regs[idx] - 1; 3082 - SAVE_REG(idx, (void *)(uintptr_t)val2); 3090 + val2 = (uintptr_t)capture[idx] - 1; 3091 + SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2); 3083 3092 3084 3093 if (val2 > limit) { 3085 3094 /* normal loop if counter > limit */ ··· 3090 3099 /* check advance */ 3091 3100 if ((opcode == REOP_loop_check_adv_split_goto_first || 3092 3101 opcode == REOP_loop_check_adv_split_next_first) && 3093 - regs[idx + 1] == cptr && 3102 + 
capture[idx + 1] == cptr && 3094 3103 val2 != limit) { 3095 3104 goto no_match; 3096 3105 } ··· 3116 3125 } 3117 3126 break; 3118 3127 case REOP_set_char_pos: 3119 - idx = pc[0]; 3128 + idx = 2 * s->capture_count + pc[0]; 3120 3129 pc++; 3121 - SAVE_REG(idx, (uint8_t *)cptr); 3130 + SAVE_CAPTURE_CHECK(idx, (uint8_t *)cptr); 3122 3131 break; 3123 3132 case REOP_check_advance: 3124 - idx = pc[0]; 3133 + idx = 2 * s->capture_count + pc[0]; 3125 3134 pc++; 3126 - if (regs[idx] == cptr) 3135 + if (capture[idx] == cptr) 3127 3136 goto no_match; 3128 3137 break; 3129 3138 case REOP_word_boundary: ··· 3139 3148 v1 = FALSE; 3140 3149 } else { 3141 3150 PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); 3142 - if (ignore_case) 3143 - c = lre_canonicalize(c, s->is_unicode); 3144 - v1 = is_word_char(c); 3151 + if (c < 256) { 3152 + v1 = (lre_is_word_byte(c) != 0); 3153 + } else { 3154 + v1 = ignore_case && (c == 0x017f || c == 0x212a); 3155 + } 3145 3156 } 3146 3157 /* current char */ 3147 3158 if (cptr >= cbuf_end) { 3148 3159 v2 = FALSE; 3149 3160 } else { 3150 3161 PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); 3151 - if (ignore_case) 3152 - c = lre_canonicalize(c, s->is_unicode); 3153 - v2 = is_word_char(c); 3162 + if (c < 256) { 3163 + v2 = (lre_is_word_byte(c) != 0); 3164 + } else { 3165 + v2 = ignore_case && (c == 0x017f || c == 0x212a); 3166 + } 3154 3167 } 3155 3168 if (v1 ^ v2 ^ is_boundary) 3156 3169 goto no_match; ··· 3315 3328 int cbuf_type, void *opaque) 3316 3329 { 3317 3330 REExecContext s_s, *s = &s_s; 3318 - int re_flags, i, ret, register_count; 3319 - uint8_t **regs; 3331 + int re_flags, i, ret; 3320 3332 const uint8_t *cptr; 3321 3333 3322 3334 re_flags = lre_get_flags(bc_buf); ··· 3335 3347 3336 3348 for(i = 0; i < s->capture_count * 2; i++) 3337 3349 capture[i] = NULL; 3338 - /* XXX: modify the API so that the registers are allocated after 3339 - the captures to suppress some tests */ 3340 - register_count = bc_buf[RE_HEADER_REGISTER_COUNT]; 3341 - regs = 
alloca(register_count * sizeof(regs[0])); 3342 3350 3343 3351 cptr = cbuf + (cindex << cbuf_type); 3344 3352 if (0 < cindex && cindex < clen && s->cbuf_type == 2) { ··· 3348 3356 } 3349 3357 } 3350 3358 3351 - ret = lre_exec_backtrack(s, capture, regs, bc_buf + RE_HEADER_LEN, 3352 - cptr); 3359 + ret = lre_exec_backtrack(s, capture, bc_buf + RE_HEADER_LEN, cptr); 3360 + 3353 3361 if (s->stack_buf != s->static_stack_buf) 3354 3362 lre_realloc(s->opaque, s->stack_buf, 0); 3355 3363 return ret; 3356 3364 } 3357 3365 3366 + int lre_get_alloc_count(const uint8_t *bc_buf) 3367 + { 3368 + return bc_buf[RE_HEADER_CAPTURE_COUNT] * 2 + 3369 + bc_buf[RE_HEADER_REGISTER_COUNT]; 3370 + } 3371 + 3358 3372 int lre_get_capture_count(const uint8_t *bc_buf) 3359 3373 { 3360 3374 return bc_buf[RE_HEADER_CAPTURE_COUNT]; ··· 3393 3407 int len, flags, ret, i; 3394 3408 uint8_t *bc; 3395 3409 char error_msg[64]; 3396 - uint8_t *capture[CAPTURE_COUNT_MAX * 2]; 3410 + uint8_t *capture; 3397 3411 const char *input; 3398 3412 int input_len, capture_count; 3399 3413 ··· 3412 3426 input = argv[3]; 3413 3427 input_len = strlen(input); 3414 3428 3429 + capture = malloc(sizeof(capture[0]) * lre_get_alloc_count(bc)); 3415 3430 ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); 3416 3431 printf("ret=%d\n", ret); 3417 3432 if (ret == 1) { ··· 3427 3442 printf("\n"); 3428 3443 } 3429 3444 } 3445 + free(capture); 3430 3446 return 0; 3431 3447 } 3432 3448 #endif
+1
vendor/git/quickjs-c/libregexp.h
··· 46 46 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, 47 47 const char *buf, size_t buf_len, int re_flags, 48 48 void *opaque); 49 + int lre_get_alloc_count(const uint8_t *bc_buf); 49 50 int lre_get_capture_count(const uint8_t *bc_buf); 50 51 int lre_get_flags(const uint8_t *bc_buf); 51 52 const char *lre_get_groupnames(const uint8_t *bc_buf);
+5
vendor/git/quickjs-c/libunicode.h
··· 147 147 UNICODE_C_DIGIT); 148 148 } 149 149 150 + static inline int lre_is_word_byte(uint8_t c) { 151 + return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | 152 + UNICODE_C_UNDER | UNICODE_C_DIGIT); 153 + } 154 + 150 155 int lre_is_space_non_ascii(uint32_t c); 151 156 152 157 static inline int lre_is_space(uint32_t c) {
+10 -9
vendor/git/quickjs-c/quickjs.c
··· 45487 45487 goto add_tail; 45488 45488 goto done; 45489 45489 } 45490 - q = p; 45491 45490 for (q = p; (q += !r) <= s - r - !r; q = p = e + r) { 45492 45491 e = string_indexof(sp, rp, q); 45493 45492 if (e < 0) ··· 47423 47422 JSValue indices, indices_groups; 47424 47423 uint8_t *re_bytecode; 47425 47424 uint8_t **capture, *str_buf; 47426 - int rc, capture_count, shift, i, re_flags; 47425 + int rc, capture_count, shift, i, re_flags, alloc_count; 47427 47426 int64_t last_index; 47428 47427 const char *group_name_ptr; 47429 47428 JSObject *p_obj; ··· 47453 47452 last_index = 0; 47454 47453 } 47455 47454 str = JS_VALUE_GET_STRING(str_val); 47456 - capture_count = lre_get_capture_count(re_bytecode); 47457 - if (capture_count > 0) { 47458 - capture = js_malloc(ctx, sizeof(capture[0]) * capture_count * 2); 47455 + alloc_count = lre_get_alloc_count(re_bytecode); 47456 + if (alloc_count > 0) { 47457 + capture = js_malloc(ctx, sizeof(capture[0]) * alloc_count); 47459 47458 if (!capture) 47460 47459 goto fail; 47461 47460 } 47461 + capture_count = lre_get_capture_count(re_bytecode); 47462 47462 shift = str->is_wide_char; 47463 47463 str_buf = str->u.str8; 47464 47464 if (last_index > str->len) { ··· 47642 47642 uint8_t *re_bytecode; 47643 47643 int ret; 47644 47644 uint8_t **capture, *str_buf; 47645 - int capture_count, shift, re_flags; 47645 + int capture_count, alloc_count, shift, re_flags; 47646 47646 int next_src_pos, start, end; 47647 47647 int64_t last_index; 47648 47648 StringBuffer b_s, *b = &b_s; ··· 47676 47676 if (js_regexp_get_lastIndex(ctx, &last_index, this_val)) 47677 47677 goto fail; 47678 47678 } 47679 - capture_count = lre_get_capture_count(re_bytecode); 47680 - if (capture_count > 0) { 47681 - capture = js_malloc(ctx, sizeof(capture[0]) * capture_count * 2); 47679 + alloc_count = lre_get_alloc_count(re_bytecode); 47680 + if (alloc_count > 0) { 47681 + capture = js_malloc(ctx, sizeof(capture[0]) * alloc_count); 47682 47682 if (!capture) 47683 47683 goto 
fail; 47684 47684 } 47685 + capture_count = lre_get_capture_count(re_bytecode); 47685 47686 fullUnicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0); 47686 47687 shift = str->is_wide_char; 47687 47688 str_buf = str->u.str8;