My working unpac space for OCaml projects in development
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

regexp: ensure that the bytecode size grows linearly with respect to the size of the input regexp.

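This way, pathological regexps with deeply nested quantifiers, such as
/(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(?:a|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+/, no longer cause the generated bytecode to blow up in size (the quantified atom was previously duplicated in the bytecode at each nesting level). The generated bytecode is also simpler and
faster to execute.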

+111 -41
+4
vendor/git/quickjs-c/libregexp-opcode.h
··· 45 45 DEF(save_end, 2) /* save end position, must come after saved_start */ 46 46 DEF(save_reset, 3) /* reset save positions */ 47 47 DEF(loop, 6) /* decrement the top the stack and goto if != 0 */ 48 + DEF(loop_split_goto_first, 10) 49 + DEF(loop_split_next_first, 10) 50 + DEF(loop_check_adv_split_goto_first, 10) 51 + DEF(loop_check_adv_split_next_first, 10) 48 52 DEF(push_i32, 6) /* push integer on the stack */ 49 53 DEF(word_boundary, 1) 50 54 DEF(word_boundary_i, 1)
+107 -41
vendor/git/quickjs-c/libregexp.c
··· 532 532 val += (pos + 6); 533 533 printf(" %u, %u", val2, val); 534 534 break; 535 + case REOP_loop_split_goto_first: 536 + case REOP_loop_split_next_first: 537 + case REOP_loop_check_adv_split_goto_first: 538 + case REOP_loop_check_adv_split_next_first: 539 + { 540 + uint32_t limit; 541 + val2 = buf[pos + 1]; 542 + limit = get_u32(buf + pos + 2); 543 + val = get_u32(buf + pos + 6); 544 + val += (pos + 10); 545 + printf(" %u, %u, %u", val2, limit, val); 546 + } 547 + break; 535 548 case REOP_save_start: 536 549 case REOP_save_end: 537 550 case REOP_back_reference: ··· 615 628 int pos; 616 629 dbuf_putc(&s->byte_code, op); 617 630 dbuf_putc(&s->byte_code, arg); 631 + pos = s->byte_code.size; 632 + dbuf_put_u32(&s->byte_code, val - (pos + 4)); 633 + return pos; 634 + } 635 + 636 + static int re_emit_goto_u8_u32(REParseState *s, int op, uint32_t arg0, uint32_t arg1, uint32_t val) 637 + { 638 + int pos; 639 + dbuf_putc(&s->byte_code, op); 640 + dbuf_putc(&s->byte_code, arg0); 641 + dbuf_put_u32(&s->byte_code, arg1); 618 642 pos = s->byte_code.size; 619 643 dbuf_put_u32(&s->byte_code, val - (pos + 4)); 620 644 return pos; ··· 2183 2207 if (dbuf_insert(&s->byte_code, last_atom_start, 11 + add_zero_advance_check * 2)) 2184 2208 goto out_of_memory; 2185 2209 pos = last_atom_start; 2210 + s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; 2211 + put_u32(s->byte_code.buf + pos, 6 + add_zero_advance_check * 2 + len + 10); 2212 + pos += 4; 2213 + 2186 2214 s->byte_code.buf[pos++] = REOP_push_i32; 2187 2215 s->byte_code.buf[pos++] = 0; 2188 2216 put_u32(s->byte_code.buf + pos, quant_max); 2189 2217 pos += 4; 2190 - 2191 - s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; 2192 - put_u32(s->byte_code.buf + pos, len + 6 + add_zero_advance_check * 2 * 2); 2193 - pos += 4; 2218 + last_atom_start = pos; 2194 2219 if (add_zero_advance_check) { 2195 2220 s->byte_code.buf[pos++] = REOP_push_char_pos; 2196 2221 s->byte_code.buf[pos++] = 0; 2197 - re_emit_op_u8(s, 
REOP_check_advance, 0); 2198 2222 } 2199 - re_emit_goto_u8(s, REOP_loop, 0, last_atom_start + 6); 2223 + re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max, last_atom_start); 2200 2224 } 2201 2225 } else if (quant_min == 1 && quant_max == INT32_MAX && 2202 2226 !add_zero_advance_check) { 2203 2227 re_emit_goto(s, REOP_split_next_first - greedy, 2204 2228 last_atom_start); 2205 2229 } else { 2206 - if (quant_min == 1) { 2207 - /* nothing to add */ 2208 - } else { 2209 - if (dbuf_insert(&s->byte_code, last_atom_start, 6)) 2210 - goto out_of_memory; 2211 - s->byte_code.buf[last_atom_start++] = REOP_push_i32; 2212 - s->byte_code.buf[last_atom_start++] = 0; 2213 - put_u32(s->byte_code.buf + last_atom_start, quant_min); 2214 - last_atom_start += 4; 2215 - re_emit_goto_u8(s, REOP_loop, 0, last_atom_start); 2230 + if (quant_min == quant_max) 2231 + add_zero_advance_check = FALSE; 2232 + if (dbuf_insert(&s->byte_code, last_atom_start, 6 + add_zero_advance_check * 2)) 2233 + goto out_of_memory; 2234 + /* Note: we assume the string length is < INT32_MAX */ 2235 + pos = last_atom_start; 2236 + s->byte_code.buf[pos++] = REOP_push_i32; 2237 + s->byte_code.buf[pos++] = 0; 2238 + put_u32(s->byte_code.buf + pos, quant_max); 2239 + pos += 4; 2240 + last_atom_start = pos; 2241 + if (add_zero_advance_check) { 2242 + s->byte_code.buf[pos++] = REOP_push_char_pos; 2243 + s->byte_code.buf[pos++] = 0; 2216 2244 } 2217 - if (quant_max == INT32_MAX) { 2218 - pos = s->byte_code.size; 2219 - re_emit_op_u32(s, REOP_split_goto_first + greedy, 2220 - len + 5 + add_zero_advance_check * 2 * 2); 2221 - if (add_zero_advance_check) 2222 - re_emit_op_u8(s, REOP_push_char_pos, 0); 2223 - /* copy the atom */ 2224 - dbuf_put_self(&s->byte_code, last_atom_start, len); 2225 - if (add_zero_advance_check) 2226 - re_emit_op_u8(s, REOP_check_advance, 0); 2227 - re_emit_goto(s, REOP_goto, pos); 2228 - } else if (quant_max > 
quant_min) { 2229 - re_emit_op_u8(s, REOP_push_i32, 0); 2230 - dbuf_put_u32(&s->byte_code, quant_max - quant_min); 2231 - 2232 - pos = s->byte_code.size; 2233 - re_emit_op_u32(s, REOP_split_goto_first + greedy, 2234 - len + 6 + add_zero_advance_check * 2 * 2); 2235 - if (add_zero_advance_check) 2236 - re_emit_op_u8(s, REOP_push_char_pos, 0); 2237 - /* copy the atom */ 2238 - dbuf_put_self(&s->byte_code, last_atom_start, len); 2239 - if (add_zero_advance_check) 2240 - re_emit_op_u8(s, REOP_check_advance, 0); 2241 - re_emit_goto_u8(s, REOP_loop, 0, pos); 2245 + if (quant_min == quant_max) { 2246 + /* a simple loop is enough */ 2247 + re_emit_goto_u8(s, REOP_loop, 0, last_atom_start); 2248 + } else { 2249 + re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max - quant_min, last_atom_start); 2242 2250 } 2243 2251 } 2244 2252 last_atom_start = -1; ··· 2352 2360 break; 2353 2361 case REOP_check_advance: 2354 2362 case REOP_loop: 2363 + case REOP_loop_split_goto_first: 2364 + case REOP_loop_split_next_first: 2355 2365 assert(stack_size > 0); 2356 2366 stack_size--; 2367 + bc_buf[pos + 1] = stack_size; 2368 + break; 2369 + case REOP_loop_check_adv_split_goto_first: 2370 + case REOP_loop_check_adv_split_next_first: 2371 + assert(stack_size >= 2); 2372 + stack_size -= 2; 2357 2373 bc_buf[pos + 1] = stack_size; 2358 2374 break; 2359 2375 case REOP_range: ··· 2953 2969 pc += (int)val; 2954 2970 if (lre_poll_timeout(s)) 2955 2971 return LRE_RET_TIMEOUT; 2972 + } 2973 + } 2974 + break; 2975 + case REOP_loop_split_goto_first: 2976 + case REOP_loop_split_next_first: 2977 + case REOP_loop_check_adv_split_goto_first: 2978 + case REOP_loop_check_adv_split_next_first: 2979 + { 2980 + const uint8_t *pc1; 2981 + uint32_t val2, limit; 2982 + idx = pc[0]; 2983 + limit = get_u32(pc + 1); 2984 + val = get_u32(pc + 5); 2985 + pc += 9; 2986 + 2987 + /* decrement the counter */ 2988 + val2 = 
(uintptr_t)aux_stack[idx] - 1; 2989 + SAVE_AUX_STACK(idx, (void *)(uintptr_t)val2); 2990 + 2991 + if (val2 > limit) { 2992 + /* normal loop if counter > limit */ 2993 + pc += (int)val; 2994 + if (lre_poll_timeout(s)) 2995 + return LRE_RET_TIMEOUT; 2996 + } else { 2997 + /* check advance */ 2998 + if ((opcode == REOP_loop_check_adv_split_goto_first || 2999 + opcode == REOP_loop_check_adv_split_next_first) && 3000 + aux_stack[idx + 1] == cptr && 3001 + val2 != limit) { 3002 + goto no_match; 3003 + } 3004 + 3005 + /* otherwise conditional split */ 3006 + if (val2 != 0) { 3007 + if (opcode == REOP_loop_split_next_first || 3008 + opcode == REOP_loop_check_adv_split_next_first) { 3009 + pc1 = pc + (int)val; 3010 + } else { 3011 + pc1 = pc; 3012 + pc = pc + (int)val; 3013 + } 3014 + CHECK_STACK_SPACE(3); 3015 + sp[0].ptr = (uint8_t *)pc1; 3016 + sp[1].ptr = (uint8_t *)cptr; 3017 + sp[2].bp.val = bp - s->stack_buf; 3018 + sp[2].bp.type = RE_EXEC_STATE_SPLIT; 3019 + sp += 3; 3020 + bp = sp; 3021 + } 2956 3022 } 2957 3023 } 2958 3024 break;