From b1f07805c342079aa711fc33a63ec49f3e0567e5 Mon Sep 17 00:00:00 2001 From: Haled Odat <8566042+HalidOdat@users.noreply.github.com> Date: Wed, 3 Apr 2024 00:10:02 +0200 Subject: [PATCH] Optimize Regex match check (#3779) * Initial optimization * Fix sticky flag * Apply review * Update regress to 0.9.1 * Add spec deviation comments --------- Co-authored-by: raskad <32105367+raskad@users.noreply.github.com> --- Cargo.lock | 4 +- Cargo.toml | 2 +- core/engine/src/builtins/regexp/mod.rs | 122 ++++++++++++------------- 3 files changed, 60 insertions(+), 68 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d600803e86..3ef45ed174 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3074,9 +3074,9 @@ dependencies = [ [[package]] name = "regress" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d06f9a1f7cd8473611ba1a480cf35f9c5cffc2954336ba90a982fdb7e7d7f51e" +checksum = "0eae2a1ebfecc58aff952ef8ccd364329abe627762f5bf09ff42eb9d98522479" dependencies = [ "hashbrown 0.14.3", "memchr", diff --git a/Cargo.toml b/Cargo.toml index 1532621940..df0d57990e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,7 @@ once_cell = { version = "1.19.0", default-features = false } phf = { version = "0.11.2", default-features = false } pollster = "0.3.0" regex = "1.10.4" -regress = { version="0.9.0", features = ["utf16"]} +regress = { version="0.9.1", features = ["utf16"]} rustc-hash = { version = "1.1.0", default-features = false } serde_json = "1.0.115" serde = "1.0.197" diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 926852cd56..f75552b512 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -940,82 +940,74 @@ impl RegExp { // 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false. let full_unicode = flags.contains(&('u' as u16)) || flags.contains(&('v' as u16)); - // 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S. - // 12. NOTE: Each element of input is considered to be a character. - - // 10. Let matchSucceeded be false. - // 13. Repeat, while matchSucceeded is false, - let match_value = loop { - // a. If lastIndex > length, then - if last_index > length { - // i. If global is true or sticky is true, then - if global || sticky { - // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). - this.set(utf16!("lastIndex"), 0, true, context)?; - } + // NOTE: The following steps are take care of by regress: + // + // SKIP: 10. Let matchSucceeded be false. + // SKIP: 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S. + // SKIP: 12. NOTE: Each element of input is considered to be a character. + // SKIP: 13. Repeat, while matchSucceeded is false, + + // 13.a. If lastIndex > length, then + if last_index > length { + // i. If global is true or sticky is true, then + if global || sticky { + // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). + this.set(utf16!("lastIndex"), 0, true, context)?; + } - // ii. Return null. - return Ok(None); + // ii. Return null. + return Ok(None); + } + + // 13.b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. + // 13.c. Let r be matcher(input, inputIndex). + let r: Option = if full_unicode { + matcher.find_from_utf16(input, last_index as usize).next() + } else { + matcher.find_from_ucs2(input, last_index as usize).next() + }; + + let Some(match_value) = r else { + // d. If r is failure, then + // + // NOTE: Merged the following steps (since we no longer have a loop): + // 13.d.i. If sticky is true, then + // 13.a.i. If global is true or sticky is true, then + if global || sticky { + // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). + this.set(utf16!("lastIndex"), 0, true, context)?; } - // b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. - // c. Let r be matcher(input, inputIndex). - let r: Option = if full_unicode { - matcher.find_from_utf16(input, last_index as usize).next() - } else { - matcher.find_from_ucs2(input, last_index as usize).next() - }; + // MOVE: ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). + // NOTE: Handled within the regress matches iterator, see below for last_index assignment. - match r { - // d. If r is failure, then - None => { - // i. If sticky is true, then - if sticky { - // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). - this.set(utf16!("lastIndex"), 0, true, context)?; + // NOTE: Merged and steps: + // 13.a.ii. Return null. + // 13.d.i.2. Return null. + return Ok(None); + }; - // 2. Return null. - return Ok(None); - } + // e. Else + // SKIP: i. Assert: r is a MatchState. + // SKIP: ii. Set matchSucceeded to true. - // ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). - last_index = advance_string_index(input, last_index, full_unicode); - } + // NOTE: regress currently doesn't support the sticky flag so we have to emulate it. + if sticky && match_value.start() != last_index as usize { + // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). + this.set(utf16!("lastIndex"), 0, true, context)?; - Some(m) => { - // d. If r is failure, then - #[allow(clippy::if_not_else)] - if m.start() as u64 != last_index { - // i. If sticky is true, then - if sticky { - // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). - this.set(utf16!("lastIndex"), 0, true, context)?; - - // 2. Return null. - return Ok(None); - } + // 2. Return null. + return Ok(None); + } - // ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). - last_index = advance_string_index(input, last_index, full_unicode); - // e. Else, - } else { - // i. Assert: r is a State. - // ii. Set matchSucceeded to true. - break m; - } - } - } - }; + // 13.d.ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). + // NOTE: Calculation of last_index is done in regress. + last_index = match_value.start() as u64; // 14. Let e be r's endIndex value. - let e = match_value.end(); - - // Note: This is already taken care of be regress. // 15. If fullUnicode is true, set e to GetStringIndex(S, e). - // e is an index into the Input character list, derived from S, matched by matcher. - // Let eUTF be the smallest index into S that corresponds to the character at element e of Input. - // If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S. - // b. Set e to eUTF. + // NOTE: Step 15 is already taken care of by regress. + let e = match_value.end(); // 16. If global is true or sticky is true, then if global || sticky {