Browse Source

Optimize Regex match check (#3779)

* Initial optimization

* Fix sticky flag

* Apply review

* Update regress to 0.9.1

* Add spec deviation comments

---------

Co-authored-by: raskad <32105367+raskad@users.noreply.github.com>
pull/3787/head
Haled Odat 8 months ago committed by GitHub
parent
commit
b1f07805c3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 4
      Cargo.lock
  2. 2
      Cargo.toml
  3. 74
      core/engine/src/builtins/regexp/mod.rs

4
Cargo.lock generated

@ -3074,9 +3074,9 @@ dependencies = [
[[package]] [[package]]
name = "regress" name = "regress"
version = "0.9.0" version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d06f9a1f7cd8473611ba1a480cf35f9c5cffc2954336ba90a982fdb7e7d7f51e" checksum = "0eae2a1ebfecc58aff952ef8ccd364329abe627762f5bf09ff42eb9d98522479"
dependencies = [ dependencies = [
"hashbrown 0.14.3", "hashbrown 0.14.3",
"memchr", "memchr",

2
Cargo.toml

@ -64,7 +64,7 @@ once_cell = { version = "1.19.0", default-features = false }
phf = { version = "0.11.2", default-features = false } phf = { version = "0.11.2", default-features = false }
pollster = "0.3.0" pollster = "0.3.0"
regex = "1.10.4" regex = "1.10.4"
regress = { version="0.9.0", features = ["utf16"]} regress = { version="0.9.1", features = ["utf16"]}
rustc-hash = { version = "1.1.0", default-features = false } rustc-hash = { version = "1.1.0", default-features = false }
serde_json = "1.0.115" serde_json = "1.0.115"
serde = "1.0.197" serde = "1.0.197"

74
core/engine/src/builtins/regexp/mod.rs

@ -940,13 +940,14 @@ impl RegExp {
// 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false. // 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false.
let full_unicode = flags.contains(&('u' as u16)) || flags.contains(&('v' as u16)); let full_unicode = flags.contains(&('u' as u16)) || flags.contains(&('v' as u16));
// 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S. // NOTE: The following steps are take care of by regress:
// 12. NOTE: Each element of input is considered to be a character. //
// SKIP: 10. Let matchSucceeded be false.
// SKIP: 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S.
// SKIP: 12. NOTE: Each element of input is considered to be a character.
// SKIP: 13. Repeat, while matchSucceeded is false,
// 10. Let matchSucceeded be false. // 13.a. If lastIndex > length, then
// 13. Repeat, while matchSucceeded is false,
let match_value = loop {
// a. If lastIndex > length, then
if last_index > length { if last_index > length {
// i. If global is true or sticky is true, then // i. If global is true or sticky is true, then
if global || sticky { if global || sticky {
@ -958,36 +959,40 @@ impl RegExp {
return Ok(None); return Ok(None);
} }
// b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. // 13.b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S.
// c. Let r be matcher(input, inputIndex). // 13.c. Let r be matcher(input, inputIndex).
let r: Option<regress::Match> = if full_unicode { let r: Option<regress::Match> = if full_unicode {
matcher.find_from_utf16(input, last_index as usize).next() matcher.find_from_utf16(input, last_index as usize).next()
} else { } else {
matcher.find_from_ucs2(input, last_index as usize).next() matcher.find_from_ucs2(input, last_index as usize).next()
}; };
match r { let Some(match_value) = r else {
// d. If r is failure, then // d. If r is failure, then
None => { //
// i. If sticky is true, then // NOTE: Merged the following steps (since we no longer have a loop):
if sticky { // 13.d.i. If sticky is true, then
// 13.a.i. If global is true or sticky is true, then
if global || sticky {
// 1. Perform ? Set(R, "lastIndex", +0𝔽, true). // 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
this.set(utf16!("lastIndex"), 0, true, context)?; this.set(utf16!("lastIndex"), 0, true, context)?;
}
// 2. Return null. // MOVE: ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode).
// NOTE: Handled within the regress matches iterator, see below for last_index assignment.
// NOTE: Merged and steps:
// 13.a.ii. Return null.
// 13.d.i.2. Return null.
return Ok(None); return Ok(None);
} };
// ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). // e. Else
last_index = advance_string_index(input, last_index, full_unicode); // SKIP: i. Assert: r is a MatchState.
} // SKIP: ii. Set matchSucceeded to true.
Some(m) => { // NOTE: regress currently doesn't support the sticky flag so we have to emulate it.
// d. If r is failure, then if sticky && match_value.start() != last_index as usize {
#[allow(clippy::if_not_else)]
if m.start() as u64 != last_index {
// i. If sticky is true, then
if sticky {
// 1. Perform ? Set(R, "lastIndex", +0𝔽, true). // 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
this.set(utf16!("lastIndex"), 0, true, context)?; this.set(utf16!("lastIndex"), 0, true, context)?;
@ -995,27 +1000,14 @@ impl RegExp {
return Ok(None); return Ok(None);
} }
// ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). // 13.d.ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode).
last_index = advance_string_index(input, last_index, full_unicode); // NOTE: Calculation of last_index is done in regress.
// e. Else, last_index = match_value.start() as u64;
} else {
// i. Assert: r is a State.
// ii. Set matchSucceeded to true.
break m;
}
}
}
};
// 14. Let e be r's endIndex value. // 14. Let e be r's endIndex value.
let e = match_value.end();
// Note: This is already taken care of be regress.
// 15. If fullUnicode is true, set e to GetStringIndex(S, e). // 15. If fullUnicode is true, set e to GetStringIndex(S, e).
// e is an index into the Input character list, derived from S, matched by matcher. // NOTE: Step 15 is already taken care of by regress.
// Let eUTF be the smallest index into S that corresponds to the character at element e of Input. let e = match_value.end();
// If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S.
// b. Set e to eUTF.
// 16. If global is true or sticky is true, then // 16. If global is true or sticky is true, then
if global || sticky { if global || sticky {

Loading…
Cancel
Save