Browse Source

Update regress to v0.8.0 and use UTF16 / UCS2 matching (#3627)

* Update regress to v0.8.0 and use UTF16 / UCS2 matching

* Fix test
pull/3626/head
raskad 10 months ago committed by GitHub
parent
commit
34d008469f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 22
      Cargo.lock
  2. 4
      Cargo.toml
  3. 60
      core/engine/src/builtins/regexp/mod.rs
  4. 2
      core/engine/src/builtins/regexp/tests.rs
  5. 9
      test262_config.toml

22
Cargo.lock generated

@ -49,6 +49,12 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "allocator-api2"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]] [[package]]
name = "anes" name = "anes"
version = "0.1.6" version = "0.1.6"
@ -1502,15 +1508,6 @@ dependencies = [
"ahash 0.7.7", "ahash 0.7.7",
] ]
[[package]]
name = "hashbrown"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
dependencies = [
"ahash 0.8.7",
]
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.14.3" version = "0.14.3"
@ -1518,6 +1515,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
dependencies = [ dependencies = [
"ahash 0.8.7", "ahash 0.8.7",
"allocator-api2",
] ]
[[package]] [[package]]
@ -2795,11 +2793,11 @@ dependencies = [
[[package]] [[package]]
name = "regress" name = "regress"
version = "0.7.1" version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ed9969cad8051328011596bf549629f1b800cf1731e7964b1eef8dfc480d2c2" checksum = "4f5f39ba4513916c1b2657b72af6ec671f091cd637992f58d0ede5cae4e5dea0"
dependencies = [ dependencies = [
"hashbrown 0.13.2", "hashbrown 0.14.3",
"memchr", "memchr",
] ]

4
Cargo.toml

@ -17,7 +17,7 @@ members = [
exclude = [ exclude = [
"tests/fuzz", # Does weird things on Windows tests "tests/fuzz", # Does weird things on Windows tests
"tests/src" # Just a hack to have fuzz inside tests "tests/src", # Just a hack to have fuzz inside tests
] ]
[workspace.package] [workspace.package]
@ -59,7 +59,7 @@ once_cell = { version = "1.19.0", default-features = false }
phf = { version = "0.11.2", default-features = false } phf = { version = "0.11.2", default-features = false }
pollster = "0.3.0" pollster = "0.3.0"
regex = "1.10.3" regex = "1.10.3"
regress = "0.7.1" regress = { version="0.8.0", features = ["utf16"]}
rustc-hash = { version = "1.1.0", default-features = false } rustc-hash = { version = "1.1.0", default-features = false }
serde_json = "1.0.113" serde_json = "1.0.113"
serde = "1.0.196" serde = "1.0.196"

60
core/engine/src/builtins/regexp/mod.rs

@ -904,14 +904,12 @@ impl RegExp {
// 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false. // 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false.
let full_unicode = flags.contains(&('u' as u16)) || flags.contains(&('v' as u16)); let full_unicode = flags.contains(&('u' as u16)) || flags.contains(&('v' as u16));
// TODO:
// 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S. // 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S.
// 12. NOTE: Each element of input is considered to be a character. // 12. NOTE: Each element of input is considered to be a character.
// 10. Let matchSucceeded be false. // 10. Let matchSucceeded be false.
// 13. Repeat, while matchSucceeded is false, // 13. Repeat, while matchSucceeded is false,
let lossy_input = input.to_std_string_escaped(); let match_value = loop {
let (match_value, last_byte_index) = loop {
// a. If lastIndex > length, then // a. If lastIndex > length, then
if last_index > length { if last_index > length {
// i. If global is true or sticky is true, then // i. If global is true or sticky is true, then
@ -925,18 +923,12 @@ impl RegExp {
} }
// b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. // b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S.
// Check if last_index is a valid utf8 index into input.
// TODO: avoid converting to String
let last_byte_index = match String::from_utf16(&input[..last_index as usize]) {
Ok(s) => s.len(),
Err(_) => {
return Err(JsNativeError::typ()
.with_message("Failed to get byte index from utf16 encoded string")
.into())
}
};
// c. Let r be matcher(input, inputIndex). // c. Let r be matcher(input, inputIndex).
let r = matcher.find_from(&lossy_input, last_byte_index).next(); let r: Option<regress::Match> = if full_unicode {
matcher.find_from_utf16(input, last_index as usize).next()
} else {
matcher.find_from_ucs2(input, last_index as usize).next()
};
match r { match r {
// d. If r is failure, then // d. If r is failure, then
@ -957,7 +949,7 @@ impl RegExp {
Some(m) => { Some(m) => {
// d. If r is failure, then // d. If r is failure, then
#[allow(clippy::if_not_else)] #[allow(clippy::if_not_else)]
if m.start() != last_byte_index { if m.start() as u64 != last_index {
// i. If sticky is true, then // i. If sticky is true, then
if sticky { if sticky {
// 1. Perform ? Set(R, "lastIndex", +0𝔽, true). // 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
@ -969,38 +961,30 @@ impl RegExp {
// ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). // ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode).
last_index = advance_string_index(input, last_index, full_unicode); last_index = advance_string_index(input, last_index, full_unicode);
// e. Else, // e. Else,
} else { } else {
// i. Assert: r is a State. // i. Assert: r is a State.
// ii. Set matchSucceeded to true. // ii. Set matchSucceeded to true.
break (m, last_byte_index); break m;
} }
} }
} }
}; };
// 14. Let e be r's endIndex value. // 14. Let e be r's endIndex value.
let mut e = match_value.end(); let e = match_value.end();
// Note: This is already taken care of by regress.
// 15. If fullUnicode is true, set e to GetStringIndex(S, e). // 15. If fullUnicode is true, set e to GetStringIndex(S, e).
// TODO: disabled for now until we have UTF-16 support // e is an index into the Input character list, derived from S, matched by matcher.
if false { // Let eUTF be the smallest index into S that corresponds to the character at element e of Input.
// e is an index into the Input character list, derived from S, matched by matcher. // If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S.
// Let eUTF be the smallest index into S that corresponds to the character at element e of Input. // b. Set e to eUTF.
// If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S.
// b. Set e to eUTF.
e = input.get(..e).map_or_else(|| input.len(), <[u16]>::len);
}
// 16. If global is true or sticky is true, then // 16. If global is true or sticky is true, then
if global || sticky { if global || sticky {
// a. Perform ? Set(R, "lastIndex", 𝔽(e), true). // a. Perform ? Set(R, "lastIndex", 𝔽(e), true).
this.set( this.set(utf16!("lastIndex"), e, true, context)?;
utf16!("lastIndex"),
lossy_input[..e].encode_utf16().count(),
true,
context,
)?;
} }
// 17. Let n be the number of elements in r's captures List. // 17. Let n be the number of elements in r's captures List.
@ -1039,7 +1023,7 @@ impl RegExp {
.expect("this CreateDataPropertyOrThrow call must not fail"); .expect("this CreateDataPropertyOrThrow call must not fail");
// 28. Let matchedSubstr be GetMatchString(S, match). // 28. Let matchedSubstr be GetMatchString(S, match).
let matched_substr = js_string!(&lossy_input[last_byte_index..e]); let matched_substr = js_string!(&input[(last_index as usize)..(e)]);
// 29. Perform ! CreateDataPropertyOrThrow(A, "0", matchedSubstr). // 29. Perform ! CreateDataPropertyOrThrow(A, "0", matchedSubstr).
a.create_data_property_or_throw(0, matched_substr, context) a.create_data_property_or_throw(0, matched_substr, context)
@ -1069,8 +1053,7 @@ impl RegExp {
for (name, range) in named_groups { for (name, range) in named_groups {
let name = js_string!(name); let name = js_string!(name);
if let Some(range) = range { if let Some(range) = range {
// TODO: Full UTF-16 regex support let value = js_string!(&input[range.clone()]);
let value = js_string!(&lossy_input[range.clone()]);
groups groups
.create_data_property_or_throw(name.clone(), value, context) .create_data_property_or_throw(name.clone(), value, context)
@ -1130,10 +1113,9 @@ impl RegExp {
// b. If captureI is undefined, let capturedValue be undefined. // b. If captureI is undefined, let capturedValue be undefined.
// c. Else if fullUnicode is true, then // c. Else if fullUnicode is true, then
// d. Else, // d. Else,
// TODO: Full UTF-16 regex support let captured_value = capture
let captured_value = capture.clone().map_or_else(JsValue::undefined, |range| { .clone()
js_string!(&lossy_input[range]).into() .map_or_else(JsValue::undefined, |range| js_string!(&input[range]).into());
});
// e. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(i)), capturedValue). // e. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(i)), capturedValue).
a.create_data_property_or_throw(i, captured_value.clone(), context) a.create_data_property_or_throw(i, captured_value.clone(), context)

2
core/engine/src/builtins/regexp/tests.rs

@ -125,7 +125,7 @@ fn no_panic_on_parse_fail() {
TestAction::assert_native_error( TestAction::assert_native_error(
r"var re = /]/u;", r"var re = /]/u;",
JsNativeErrorKind::Syntax, JsNativeErrorKind::Syntax,
"Invalid regular expression literal: Unbalanced bracket at line 1, col 10", "Invalid regular expression literal: Invalid atom character at line 1, col 10",
), ),
TestAction::assert_native_error( TestAction::assert_native_error(
r"var re = /a{/u;", r"var re = /a{/u;",

9
test262_config.toml

@ -16,6 +16,7 @@ features = [
"Intl.RelativeTimeFormat", "Intl.RelativeTimeFormat",
"Intl-enumeration", "Intl-enumeration",
"Intl.NumberFormat-v3", "Intl.NumberFormat-v3",
"regexp-v-flag",
### Pending proposals ### Pending proposals
@ -65,12 +66,6 @@ features = [
### Non-standard ### Non-standard
"caller", "caller",
### RegExp tests that check individual codepoints.
### They are not useful considering the cpu time they waste.
"regexp-unicode-property-escapes",
] ]
# RegExp tests that check individual codepoints. tests = ["NumberFormat"]
# They are not useful considering the cpu time they waste.
tests = ["CharacterClassEscapes", "NumberFormat"]

Loading…
Cancel
Save