From a7c2f5baa7845301263c15f9b1427f2d5279718e Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Tue, 27 Feb 2024 15:41:35 +0100 Subject: [PATCH] Implement RegExp `v` flag (#3695) --- Cargo.lock | 4 +- Cargo.toml | 2 +- core/engine/src/builtins/regexp/mod.rs | 124 ++++++++++++++++--------- core/parser/src/lexer/regex.rs | 12 +++ test262_config.toml | 1 - 5 files changed, 95 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 70ddf2ff13..b0bf0fd12c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3058,9 +3058,9 @@ dependencies = [ [[package]] name = "regress" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f5f39ba4513916c1b2657b72af6ec671f091cd637992f58d0ede5cae4e5dea0" +checksum = "d06f9a1f7cd8473611ba1a480cf35f9c5cffc2954336ba90a982fdb7e7d7f51e" dependencies = [ "hashbrown 0.14.3", "memchr", diff --git a/Cargo.toml b/Cargo.toml index 3bfafe3b88..b940e37b60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ once_cell = { version = "1.19.0", default-features = false } phf = { version = "0.11.2", default-features = false } pollster = "0.3.0" regex = "1.10.3" -regress = { version="0.8.0", features = ["utf16"]} +regress = { version="0.9.0", features = ["utf16"]} rustc-hash = { version = "1.1.0", default-features = false } serde_json = "1.0.114" serde = "1.0.197" diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index f45d2361f2..926852cd56 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -75,6 +75,9 @@ impl IntrinsicObject for RegExp { let get_unicode = BuiltInBuilder::callable(realm, Self::get_unicode) .name(js_string!("get unicode")) .build(); + let get_unicode_sets = BuiltInBuilder::callable(realm, Self::get_unicode_sets) + .name(js_string!("get unicodeSets")) + .build(); let get_sticky = BuiltInBuilder::callable(realm, Self::get_sticky) 
.name(js_string!("get sticky")) .build(); @@ -136,6 +139,12 @@ impl IntrinsicObject for RegExp { None, flag_attributes, ) + .accessor( + js_string!("unicodeSets"), + Some(get_unicode_sets), + None, + flag_attributes, + ) .accessor( js_string!("sticky"), Some(get_sticky), @@ -427,6 +436,7 @@ impl RegExp { b's' => regexp.flags.contains(RegExpFlags::DOT_ALL), b'i' => regexp.flags.contains(RegExpFlags::IGNORE_CASE), b'u' => regexp.flags.contains(RegExpFlags::UNICODE), + b'v' => regexp.flags.contains(RegExpFlags::UNICODE_SETS), b'y' => regexp.flags.contains(RegExpFlags::STICKY), _ => unreachable!(), })); @@ -447,6 +457,7 @@ impl RegExp { b's' => "dotAll", b'i' => "ignoreCase", b'u' => "unicode", + b'v' => "unicodeSets", b'y' => "sticky", _ => unreachable!(), }; @@ -565,6 +576,22 @@ impl RegExp { Self::regexp_has_flag(this, b'u', context) } + /// `get RegExp.prototype.unicodeSets` + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#sec-get-regexp.prototype.unicodesets + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicodeSets + pub(crate) fn get_unicode_sets( + this: &JsValue, + _: &[JsValue], + context: &mut Context, + ) -> JsResult<JsValue> { + Self::regexp_has_flag(this, b'v', context) + } + /// `get RegExp.prototype.sticky` /// /// This flag indicates that it matches only from the index indicated by the `lastIndex` property @@ -601,58 +628,67 @@ impl RegExp { context: &mut Context, ) -> JsResult<JsValue> { // 1. Let R be the this value. - // 2. If Type(R) is not Object, throw a TypeError exception. - if let Some(object) = this.as_object() { - // 3. Let result be the empty String. - let mut result = String::new(); + // 2. If R is not an Object, throw a TypeError exception. 
+ let Some(object) = this.as_object() else { + return Err(JsNativeError::typ() + .with_message("RegExp.prototype.flags getter called on non-object") + .into()); + }; - // 4. Let hasIndices be ToBoolean(? Get(R, "hasIndices")). - // 5. If hasIndices is true, append the code unit 0x0064 (LATIN SMALL LETTER D) as the last code unit of result. - if object.get(utf16!("hasIndices"), context)?.to_boolean() { - result.push('d'); - } + // 3. Let codeUnits be a new empty List. + let mut code_units = Vec::new(); - // 6. Let global be ! ToBoolean(? Get(R, "global")). - // 7. If global is true, append the code unit 0x0067 (LATIN SMALL LETTER G) as the last code unit of result. - if object.get(utf16!("global"), context)?.to_boolean() { - result.push('g'); - } - // 8. Let ignoreCase be ! ToBoolean(? Get(R, "ignoreCase")). - // 9. If ignoreCase is true, append the code unit 0x0069 (LATIN SMALL LETTER I) as the last code unit of result. - if object.get(utf16!("ignoreCase"), context)?.to_boolean() { - result.push('i'); - } + // 4. Let hasIndices be ToBoolean(? Get(R, "hasIndices")). + // 5. If hasIndices is true, append the code unit 0x0064 (LATIN SMALL LETTER D) to codeUnits. + if object.get(utf16!("hasIndices"), context)?.to_boolean() { + code_units.extend_from_slice(utf16!("d")); + } - // 10. Let multiline be ! ToBoolean(? Get(R, "multiline")). - // 11. If multiline is true, append the code unit 0x006D (LATIN SMALL LETTER M) as the last code unit of result. - if object.get(utf16!("multiline"), context)?.to_boolean() { - result.push('m'); - } + // 6. Let global be ToBoolean(? Get(R, "global")). + // 7. If global is true, append the code unit 0x0067 (LATIN SMALL LETTER G) to codeUnits. + if object.get(utf16!("global"), context)?.to_boolean() { + code_units.extend_from_slice(utf16!("g")); + } - // 12. Let dotAll be ! ToBoolean(? Get(R, "dotAll")). - // 13. If dotAll is true, append the code unit 0x0073 (LATIN SMALL LETTER S) as the last code unit of result. 
- if object.get(utf16!("dotAll"), context)?.to_boolean() { - result.push('s'); - } - // 14. Let unicode be ! ToBoolean(? Get(R, "unicode")). - // 15. If unicode is true, append the code unit 0x0075 (LATIN SMALL LETTER U) as the last code unit of result. - if object.get(utf16!("unicode"), context)?.to_boolean() { - result.push('u'); - } + // 8. Let ignoreCase be ToBoolean(? Get(R, "ignoreCase")). + // 9. If ignoreCase is true, append the code unit 0x0069 (LATIN SMALL LETTER I) to codeUnits. + if object.get(utf16!("ignoreCase"), context)?.to_boolean() { + code_units.extend_from_slice(utf16!("i")); + } - // 16. Let sticky be ! ToBoolean(? Get(R, "sticky")). - // 17. If sticky is true, append the code unit 0x0079 (LATIN SMALL LETTER Y) as the last code unit of result. - if object.get(utf16!("sticky"), context)?.to_boolean() { - result.push('y'); - } + // 10. Let multiline be ToBoolean(? Get(R, "multiline")). + // 11. If multiline is true, append the code unit 0x006D (LATIN SMALL LETTER M) to codeUnits. + if object.get(utf16!("multiline"), context)?.to_boolean() { + code_units.extend_from_slice(utf16!("m")); + } - // 18. Return result. - return Ok(js_string!(result).into()); + // 12. Let dotAll be ToBoolean(? Get(R, "dotAll")). + // 13. If dotAll is true, append the code unit 0x0073 (LATIN SMALL LETTER S) to codeUnits. + if object.get(utf16!("dotAll"), context)?.to_boolean() { + code_units.extend_from_slice(utf16!("s")); } - Err(JsNativeError::typ() - .with_message("RegExp.prototype.flags getter called on non-object") - .into()) + // 14. Let unicode be ToBoolean(? Get(R, "unicode")). + // 15. If unicode is true, append the code unit 0x0075 (LATIN SMALL LETTER U) to codeUnits. + if object.get(utf16!("unicode"), context)?.to_boolean() { + code_units.extend_from_slice(utf16!("u")); + } + + // 16. Let unicodeSets be ToBoolean(? Get(R, "unicodeSets")). + // 17. If unicodeSets is true, append the code unit 0x0076 (LATIN SMALL LETTER V) to codeUnits. 
+ if object.get(utf16!("unicodeSets"), context)?.to_boolean() { + code_units.extend_from_slice(utf16!("v")); + } + + // 18. Let sticky be ToBoolean(? Get(R, "sticky")). + // 19. If sticky is true, append the code unit 0x0079 (LATIN SMALL LETTER Y) to codeUnits. + if object.get(utf16!("sticky"), context)?.to_boolean() { + code_units.extend_from_slice(utf16!("y")); + } + + // 20. Return the String value whose code units are the elements of the List codeUnits. + // If codeUnits has no elements, the empty String is returned. + Ok(JsString::from(code_units).into()) } /// `get RegExp.prototype.source` diff --git a/core/parser/src/lexer/regex.rs b/core/parser/src/lexer/regex.rs index 2a99e013f3..999ac476bb 100644 --- a/core/parser/src/lexer/regex.rs +++ b/core/parser/src/lexer/regex.rs @@ -169,6 +169,9 @@ bitflags! { /// Whether the regular expression result exposes the start and end indices of /// captured substrings. const HAS_INDICES = 0b0100_0000; + + /// Whether or not UnicodeSets features are enabled. 
+ const UNICODE_SETS = 0b1000_0000; } } @@ -186,6 +189,7 @@ impl FromStr for RegExpFlags { b'u' => Self::UNICODE, b'y' => Self::STICKY, b'd' => Self::HAS_INDICES, + b'v' => Self::UNICODE_SETS, _ => return Err(format!("invalid regular expression flag {}", char::from(c))), }; @@ -198,6 +202,10 @@ impl FromStr for RegExpFlags { flags.insert(new_flag); } + if flags.contains(Self::UNICODE) && flags.contains(Self::UNICODE_SETS) { + return Err("cannot use both 'u' and 'v' flags".into()); + } + Ok(flags) } } @@ -233,6 +241,9 @@ impl ToString for RegExpFlags { if self.contains(Self::STICKY) { s.push('y'); } + if self.contains(Self::UNICODE_SETS) { + s.push('v'); + } s } } @@ -244,6 +255,7 @@ impl From<RegExpFlags> for Flags { multiline: value.contains(RegExpFlags::MULTILINE), dot_all: value.contains(RegExpFlags::DOT_ALL), unicode: value.contains(RegExpFlags::UNICODE), + unicode_sets: value.contains(RegExpFlags::UNICODE_SETS), ..Self::default() } } diff --git a/test262_config.toml b/test262_config.toml index ea717923d1..28740c7a32 100644 --- a/test262_config.toml +++ b/test262_config.toml @@ -13,7 +13,6 @@ features = [ "Intl.DisplayNames", "Intl.RelativeTimeFormat", "Intl-enumeration", - "regexp-v-flag", ### Pending proposals