From a7ebfc8f3a7918166b2e5d959e8ca264a9c5936e Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Tue, 27 Jul 2021 14:08:19 +0200 Subject: [PATCH] Implement RegExp named capture groups (#1390) --- boa/src/builtins/regexp/mod.rs | 102 +++++++++++++++++++++------------ boa/src/builtins/string/mod.rs | 45 +++++++++++++-- 2 files changed, 106 insertions(+), 41 deletions(-) diff --git a/boa/src/builtins/regexp/mod.rs b/boa/src/builtins/regexp/mod.rs index 2018d6bb89..2b0cef2025 100644 --- a/boa/src/builtins/regexp/mod.rs +++ b/boa/src/builtins/regexp/mod.rs @@ -12,7 +12,7 @@ pub mod regexp_string_iterator; use crate::{ - builtins::{array::Array, BuiltIn}, + builtins::{array::Array, string, BuiltIn}, gc::{empty_trace, Finalize, Trace}, object::{ConstructorBuilder, FunctionBuilder, GcObject, ObjectData, PROTOTYPE}, property::Attribute, @@ -712,8 +712,7 @@ impl RegExp { // 2. Assert: Type(S) is String. // 3. Let length be the number of code units in S. - // Regress only works with utf8. According to the spec we would use the utf16 encoded count. - let length = input.chars().count(); + let length = input.encode_utf16().count(); // 4. Let lastIndex be ℝ(? ToLength(? Get(R, "lastIndex"))). let mut last_index = this.get_field("lastIndex", context)?.to_length(context)?; @@ -755,10 +754,16 @@ impl RegExp { // b. Let r be matcher(S, lastIndex). // Check if last_index is a valid utf8 index into input. - if input.get(last_index..).is_none() { - return Ok(Value::null()); - } - let r = matcher.find_from(&input, last_index).next(); + let last_byte_index = match String::from_utf16( + &input.encode_utf16().take(last_index).collect::>(), + ) { + Ok(s) => s.len(), + Err(_) => { + return context + .throw_type_error("Failed to get byte index from utf16 encoded string") + } + }; + let r = matcher.find_from(&input, last_byte_index).next(); match r { // c. If r is failure, then @@ -809,8 +814,7 @@ impl RegExp { // Let eUTF be the smallest index into S that corresponds to the character at element e of Input. // If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S. // b. Set e to eUTF. - // Regress only works with utf8. According to the spec we would use the utf16 encoded count. - e = input.split_at(e).0.chars().count() + 1; + e = input.split_at(e).0.encode_utf16().count(); } // 15. If global is true or sticky is true, then @@ -847,12 +851,36 @@ impl RegExp { a.create_data_property_or_throw(0, matched_substr, context) .unwrap(); - // TODO: named capture groups // 24. If R contains any GroupName, then - // a. Let groups be ! OrdinaryObjectCreate(null). // 25. Else, - // a. Let groups be undefined. - let groups = Value::undefined(); + let named_groups = match_value.named_groups(); + let groups = if named_groups.clone().count() > 0 { + // a. Let groups be ! OrdinaryObjectCreate(null). + let groups = Value::new_object(context); + + // Perform 27.f here + // f. If the ith capture of R was defined with a GroupName, then + // i. Let s be the CapturingGroupName of the corresponding RegExpIdentifierName. + // ii. Perform ! CreateDataPropertyOrThrow(groups, s, capturedValue). + for (name, range) in named_groups { + if let Some(range) = range { + let value = if let Some(s) = input.get(range.clone()) { + s + } else { + "" + }; + + groups + .to_object(context)? + .create_data_property_or_throw(name, value, context) + .unwrap(); + } + } + groups + } else { + // a. Let groups be undefined. + Value::undefined() + }; // 26. Perform ! CreateDataPropertyOrThrow(A, "groups", groups). a.create_data_property_or_throw("groups", groups, context) @@ -880,11 +908,6 @@ impl RegExp { // e. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(i)), capturedValue). a.create_data_property_or_throw(i, captured_value, context) .unwrap(); - - // TODO: named capture groups - // f. If the ith capture of R was defined with a GroupName, then - // i. Let s be the CapturingGroupName of the corresponding RegExpIdentifierName. - // ii. Perform ! CreateDataPropertyOrThrow(groups, s, capturedValue). } // 28. Return A. @@ -931,7 +954,7 @@ impl RegExp { let unicode = this.get_field("unicode", context)?.to_boolean(); // c. Perform ? Set(rx, "lastIndex", +0𝔽, true). - this.set_field("lastIndex", Value::from(0), true, context)?; + this.set_field("lastIndex", 0, true, context)?; // d. Let A be ! ArrayCreate(0). let a = Array::array_create(0, None, context).unwrap(); @@ -1099,8 +1122,7 @@ impl RegExp { .to_string(context)?; // 4. Let lengthS be the number of code unit elements in S. - // Regress only works with utf8. According to the spec we would use the utf16 encoded count. - let length_arg_str = arg_str.chars().count(); + let length_arg_str = arg_str.encode_utf16().count(); // 5. Let functionalReplace be IsCallable(replaceValue). let replace_value = args.get(1).cloned().unwrap_or_default(); @@ -1179,8 +1201,7 @@ impl RegExp { let matched = result.get_field("0", context)?.to_string(context)?; // d. Let matchLength be the number of code units in matched. - // Regress only works with utf8. According to the spec we would use the utf16 encoded count. - let match_length = matched.chars().count(); + let match_length = matched.encode_utf16().count(); // e. Let position be ? ToIntegerOrInfinity(? Get(result, "index")). let position = result @@ -1260,13 +1281,14 @@ impl RegExp { } // ii. Let replacement be ? GetSubstitution(matched, S, position, captures, namedCaptures, replaceValue). - replacement = crate::builtins::string::get_substitution( + replacement = string::get_substitution( matched.to_string(), arg_str.to_string(), position, captures, named_captures, replace_value.to_string(context)?.to_string(), + context, )?; } @@ -1433,7 +1455,7 @@ impl RegExp { } // 15. Let size be the length of S. - let size = arg_str.chars().count(); + let size = arg_str.encode_utf16().count(); // 16. If size is 0, then if size == 0 { @@ -1485,10 +1507,13 @@ impl RegExp { q = advance_string_index(arg_str.clone(), q, unicode); } else { // 1. Let T be the substring of S from p to q. - //let arg_str_substring = arg_str - // .get(p..q) - // .expect("invalid index into string to split"); - let arg_str_substring: String = arg_str.chars().skip(p).take(q - p).collect(); + let arg_str_substring = String::from_utf16_lossy( + &arg_str + .encode_utf16() + .skip(p) + .take(q - p) + .collect::>(), + ); // 2. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(lengthA)), T). a.create_data_property_or_throw(length_a, arg_str_substring, context) @@ -1542,7 +1567,13 @@ impl RegExp { } // 20. Let T be the substring of S from p to size. - let arg_str_substring: String = arg_str.chars().skip(p).take(size - p).collect(); + let arg_str_substring = String::from_utf16_lossy( + &arg_str + .encode_utf16() + .skip(p) + .take(size - p) + .collect::>(), + ); // 21. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(lengthA)), T). a.create_data_property_or_throw(length_a, arg_str_substring, context) @@ -1570,7 +1601,7 @@ fn advance_string_index(s: JsString, index: usize, unicode: bool) -> usize { } // 3. Let length be the number of code units in S. - let length = s.chars().count(); + let length = s.encode_utf16().count(); // 4. If index + 1 ≥ length, return index + 1. if index + 1 > length { @@ -1578,11 +1609,8 @@ fn advance_string_index(s: JsString, index: usize, unicode: bool) -> usize { } // 5. Let cp be ! CodePointAt(S, index). - let offset = if let Some(c) = s.chars().nth(index) { - c.len_utf8() - } else { - 1 - }; + let (_, offset, _) = + crate::builtins::string::code_point_at(s, index as i32).expect("Failed to get code point"); - index + offset + index + offset as usize } diff --git a/boa/src/builtins/string/mod.rs b/boa/src/builtins/string/mod.rs index cbe2fb28dc..7770adb716 100644 --- a/boa/src/builtins/string/mod.rs +++ b/boa/src/builtins/string/mod.rs @@ -583,7 +583,7 @@ impl String { length } else { args.get(1) - .expect("Could not get argumetn") + .expect("Could not get argument") .to_integer(context)? as i32 }; @@ -746,6 +746,7 @@ impl String { captures, Value::undefined(), replace_value.to_string(context)?.to_string(), + context, )? }; @@ -1511,8 +1512,9 @@ pub(crate) fn get_substitution( str: StdString, position: usize, captures: Vec, - _named_captures: Value, + named_captures: Value, replacement: StdString, + context: &mut Context, ) -> Result { // 1. Assert: Type(matched) is String. @@ -1626,9 +1628,44 @@ pub(crate) fn get_substitution( } // $< (Some('<'), _) => { - // TODO: named capture groups // 1. If namedCaptures is undefined, the replacement text is the String "$<". - result.push_str("$<"); + // 2. Else, + if named_captures.is_undefined() { + result.push_str("$<") + } else { + // a. Assert: Type(namedCaptures) is Object. + + // b. Scan until the next > U+003E (GREATER-THAN SIGN). + let mut group_name = StdString::new(); + let mut found = false; + loop { + match chars.next() { + Some('>') => { + found = true; + break; + } + Some(c) => group_name.push(c), + None => break, + } + } + + // c. If none is found, the replacement text is the String "$<". + // d. Else, + if !found { + result.push_str("$<"); + result.push_str(&group_name); + } else { + // i. Let groupName be the enclosed substring. + // ii. Let capture be ? Get(namedCaptures, groupName). + let capture = named_captures.get_field(group_name, context)?; + + // iii. If capture is undefined, replace the text through > with the empty String. + // iv. Otherwise, replace the text through > with ? ToString(capture). + if !capture.is_undefined() { + result.push_str(capture.to_string(context)?.as_str()); + } + } + } } // $?, ? is none of the above _ => {