Implement RegExp named capture groups (#1390)

3 years ago · a7ebfc8f3a
2 changed files with 106 additions and 41 deletions
--- a/boa/src/builtins/regexp/mod.rs
+++ b/boa/src/builtins/regexp/mod.rs
@ -12,7 +12,7 @@
 pub mod regexp_string_iterator;

 use crate::{
-    builtins::{array::Array, BuiltIn},
+    builtins::{array::Array, string, BuiltIn},
    gc::{empty_trace, Finalize, Trace},
    object::{ConstructorBuilder, FunctionBuilder, GcObject, ObjectData, PROTOTYPE},
    property::Attribute,
@ -712,8 +712,7 @@ impl RegExp {
        // 2. Assert: Type(S) is String.

        // 3. Let length be the number of code units in S.
-        // Regress only works with utf8. According to the spec we would use the utf16 encoded count.
-        let length = input.chars().count();
+        let length = input.encode_utf16().count();

        // 4. Let lastIndex be ℝ(? ToLength(? Get(R, "lastIndex"))).
        let mut last_index = this.get_field("lastIndex", context)?.to_length(context)?;
@ -755,10 +754,16 @@ impl RegExp {

            // b. Let r be matcher(S, lastIndex).
            // Check if last_index is a valid utf8 index into input.
-            if input.get(last_index..).is_none() {
-                return Ok(Value::null());
-            }
-            let r = matcher.find_from(&input, last_index).next();
+            let last_byte_index = match String::from_utf16(
+                &input.encode_utf16().take(last_index).collect::<Vec<u16>>(),
+            ) {
+                Ok(s) => s.len(),
+                Err(_) => {
+                    return context
+                        .throw_type_error("Failed to get byte index from utf16 encoded string")
+                }
+            };
+            let r = matcher.find_from(&input, last_byte_index).next();

            match r {
                // c. If r is failure, then
@ -809,8 +814,7 @@ impl RegExp {
            // Let eUTF be the smallest index into S that corresponds to the character at element e of Input.
            // If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S.
            // b. Set e to eUTF.
-            // Regress only works with utf8. According to the spec we would use the utf16 encoded count.
-            e = input.split_at(e).0.chars().count() + 1;
+            e = input.split_at(e).0.encode_utf16().count();
        }

        // 15. If global is true or sticky is true, then
@ -847,12 +851,36 @@ impl RegExp {
        a.create_data_property_or_throw(0, matched_substr, context)
            .unwrap();

-        // TODO: named capture groups
        // 24. If R contains any GroupName, then
-        //     a. Let groups be ! OrdinaryObjectCreate(null).
        // 25. Else,
-        //     a. Let groups be undefined.
-        let groups = Value::undefined();
+        let named_groups = match_value.named_groups();
+        let groups = if named_groups.clone().count() > 0 {
+            // a. Let groups be ! OrdinaryObjectCreate(null).
+            let groups = Value::new_object(context);
+
+            // Step 27.f is performed here, up front, instead of inside the step 27 capture loop:
+            // f. If the ith capture of R was defined with a GroupName, then
+            // i. Let s be the CapturingGroupName of the corresponding RegExpIdentifierName.
+            // ii. Perform ! CreateDataPropertyOrThrow(groups, s, capturedValue).
+            for (name, range) in named_groups {
+                if let Some(range) = range {
+                    let value = if let Some(s) = input.get(range.clone()) {
+                        s
+                    } else {
+                        ""
+                    };
+
+                    groups
+                        .to_object(context)?
+                        .create_data_property_or_throw(name, value, context)
+                        .unwrap();
+                }
+            }
+            groups
+        } else {
+            // a. Let groups be undefined.
+            Value::undefined()
+        };

        // 26. Perform ! CreateDataPropertyOrThrow(A, "groups", groups).
        a.create_data_property_or_throw("groups", groups, context)
@ -880,11 +908,6 @@ impl RegExp {
            // e. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(i)), capturedValue).
            a.create_data_property_or_throw(i, captured_value, context)
                .unwrap();
-
-            // TODO: named capture groups
-            // f. If the ith capture of R was defined with a GroupName, then
-            // i. Let s be the CapturingGroupName of the corresponding RegExpIdentifierName.
-            // ii. Perform ! CreateDataPropertyOrThrow(groups, s, capturedValue).
        }

        // 28. Return A.
@ -931,7 +954,7 @@ impl RegExp {
            let unicode = this.get_field("unicode", context)?.to_boolean();

            // c. Perform ? Set(rx, "lastIndex", +0𝔽, true).
-            this.set_field("lastIndex", Value::from(0), true, context)?;
+            this.set_field("lastIndex", 0, true, context)?;

            // d. Let A be ! ArrayCreate(0).
            let a = Array::array_create(0, None, context).unwrap();
@ -1099,8 +1122,7 @@ impl RegExp {
            .to_string(context)?;

        // 4. Let lengthS be the number of code unit elements in S.
-        // Regress only works with utf8. According to the spec we would use the utf16 encoded count.
-        let length_arg_str = arg_str.chars().count();
+        let length_arg_str = arg_str.encode_utf16().count();

        // 5. Let functionalReplace be IsCallable(replaceValue).
        let replace_value = args.get(1).cloned().unwrap_or_default();
@ -1179,8 +1201,7 @@ impl RegExp {
            let matched = result.get_field("0", context)?.to_string(context)?;

            // d. Let matchLength be the number of code units in matched.
-            // Regress only works with utf8. According to the spec we would use the utf16 encoded count.
-            let match_length = matched.chars().count();
+            let match_length = matched.encode_utf16().count();

            // e. Let position be ? ToIntegerOrInfinity(? Get(result, "index")).
            let position = result
@ -1260,13 +1281,14 @@ impl RegExp {
                }

                // ii. Let replacement be ? GetSubstitution(matched, S, position, captures, namedCaptures, replaceValue).
-                replacement = crate::builtins::string::get_substitution(
+                replacement = string::get_substitution(
                    matched.to_string(),
                    arg_str.to_string(),
                    position,
                    captures,
                    named_captures,
                    replace_value.to_string(context)?.to_string(),
+                    context,
                )?;
            }

@ -1433,7 +1455,7 @@ impl RegExp {
        }

        // 15. Let size be the length of S.
-        let size = arg_str.chars().count();
+        let size = arg_str.encode_utf16().count();

        // 16. If size is 0, then
        if size == 0 {
@ -1485,10 +1507,13 @@ impl RegExp {
                    q = advance_string_index(arg_str.clone(), q, unicode);
                } else {
                    // 1. Let T be the substring of S from p to q.
-                    //let arg_str_substring = arg_str
-                    //    .get(p..q)
-                    //    .expect("invalid index into string to split");
-                    let arg_str_substring: String = arg_str.chars().skip(p).take(q - p).collect();
+                    let arg_str_substring = String::from_utf16_lossy(
+                        &arg_str
+                            .encode_utf16()
+                            .skip(p)
+                            .take(q - p)
+                            .collect::<Vec<u16>>(),
+                    );

                    // 2. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(lengthA)), T).
                    a.create_data_property_or_throw(length_a, arg_str_substring, context)
@ -1542,7 +1567,13 @@ impl RegExp {
        }

        // 20. Let T be the substring of S from p to size.
-        let arg_str_substring: String = arg_str.chars().skip(p).take(size - p).collect();
+        let arg_str_substring = String::from_utf16_lossy(
+            &arg_str
+                .encode_utf16()
+                .skip(p)
+                .take(size - p)
+                .collect::<Vec<u16>>(),
+        );

        // 21. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(lengthA)), T).
        a.create_data_property_or_throw(length_a, arg_str_substring, context)
@ -1570,7 +1601,7 @@ fn advance_string_index(s: JsString, index: usize, unicode: bool) -> usize {
    }

    // 3. Let length be the number of code units in S.
-    let length = s.chars().count();
+    let length = s.encode_utf16().count();

    // 4. If index + 1 ≥ length, return index + 1.
    if index + 1 > length {
@ -1578,11 +1609,8 @@ fn advance_string_index(s: JsString, index: usize, unicode: bool) -> usize {
    }

    // 5. Let cp be ! CodePointAt(S, index).
-    let offset = if let Some(c) = s.chars().nth(index) {
-        c.len_utf8()
-    } else {
-        1
-    };
+    let (_, offset, _) =
+        crate::builtins::string::code_point_at(s, index as i32).expect("Failed to get code point");

-    index + offset
+    index + offset as usize
 }
--- a/boa/src/builtins/string/mod.rs
+++ b/boa/src/builtins/string/mod.rs
@ -583,7 +583,7 @@ impl String {
            length
        } else {
            args.get(1)
-                .expect("Could not get argumetn")
+                .expect("Could not get argument")
                .to_integer(context)? as i32
        };

@ -746,6 +746,7 @@ impl String {
                captures,
                Value::undefined(),
                replace_value.to_string(context)?.to_string(),
+                context,
            )?
        };

@ -1511,8 +1512,9 @@ pub(crate) fn get_substitution(
    str: StdString,
    position: usize,
    captures: Vec<Value>,
-    _named_captures: Value,
+    named_captures: Value,
    replacement: StdString,
+    context: &mut Context,
 ) -> Result<JsString> {
    // 1. Assert: Type(matched) is String.

@ -1626,9 +1628,44 @@ pub(crate) fn get_substitution(
                }
                // $<
                (Some('<'), _) => {
-                    // TODO: named capture groups
                    // 1. If namedCaptures is undefined, the replacement text is the String "$<".
-                    result.push_str("$<");
+                    // 2. Else,
+                    if named_captures.is_undefined() {
+                        result.push_str("$<")
+                    } else {
+                        // a. Assert: Type(namedCaptures) is Object.
+
+                        // b. Scan until the next > U+003E (GREATER-THAN SIGN).
+                        let mut group_name = StdString::new();
+                        let mut found = false;
+                        loop {
+                            match chars.next() {
+                                Some('>') => {
+                                    found = true;
+                                    break;
+                                }
+                                Some(c) => group_name.push(c),
+                                None => break,
+                            }
+                        }
+
+                        // c. If none is found, the replacement text is the String "$<".
+                        // d. Else,
+                        if !found {
+                            result.push_str("$<");
+                            result.push_str(&group_name);
+                        } else {
+                            // i. Let groupName be the enclosed substring.
+                            // ii. Let capture be ? Get(namedCaptures, groupName).
+                            let capture = named_captures.get_field(group_name, context)?;
+
+                            // iii. If capture is undefined, replace the text through > with the empty String.
+                            // iv. Otherwise, replace the text through > with ? ToString(capture).
+                            if !capture.is_undefined() {
+                                result.push_str(capture.to_string(context)?.as_str());
+                            }
+                        }
+                    }
                }
                // $?, ? is none of the above
                _ => {