Implement String.prototype.codePointAt (#935)

Closes #751.
4 years ago · c54c6afa6a
2 changed files with 106 additions and 23 deletions
--- a/boa/src/builtins/string/mod.rs
+++ b/boa/src/builtins/string/mod.rs
@ -22,7 +22,7 @@ use crate::{
 };
 use regress::Regex;
 use std::{
-    char::decode_utf16,
+    char::{decode_utf16, from_u32},
    cmp::{max, min},
    f64::NAN,
    string::String as StdString,
@ -50,11 +50,11 @@ pub(crate) fn code_point_at(string: RcString, position: i32) -> Option<(u32, u8,
 }
 /// Returns `true` when `value` lies in the inclusive range `0xD800..=0xDBFF`,
 /// i.e. the UTF-16 code unit is a leading (high) surrogate.
 fn is_leading_surrogate(value: u16) -> bool {
-    value >= 0xD800 && value <= 0xDBFF
+    (0xD800..=0xDBFF).contains(&value)
 }
 /// Returns `true` when `value` lies in the inclusive range `0xDC00..=0xDFFF`,
 /// i.e. the UTF-16 code unit is a trailing (low) surrogate.
 fn is_trailing_surrogate(value: u16) -> bool {
-    value >= 0xDC00 && value <= 0xDFFF
+    (0xDC00..=0xDFFF).contains(&value)
 }
 /// JavaScript `String` implementation.
@ -84,6 +84,7 @@ impl BuiltIn for String {
        .property("length", 0, attribute)
        .method(Self::char_at, "charAt", 1)
        .method(Self::char_code_at, "charCodeAt", 1)
        .method(Self::code_point_at, "codePointAt", 1)
        .method(Self::to_string, "toString", 0)
        .method(Self::concat, "concat", 1)
        .method(Self::repeat, "repeat", 1)
@ -197,23 +198,60 @@ impl String {
            .unwrap_or_else(Value::undefined)
            .to_integer(context)? as i32;
        // Fast path returning empty string when pos is obviously out of range
        if pos < 0 || pos >= primitive_val.len() as i32 {
            return Ok("".into());
        }
        // Calling .len() on a string would give the wrong result, as they are bytes not the number of
        // unicode code points
        // Note that this is an O(N) operation (because UTF-8 is complex) while getting the number of
        // bytes is an O(1) operation.
-        let length = primitive_val.chars().count();
+        if let Some(utf16_val) = primitive_val.encode_utf16().nth(pos as usize) {
            Ok(Value::from(from_u32(utf16_val as u32).unwrap()))
        } else {
            Ok("".into())
        }
    }
-        // We should return an empty string is pos is out of range
+    /// `String.prototype.codePointAt( index )`
-        if pos >= length as i32 || pos < 0 {
+    ///
-            return Ok("".into());
+    /// The `codePointAt()` method returns an integer between `0` and `1114111` (`0x10FFFF`) representing the Unicode code point that starts at the given index.
    ///
    /// If no UTF-16 surrogate pair begins at the index, the code point at the index is returned.
    ///
    /// `codePointAt()` returns `undefined` if the given index is less than `0`, or if it is equal to or greater than the `length` of the string.
    ///
    /// More information:
    ///  - [ECMAScript reference][spec]
    ///  - [MDN documentation][mdn]
    ///
    /// [spec]: https://tc39.es/ecma262/#sec-string.prototype.codepointat
    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/codePointAt
    pub(crate) fn code_point_at(
        this: &Value,
        args: &[Value],
        context: &mut Context,
    ) -> Result<Value> {
        // Convert `this` to its string value; `?` propagates any error raised by the conversion.
        let primitive_val = this.to_string(context)?;
        // A missing first argument is treated as `undefined` before the integer conversion.
        let pos = args
            .get(0)
            .cloned()
            .unwrap_or_else(Value::undefined)
            .to_integer(context)? as i32;
        // Fast path returning undefined when pos is obviously out of range
        // NOTE(review): `len()` is presumably the UTF-8 byte length, which is never smaller than
        // the UTF-16 length, so this only rejects positions that cannot be valid - confirm.
        if pos < 0 || pos >= primitive_val.len() as i32 {
            return Ok(Value::undefined());
        }
        // Delegate to the free `code_point_at` helper; only the first tuple field (the code
        // point) is used, and a `None` result maps to `undefined`.
-        Ok(Value::from(
+        if let Some((code_point, _, _)) = code_point_at(primitive_val, pos) {
-            primitive_val
+            Ok(Value::from(code_point))
-                .chars()
+        } else {
-                .nth(pos as usize)
+            Ok(Value::undefined())
-                .expect("failed to get value"),
+        }
        ))
    }
    /// `String.prototype.charCodeAt( index )`
@ -238,26 +276,25 @@ impl String {
        // Convert `this` to its string value; `?` propagates any error raised by the conversion.
        let primitive_val = this.to_string(context)?;
        // Calling .len() on a string would give the wrong result, as they are bytes not the number of unicode code points
        // Note that this is an O(N) operation (because UTF-8 is complex) while getting the number of bytes is an O(1) operation.
        let length = primitive_val.chars().count();
        let pos = args
            .get(0)
            .cloned()
            .unwrap_or_else(Value::undefined)
            .to_integer(context)? as i32;
-        if pos >= length as i32 || pos < 0 {
+        // Fast path returning NaN when pos is obviously out of range
        if pos < 0 || pos >= primitive_val.len() as i32 {
            return Ok(Value::from(NAN));
        }
-        let utf16_val = primitive_val
+        // Calling .len() on a string would give the wrong result, as they are bytes not the number of unicode code points
-            .encode_utf16()
+        // Note that this is an O(N) operation (because UTF-8 is complex) while getting the number of bytes is an O(1) operation.
            .nth(pos as usize)
            .expect("failed to get utf16 value");
        // If there is no element at that index, the result is NaN
        if let Some(utf16_val) = primitive_val.encode_utf16().nth(pos as usize) {
            Ok(Value::from(f64::from(utf16_val)))
        } else {
            Ok(Value::from(NAN))
        }
    }
    /// `String.prototype.concat( str1[, ...strN] )`
--- a/boa/src/builtins/string/tests.rs
+++ b/boa/src/builtins/string/tests.rs
@ -775,19 +775,65 @@ fn last_index_non_integer_position_argument() {
 #[test]
 fn char_at() {
    // Each entry pairs a script with the display string `forward` is expected to return.
    // Covers negative, in-range, past-the-end, missing and `null` positions, plus a lone
    // surrogate that is rendered as U+FFFD.
    let cases = [
        ("'abc'.charAt(-1)", "\"\""),
        ("'abc'.charAt(1)", "\"b\""),
        ("'abc'.charAt(9)", "\"\""),
        ("'abc'.charAt()", "\"a\""),
        ("'abc'.charAt(null)", "\"a\""),
        ("'\\uDBFF'.charAt(0)", "\"\u{FFFD}\""),
    ];
    let mut context = Context::new();
    for &(script, expected) in cases.iter() {
        assert_eq!(forward(&mut context, script), expected);
    }
 }
 #[test]
 fn char_code_at() {
    // Each entry pairs a script with the display string `forward` is expected to return.
    // Out-of-range positions yield `NaN`; a missing or `null` position behaves like index 0.
    let cases = [
        ("'abc'.charCodeAt(-1)", "NaN"),
        ("'abc'.charCodeAt(1)", "98"),
        ("'abc'.charCodeAt(9)", "NaN"),
        ("'abc'.charCodeAt()", "97"),
        ("'abc'.charCodeAt(null)", "97"),
        ("'\\uFFFF'.charCodeAt(0)", "65535"),
    ];
    let mut context = Context::new();
    for &(script, expected) in cases.iter() {
        assert_eq!(forward(&mut context, script), expected);
    }
 }
 #[test]
 fn code_point_at() {
    // Each entry pairs a script with the display string `forward` is expected to return.
    let cases = [
        // Out-of-range positions yield `undefined`; missing or `null` behaves like index 0.
        ("'abc'.codePointAt(-1)", "undefined"),
        ("'abc'.codePointAt(1)", "98"),
        ("'abc'.codePointAt(9)", "undefined"),
        ("'abc'.codePointAt()", "97"),
        ("'abc'.codePointAt(null)", "97"),
        // Index 0 of a surrogate pair: the combined code point, at the range boundaries.
        ("'\\uD800\\uDC00'.codePointAt(0)", "65536"),
        ("'\\uD800\\uDFFF'.codePointAt(0)", "66559"),
        ("'\\uDBFF\\uDC00'.codePointAt(0)", "1113088"),
        ("'\\uDBFF\\uDFFF'.codePointAt(0)", "1114111"),
        // Index 1 of a surrogate pair: the trailing surrogate's own code unit value.
        ("'\\uD800\\uDC00'.codePointAt(1)", "56320"),
        ("'\\uD800\\uDFFF'.codePointAt(1)", "57343"),
        ("'\\uDBFF\\uDC00'.codePointAt(1)", "56320"),
        ("'\\uDBFF\\uDFFF'.codePointAt(1)", "57343"),
    ];
    let mut context = Context::new();
    for &(script, expected) in cases.iter() {
        assert_eq!(forward(&mut context, script), expected);
    }
 }
 #[test]