Add URI encoding and decoding functions (#2267)

This Pull Request closes #894. It changes the following: - Adds the `encodeURI()`, `decodeURI()`, `encodeURIComponent()` and `decodeURIComponent()` functions - Passes all the tests except for those depending on #1987 or on the comment below. Things to discuss: - I'm unable to find in the spec information regarding the only failing tests, which relate to [this](f1870753fa/test/built-ins/encodeURI/S15.1.3.3_A1.1_T2.js): > If string.charAt(k) in [0xDC00 - 0xDFFF], throw URIError Let me know your thoughts :) Co-authored-by: raskad <32105367+raskad@users.noreply.github.com>
2 years ago · 779384d87f
7 changed files with 779 additions and 24 deletions
--- a/boa_engine/src/builtins/mod.rs
+++ b/boa_engine/src/builtins/mod.rs
@ -32,6 +32,7 @@ pub mod string;
 pub mod symbol;
 pub mod typed_array;
 pub mod undefined;
 pub mod uri;
 #[cfg(feature = "console")]
 pub mod console;
@ -81,7 +82,7 @@ use crate::{
    builtins::{
        array_buffer::ArrayBuffer, async_generator::AsyncGenerator,
        async_generator_function::AsyncGeneratorFunction, generator::Generator,
-        generator_function::GeneratorFunction, typed_array::TypedArray,
+        generator_function::GeneratorFunction, typed_array::TypedArray, uri::Uri,
    },
    property::{Attribute, PropertyDescriptor},
    Context, JsValue,
@ -193,7 +194,8 @@ pub fn init(context: &mut Context) {
        Promise,
        AsyncFunction,
        AsyncGenerator,
-        AsyncGeneratorFunction
+        AsyncGeneratorFunction,
        Uri
    };
    #[cfg(feature = "intl")]
--- a/boa_engine/src/builtins/regexp/mod.rs
+++ b/boa_engine/src/builtins/regexp/mod.rs
@ -1745,7 +1745,8 @@ fn advance_string_index(s: &JsString, index: u64, unicode: bool) -> u64 {
    }
    // 5. Let cp be ! CodePointAt(S, index).
-    let (_, offset, _) = crate::builtins::string::code_point_at(s, index);
+    let cp = crate::builtins::string::code_point_at(s, index);
-    index + u64::from(offset)
+    // 6. Return index + cp.[[CodeUnitCount]].
    index + u64::from(cp.code_unit_count)
 }
--- a/boa_engine/src/builtins/string/mod.rs
+++ b/boa_engine/src/builtins/string/mod.rs
@ -40,29 +40,87 @@ pub(crate) enum Placement {
    End,
 }
-pub(crate) fn code_point_at(string: &JsString, position: u64) -> (u32, u8, bool) {
+/// Code point information for the `CodePointAt` abstract operation.
 #[derive(Debug, Clone, Copy)]
 pub(crate) struct CodePointInfo {
    /// The `[[CodePoint]]` field: the numeric value of the code point that was read. For an
    /// unpaired surrogate this is the value of the surrogate code unit itself.
    pub(crate) code_point: u32,
    /// The `[[CodeUnitCount]]` field: how many UTF-16 code units were consumed (1 or 2).
    pub(crate) code_unit_count: u8,
    /// The `[[IsUnpairedSurrogate]]` field: `true` when the code unit read was a surrogate that
    /// does not form a valid leading/trailing pair.
    pub(crate) is_unpaired_surrogate: bool,
 }
 /// The `CodePointAt ( string, position )` abstract operation.
 ///
 /// The abstract operation `CodePointAt` takes arguments `string` (a String) and `position` (a
 /// non-negative integer) and returns a Record with fields `[[CodePoint]]` (a code point),
 /// `[[CodeUnitCount]]` (a positive integer), and `[[IsUnpairedSurrogate]]` (a Boolean). It
 /// interprets string as a sequence of UTF-16 encoded code points, as described in 6.1.4, and reads
 /// from it a single code point starting with the code unit at index `position`.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#sec-codepointat
 pub(crate) fn code_point_at(string: &JsString, position: u64) -> CodePointInfo {
    let mut encoded = string.encode_utf16();
    // 1. Let size be the length of string.
    let size = encoded.clone().count() as u64;
    // 2. Assert: position ≥ 0 and position < size.
    assert!(position < size);
    // 3. Let first be the code unit at index position within string.
    let first = encoded
        .nth(position as usize)
        .expect("The callers of this function must've already checked bounds.");
    // 4. Let cp be the code point whose numeric value is that of first.
    let cp = u32::from(first);
    // 5. If first is not a leading surrogate or trailing surrogate, then
    if !is_leading_surrogate(first) && !is_trailing_surrogate(first) {
-        return (u32::from(first), 1, false);
+        // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: false }.
        return CodePointInfo {
            code_point: cp,
            code_unit_count: 1,
            is_unpaired_surrogate: false,
        };
    }
    // 6. If first is a trailing surrogate or position + 1 = size, then
    if is_trailing_surrogate(first) || position + 1 == size {
-        return (u32::from(first), 1, true);
+        // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
        return CodePointInfo {
            code_point: cp,
            code_unit_count: 1,
            is_unpaired_surrogate: true,
        };
    }
    // 7. Let second be the code unit at index position + 1 within string.
    let second = encoded
        .next()
        .expect("The callers of this function must've already checked bounds.");
    // 8. If second is not a trailing surrogate, then
    if !is_trailing_surrogate(second) {
-        return (u32::from(first), 1, true);
+        // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
        return CodePointInfo {
            code_point: cp,
            code_unit_count: 1,
            is_unpaired_surrogate: true,
        };
    }
    // 9. Set cp to UTF16SurrogatePairToCodePoint(first, second).
    let cp = (u32::from(first) - 0xD800) * 0x400 + (u32::from(second) - 0xDC00) + 0x10000;
-    (cp, 2, false)
+
    // 10. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 2, [[IsUnpairedSurrogate]]: false }.
    CodePointInfo {
        code_point: cp,
        code_unit_count: 2,
        is_unpaired_surrogate: false,
    }
 }
 /// Helper function to check if a `char` is trimmable.
@ -86,10 +144,22 @@ pub(crate) fn is_trimmable_whitespace(c: char) -> bool {
    )
 }
 /// Returns `true` when `value` is a UTF-16 leading (high) surrogate, i.e. it lies in the
 /// inclusive range `0xD800..=0xDBFF`.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#leading-surrogate
 pub(crate) fn is_leading_surrogate(value: u16) -> bool {
    matches!(value, 0xD800..=0xDBFF)
 }
 /// Returns `true` when `value` is a UTF-16 trailing (low) surrogate, i.e. it lies in the
 /// inclusive range `0xDC00..=0xDFFF`.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#trailing-surrogate
 pub(crate) fn is_trailing_surrogate(value: u16) -> bool {
    matches!(value, 0xDC00..=0xDFFF)
 }
@ -369,7 +439,7 @@ impl String {
        }
    }
-    /// `String.fromCharCode(...codePoints)`
+    /// `String.fromCharCode(...codeUnits)`
    ///
    /// Construct a `String` from one or more UTF-16 code units (as numbers).
    /// More information:
@ -381,21 +451,22 @@ impl String {
        args: &[JsValue],
        context: &mut Context,
    ) -> JsResult<JsValue> {
-        // 1. Let length be the number of elements in codeUnits.
+        // 1. Let result be the empty String.
-        // 2. Let elements be a new empty List.
+        let mut result = Vec::new();
-        let mut elements = Vec::new();
+
-        // 3. For each element next of codeUnits, do
+        // 2. For each element next of codeUnits, do
        for next in args {
-            // 3a. Let nextCU be ℝ(? ToUint16(next)).
+            // a. Let nextCU be the code unit whose numeric value is ℝ(? ToUint16(next)).
-            // 3b. Append nextCU to the end of elements.
+            let next_cu = next.to_uint16(context)?;
            elements.push(next.to_uint16(context)?);
        }
-        // 4. Return the String value whose code units are the elements in the List elements.
+            // b. Set result to the string-concatenation of result and nextCU.
-        //    If codeUnits is empty, the empty String is returned.
+            result.push(next_cu);
        }
-        let s = std::string::String::from_utf16_lossy(elements.as_slice());
+        // 3. Return result.
-        Ok(JsValue::String(JsString::new(s)))
+        Ok(JsValue::String(JsString::new(
            std::string::String::from_utf16_lossy(&result),
        )))
    }
    /// `String.prototype.toString ( )`
@ -544,7 +615,7 @@ impl String {
            IntegerOrInfinity::Integer(position) if (0..size).contains(&position) => {
                // 6. Let cp be ! CodePointAt(S, position).
                // 7. Return 𝔽(cp.[[CodePoint]]).
-                Ok(code_point_at(&string, position as u64).0.into())
+                Ok(code_point_at(&string, position as u64).code_point.into())
            }
            // 5. If position < 0 or position ≥ size, return undefined.
            _ => Ok(JsValue::undefined()),
--- a/boa_engine/src/builtins/string/string_iterator.rs
+++ b/boa_engine/src/builtins/string/string_iterator.rs
@ -10,6 +10,8 @@ use crate::{
 use boa_gc::{Finalize, Trace};
 use boa_profiler::Profiler;
 use super::CodePointInfo;
 #[derive(Debug, Clone, Finalize, Trace)]
 pub struct StringIterator {
    string: JsValue,
@ -61,7 +63,11 @@ impl StringIterator {
                context,
            ));
        }
-        let (_, code_unit_count, _) = code_point_at(&native_string, position as u64);
+        let CodePointInfo {
            code_point: _,
            code_unit_count,
            is_unpaired_surrogate: _,
        } = code_point_at(&native_string, position as u64);
        string_iterator.next_index += i32::from(code_unit_count);
        let result_string = crate::builtins::string::String::substring(
            &string_iterator.string,
--- a/boa_engine/src/builtins/string/tests.rs
+++ b/boa_engine/src/builtins/string/tests.rs
@ -1,3 +1,4 @@
 use super::{is_leading_surrogate, is_trailing_surrogate};
 use crate::{forward, forward_val, Context};
 #[test]
@ -1150,3 +1151,17 @@ fn search() {
    assert_eq!(forward(&mut context, "'aa'.search(/a/g)"), "0");
    assert_eq!(forward(&mut context, "'ba'.search(/a/)"), "1");
 }
 #[test]
 fn ut_is_leading_surrogate() {
    // Every code unit in the leading-surrogate range must be recognised as such.
    (0xD800_u16..=0xDBFF).for_each(|cp| assert!(is_leading_surrogate(cp), "failed: {cp:X}"));
 }
 #[test]
 fn ut_is_trailing_surrogate() {
    // Every code unit in the trailing-surrogate range must be recognised as such.
    (0xDC00_u16..=0xDFFF).for_each(|cp| assert!(is_trailing_surrogate(cp), "failed: {cp:X}"));
 }
--- a/boa_engine/src/builtins/uri/consts.rs
+++ b/boa_engine/src/builtins/uri/consts.rs
@ -0,0 +1,110 @@
 //! URI handling function constants
 //!
 //! This module contains a few constants used to handle decoding and encoding for URI handling
 //! functions. They make it easier and more performant to compare different ranges and code points.
 use std::ops::RangeInclusive;
 /// A range containing all the lowercase `uriAlpha` characters (`a`–`z`), as UTF-16 code units.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#prod-uriAlpha
 const URI_ALPHA_LOWER: RangeInclusive<u16> = b'a' as u16..=b'z' as u16;
 /// A range containing all the uppercase `uriAlpha` characters (`A`–`Z`), as UTF-16 code units.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#prod-uriAlpha
 const URI_ALPHA_UPPER: RangeInclusive<u16> = b'A' as u16..=b'Z' as u16;
 /// A range containing all the `DecimalDigit` characters (`0`–`9`), as UTF-16 code units.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#prod-DecimalDigit
 const DECIMAL_DIGIT: RangeInclusive<u16> = b'0' as u16..=b'9' as u16;
 /// An array containing all the `uriMark` characters, as UTF-16 code units.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#prod-uriMark
 const URI_MARK: [u16; 9] = [
    b'-' as u16,
    b'_' as u16,
    b'.' as u16,
    b'!' as u16,
    b'~' as u16,
    b'*' as u16,
    b'\'' as u16,
    b'(' as u16,
    b')' as u16,
 ];
 /// An array containing all the `uriReserved` characters, as UTF-16 code units.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#prod-uriReserved
 const URI_RESERVED: [u16; 10] = [
    b';' as u16,
    b'/' as u16,
    b'?' as u16,
    b':' as u16,
    b'@' as u16,
    b'&' as u16,
    b'=' as u16,
    b'+' as u16,
    b'$' as u16,
    b',' as u16,
 ];
 /// The number sign (`#`) symbol as a UTF-16 code unit.
 const NUMBER_SIGN: u16 = b'#' as u16;
 /// Checks whether a code unit belongs to the `uriUnescaped` set.
 ///
 /// The set is the union of `uriAlpha` (both cases), `DecimalDigit` and `uriMark`.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#prod-uriUnescaped
 #[inline]
 pub(super) fn is_uri_unescaped(code_point: u16) -> bool {
    // Alphanumerics are described by contiguous ranges; the marks are a small explicit list.
    let is_alphanumeric = [URI_ALPHA_LOWER, URI_ALPHA_UPPER, DECIMAL_DIGIT]
        .iter()
        .any(|range| range.contains(&code_point));
    is_alphanumeric || URI_MARK.iter().any(|&mark| mark == code_point)
 }
 /// Checks whether a code unit is a `uriReserved` character or the number sign symbol (`#`).
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#prod-uriReserved
 #[inline]
 pub(super) fn is_uri_reserved_or_number_sign(code_point: u16) -> bool {
    if code_point == NUMBER_SIGN {
        return true;
    }
    URI_RESERVED.iter().any(|&reserved| reserved == code_point)
 }
 /// Constant with all the reserved and unescaped URI characters, plus the number sign symbol (`#`).
 ///
 /// More information:
 ///  - [`uriReserved` in ECMAScript spec][uri_reserved]
 ///  - [`uriUnescaped` in ECMAScript spec][uri_unescaped]
 ///
 /// [uri_reserved]: https://tc39.es/ecma262/#prod-uriReserved
 /// [uri_unescaped]: https://tc39.es/ecma262/#prod-uriUnescaped
 #[inline]
 pub(super) fn is_uri_reserved_or_uri_unescaped_or_number_sign(code_point: u16) -> bool {
    code_point == NUMBER_SIGN || is_uri_unescaped(code_point) || URI_RESERVED.contains(&code_point)
 }
--- a/boa_engine/src/builtins/uri/mod.rs
+++ b/boa_engine/src/builtins/uri/mod.rs
@ -0,0 +1,550 @@
 //! URI Handling Functions
 //!
 //! Uniform Resource Identifiers, or URIs, are Strings that identify resources (e.g. web pages or
 //! files) and transport protocols by which to access them (e.g. HTTP or FTP) on the Internet. The
 //! ECMAScript language itself does not provide any support for using URIs except for functions
 //! that encode and decode URIs as described in 19.2.6.2, 19.2.6.3, 19.2.6.4 and 19.2.6.5
 //!
 //! More information:
 //!  - [ECMAScript reference][spec]
 //!
 //! [spec]: https://tc39.es/ecma262/#sec-uri-handling-functions
 mod consts;
 use self::consts::{
    is_uri_reserved_or_number_sign, is_uri_reserved_or_uri_unescaped_or_number_sign,
    is_uri_unescaped,
 };
 use super::{string::code_point_at, BuiltIn};
 use crate::{
    builtins::JsArgs, object::FunctionBuilder, property::Attribute, Context, JsResult, JsString,
    JsValue,
 };
 /// URI Handling Functions
 ///
 /// Field-less marker type: its [`BuiltIn`] implementation registers the global `decodeURI`,
 /// `decodeURIComponent`, `encodeURI` and `encodeURIComponent` functions.
 #[derive(Debug, Clone, Copy)]
 pub(crate) struct Uri;
 impl BuiltIn for Uri {
    const NAME: &'static str = "Uri";
    /// Builds the four URI handling functions and registers each one as a global property.
    ///
    /// Always returns `None`: the functions are attached to the global object directly.
    fn init(context: &mut Context) -> Option<JsValue> {
        // Every URI function is registered with the same property attributes.
        let attribute = Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE;
        // `decodeURI`
        let function = FunctionBuilder::native(context, Self::decode_uri)
            .name("decodeURI")
            .length(1)
            .constructor(false)
            .build();
        context.register_global_property("decodeURI", function, attribute);
        // `decodeURIComponent`
        let function = FunctionBuilder::native(context, Self::decode_uri_component)
            .name("decodeURIComponent")
            .length(1)
            .constructor(false)
            .build();
        context.register_global_property("decodeURIComponent", function, attribute);
        // `encodeURI`
        let function = FunctionBuilder::native(context, Self::encode_uri)
            .name("encodeURI")
            .length(1)
            .constructor(false)
            .build();
        context.register_global_property("encodeURI", function, attribute);
        // `encodeURIComponent`
        let function = FunctionBuilder::native(context, Self::encode_uri_component)
            .name("encodeURIComponent")
            .length(1)
            .constructor(false)
            .build();
        context.register_global_property("encodeURIComponent", function, attribute);
        None
    }
 }
 impl Uri {
    /// Builtin JavaScript `decodeURI ( encodedURI )` function.
    ///
    /// Converts the first argument to a string and decodes its percent-escapes. Escapes that
    /// decode to a `uriReserved` character or to `#` are left in their escaped form.
    ///
    /// More information:
    ///  - [ECMAScript reference][spec]
    ///  - [MDN documentation][mdn]
    ///
    /// [spec]: https://tc39.es/ecma262/#sec-decodeuri-encodeduri
    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURI
    pub(crate) fn decode_uri(
        _: &JsValue,
        args: &[JsValue],
        context: &mut Context,
    ) -> JsResult<JsValue> {
        // 1. Let uriString be ? ToString(encodedURI).
        let input = args.get_or_undefined(0).to_string(context)?;
        // 2. Let reservedURISet be a String containing one instance of each code unit valid in uriReserved plus "#".
        // 3. Return ? Decode(uriString, reservedURISet).
        decode(context, &input, is_uri_reserved_or_number_sign).map(JsValue::from)
    }
    /// Builtin JavaScript `decodeURIComponent ( encodedURIComponent )` function.
    ///
    /// Converts the first argument to a string and decodes every percent-escape in it; no
    /// character is treated as reserved.
    ///
    /// More information:
    ///  - [ECMAScript reference][spec]
    ///  - [MDN documentation][mdn]
    ///
    /// [spec]: https://tc39.es/ecma262/#sec-decodeuricomponent-encodeduricomponent
    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent
    pub(crate) fn decode_uri_component(
        _: &JsValue,
        args: &[JsValue],
        context: &mut Context,
    ) -> JsResult<JsValue> {
        // 1. Let componentString be ? ToString(encodedURIComponent).
        let input = args.get_or_undefined(0).to_string(context)?;
        // 2. Let reservedURIComponentSet be the empty String.
        // 3. Return ? Decode(componentString, reservedURIComponentSet).
        decode(context, &input, |_: u16| false).map(JsValue::from)
    }
    /// Builtin JavaScript `encodeURI ( uri )` function.
    ///
    /// Converts the first argument to a string and percent-encodes the UTF-8 octets of every
    /// character that is not in `uriReserved`, not in `uriUnescaped`, and not `#`.
    ///
    /// More information:
    ///  - [ECMAScript reference][spec]
    ///  - [MDN documentation][mdn]
    ///
    /// [spec]: https://tc39.es/ecma262/#sec-encodeuri-uri
    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
    pub(crate) fn encode_uri(
        _: &JsValue,
        args: &[JsValue],
        context: &mut Context,
    ) -> JsResult<JsValue> {
        // 1. Let uriString be ? ToString(uri).
        let input = args.get_or_undefined(0).to_string(context)?;
        // 2. Let unescapedURISet be a String containing one instance of each code unit valid in uriReserved and uriUnescaped plus "#".
        // 3. Return ? Encode(uriString, unescapedURISet).
        encode(
            context,
            &input,
            is_uri_reserved_or_uri_unescaped_or_number_sign,
        )
        .map(JsValue::from)
    }
    /// Builtin JavaScript `encodeURIComponent ( uriComponent )` function.
    ///
    /// Converts the first argument to a string and percent-encodes the UTF-8 octets of every
    /// character that is not in `uriUnescaped`.
    ///
    /// More information:
    ///  - [ECMAScript reference][spec]
    ///  - [MDN documentation][mdn]
    ///
    /// [spec]: https://tc39.es/ecma262/#sec-encodeuricomponent-uricomponent
    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent
    pub(crate) fn encode_uri_component(
        _: &JsValue,
        args: &[JsValue],
        context: &mut Context,
    ) -> JsResult<JsValue> {
        // 1. Let componentString be ? ToString(uriComponent).
        let input = args.get_or_undefined(0).to_string(context)?;
        // 2. Let unescapedURIComponentSet be a String containing one instance of each code unit valid in uriUnescaped.
        // 3. Return ? Encode(componentString, unescapedURIComponentSet).
        encode(context, &input, is_uri_unescaped).map(JsValue::from)
    }
 }
 /// The `Encode ( string, unescapedSet )` abstract operation
 ///
 /// The abstract operation Encode takes arguments `string` (a String) and `unescapedSet` (a String)
 /// and returns either a normal completion containing a String or a throw completion. It performs
 /// URI encoding and escaping.
 ///
 /// `unescaped_set` is a predicate standing in for the spec's set: it returns `true` for code
 /// units that must be copied to the output unchanged. Every other code point is replaced by the
 /// `%XX` escapes (uppercase hexadecimal) of its UTF-8 octets.
 ///
 /// # Errors
 ///
 /// Throws a `URIError` if a code unit that has to be escaped is an unpaired surrogate.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#sec-encode
 fn encode<F>(context: &mut Context, string: &JsString, unescaped_set: F) -> JsResult<String>
 where
    F: Fn(u16) -> bool,
 {
    // Uppercase hexadecimal digits, indexed by nibble value, used to format escaped octets.
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";
    let code_units = string.encode_utf16().collect::<Vec<_>>();
    // 1. Let strLen be the length of string.
    let str_len = code_units.len();
    // 2. Let R be the empty String.
    // The output is at least as long as the input, so reserve that much up front.
    let mut r = String::with_capacity(str_len);
    // 3. Let k be 0.
    let mut k = 0;
    // 4. Repeat,
    loop {
        // a. If k = strLen, return R.
        if k == str_len {
            return Ok(r);
        }
        // b. Let C be the code unit at index k within string.
        let c = code_units[k];
        // c. If C is in unescapedSet, then
        if unescaped_set(c) {
            // i. Set k to k + 1.
            k += 1;
            // ii. Set R to the string-concatenation of R and C.
            r.push(char::from_u32(u32::from(c)).expect("char from code point cannot fail here"));
        } else {
            // d. Else,
            // i. Let cp be CodePointAt(string, k).
            let cp = code_point_at(string, k as u64);
            // ii. If cp.[[IsUnpairedSurrogate]] is true, throw a URIError exception.
            if cp.is_unpaired_surrogate {
                context.throw_uri_error("trying to encode an invalid string")?;
            }
            // iii. Set k to k + cp.[[CodeUnitCount]].
            k += usize::from(cp.code_unit_count);
            // iv. Let Octets be the List of octets resulting by applying the UTF-8 transformation
            //     to cp.[[CodePoint]].
            let mut buff = [0_u8; 4]; // Will never be more than 4 bytes
            let octets = char::from_u32(cp.code_point)
                .expect("valid unicode code point to char conversion failed")
                .encode_utf8(&mut buff);
            // v. For each element octet of Octets, do
            for octet in octets.bytes() {
                // 1. Set R to the string-concatenation of:
                //    R
                //    "%"
                //    the String representation of octet, formatted as a two-digit uppercase
                //    hexadecimal number, padded to the left with a zero if necessary
                //
                // Appending in place keeps the loop linear; rebuilding `R` with `format!` on every
                // octet would copy the whole output each time.
                r.push('%');
                r.push(char::from(HEX_DIGITS[usize::from(octet >> 4)]));
                r.push(char::from(HEX_DIGITS[usize::from(octet & 0x0F)]));
            }
        }
    }
 }
 /// The `Decode ( string, reservedSet )` abstract operation.
 ///
 /// The abstract operation Decode takes arguments `string` (a String) and `reservedSet` (a String)
 /// and returns either a normal completion containing a String or a throw completion. It performs
 /// URI unescaping and decoding.
 ///
 /// `reserved_set` is a predicate standing in for the spec's set: when a single-octet escape
 /// decodes to a code unit for which it returns `true`, the original `%XX` text is kept instead of
 /// the decoded character.
 ///
 /// # Errors
 ///
 /// Throws a `URIError` when an escape sequence is truncated, contains non-hexadecimal digits,
 /// has an invalid leading octet, or does not form a valid UTF-8 encoding.
 ///
 /// More information:
 ///  - [ECMAScript reference][spec]
 ///
 /// [spec]: https://tc39.es/ecma262/#sec-decode
 #[allow(clippy::many_single_char_names)]
 fn decode<F>(context: &mut Context, string: &JsString, reserved_set: F) -> JsResult<String>
 where
    F: Fn(u16) -> bool,
 {
    let code_units = string.encode_utf16().collect::<Vec<_>>();
    // 1. Let strLen be the length of string.
    let str_len = code_units.len();
    // 2. Let R be the empty String.
    let mut r = Vec::new();
    // 3. Let k be 0.
    let mut k = 0;
    // 4. Repeat,
    loop {
        // a. If k = strLen, return R.
        if k == str_len {
            return Ok(String::from_utf16(&r).expect("invalid UTF-16 characters found"));
        }
        // b. Let C be the code unit at index k within string.
        let c = code_units[k];
        // c. If C is not the code unit 0x0025 (PERCENT SIGN), then
        #[allow(clippy::if_not_else)]
        let s = if c != 0x0025_u16 {
            // i. Let S be the String value containing only the code unit C.
            Vec::from([c])
        } else {
            // d. Else,
            // i. Let start be k.
            let start = k;
            // ii. If k + 2 ≥ strLen, throw a URIError exception.
            if k + 2 >= str_len {
                context.throw_uri_error("invalid escape character found")?;
            }
            // iii. If the code units at index (k + 1) and (k + 2) within string do not represent
            // hexadecimal digits, throw a URIError exception.
            // iv. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2).
            let b = decode_hex_byte(code_units[k + 1], code_units[k + 2])
                .ok_or_else(|| context.construct_uri_error("invalid hexadecimal digit found"))?;
            // v. Set k to k + 2.
            k += 2;
            // vi. Let n be the number of leading 1 bits in B.
            let n = leading_one_bits(b);
            // vii. If n = 0, then
            if n == 0 {
                // 1. Let C be the code unit whose value is B.
                let c = u16::from(b);
                // 2. If C is not in reservedSet, then
                if !reserved_set(c) {
                    // a. Let S be the String value containing only the code unit C.
                    Vec::from([c])
                } else {
                    // 3. Else,
                    // a. Let S be the substring of string from start to k + 1.
                    Vec::from(&code_units[start..=k])
                }
            } else {
                // viii. Else,
                // 1. If n = 1 or n > 4, throw a URIError exception.
                if n == 1 || n > 4 {
                    context.throw_uri_error("invalid escaped character found")?;
                }
                // 2. If k + (3 × (n - 1)) ≥ strLen, throw a URIError exception.
                //
                // The comparison must be `>=`: the last continuation escape reads the code unit
                // at index `k + 3 × (n - 1)`, so that index has to be strictly below `strLen`.
                // Otherwise a truncated sequence such as "%C3%A" would index out of bounds.
                if k + (3 * (n - 1)) >= str_len {
                    context.throw_uri_error("non-terminated escape character found")?;
                }
                // 3. Let Octets be « B ».
                let mut octets = Vec::from([b]);
                // 4. Let j be 1.
                // 5. Repeat, while j < n,
                for _j in 1..n {
                    // a. Set k to k + 1.
                    k += 1;
                    // b. If the code unit at index k within string is not the code unit 0x0025 (PERCENT SIGN), throw a URIError exception.
                    if code_units[k] != 0x0025 {
                        context
                            .throw_uri_error("escape characters must be preceded with a % sign")?;
                    }
                    // c. If the code units at index (k + 1) and (k + 2) within string do not represent hexadecimal digits, throw a URIError exception.
                    // d. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2).
                    let b =
                        decode_hex_byte(code_units[k + 1], code_units[k + 2]).ok_or_else(|| {
                            context.construct_uri_error("invalid hexadecimal digit found")
                        })?;
                    // e. Set k to k + 2.
                    k += 2;
                    // f. Append B to Octets.
                    octets.push(b);
                    // g. Set j to j + 1.
                }
                // 6. Assert: The length of Octets is n.
                assert_eq!(octets.len(), n);
                // 7. If Octets does not contain a valid UTF-8 encoding of a Unicode code point, throw a URIError exception.
                match String::from_utf8(octets) {
                    Err(_) => {
                        return Err(context.construct_uri_error("invalid UTF-8 encoding found"))
                    }
                    Ok(v) => {
                        // 8. Let V be the code point obtained by applying the UTF-8 transformation to Octets, that is, from a List of octets into a 21-bit value.
                        // 9. Let S be UTF16EncodeCodePoint(V).
                        v.encode_utf16().collect::<Vec<_>>()
                    }
                }
            }
        };
        // e. Set R to the string-concatenation of R and S.
        r.extend_from_slice(&s);
        // f. Set k to k + 1.
        k += 1;
    }
 }
 /// Decodes a byte from two unicode code units.
 ///
 /// `high` and `low` are the code units of the most and least significant hexadecimal digit
 /// (either letter case is accepted). Returns `None` if either one is not a hexadecimal digit.
 fn decode_hex_byte(high: u16, low: u16) -> Option<u8> {
    // Maps one code unit to its 4-bit value, or `None` when it is not a hexadecimal digit.
    let nibble = |unit: u16| -> Option<u8> {
        let digit = char::from_u32(u32::from(unit))?.to_digit(16)?;
        u8::try_from(digit).ok()
    };
    let upper = nibble(high)?;
    let lower = nibble(low)?;
    Some((upper << 4) + lower)
 }
 /// Counts the number of leading 1 bits in a given byte.
 ///
 /// Returns a value between 0 (most significant bit clear) and 8 (`0xFF`) inclusive.
 #[inline]
 fn leading_one_bits(byte: u8) -> usize {
    // `u8::leading_ones` never exceeds 8, so widening the result to `usize` is lossless.
    byte.leading_ones() as usize
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Checks if the `leading_one_bits()` function works as expected.
    #[test]
    fn ut_leading_one_bits() {
        // (input byte, expected number of leading 1 bits)
        let cases: [(u8, usize); 16] = [
            (0b1111_1111, 8),
            (0b1111_1110, 7),
            (0b1111_1100, 6),
            (0b1111_1101, 6),
            (0b1111_1011, 5),
            (0b1111_1000, 5),
            (0b1111_0000, 4),
            (0b1111_0111, 4),
            (0b1110_0000, 3),
            (0b1110_1111, 3),
            (0b1100_0000, 2),
            (0b1101_1111, 2),
            (0b1000_0000, 1),
            (0b1011_1111, 1),
            (0b0000_0000, 0),
            (0b0111_1111, 0),
        ];
        for (byte, expected) in cases {
            assert_eq!(leading_one_bits(byte), expected);
        }
    }
    /// Checks that the `decode_hex_byte()` function works as expected.
    #[test]
    fn ut_decode_byte() {
        // Sunny day tests: (high digit, low digit, expected byte)
        let valid: [(u8, u8, u8); 6] = [
            (b'2', b'0', 0x20),
            (b'2', b'A', 0x2A),
            (b'3', b'C', 0x3C),
            (b'4', b'0', 0x40),
            (b'7', b'E', 0x7E),
            (b'0', b'0', 0x00),
        ];
        for (high, low, expected) in valid {
            assert_eq!(
                decode_hex_byte(u16::from(high), u16::from(low)).unwrap(),
                expected
            );
        }
        // Rainy day tests: at least one code unit is not a hexadecimal digit
        let invalid: [(u16, u16); 6] = [
            (u16::from(b'-'), u16::from(b'0')),
            (u16::from(b'f'), u16::from(b'~')),
            (u16::from(b'A'), 0_u16),
            (u16::from(b'%'), u16::from(b'&')),
            (0xFACD_u16, u16::from(b'-')),
            (u16::from(b'-'), 0xA0FD_u16),
        ];
        for (high, low) in invalid {
            assert!(decode_hex_byte(high, low).is_none());
        }
    }
 }