Implement `String.prototype.toLocaleUpper/LowerCase` (#2822)

This fixes some more ES5 tests that were failing because the functions had not been implemented. It changes the following:

- Adds `String::to_locale_case`, which uses ICU4X to convert strings to uppercase or lowercase.
- Refactors `String::to_uppercase` and `String::to_lowercase` into a single `String::to_case`, which uses a const generic to distinguish between the two cases.
- Adds utility functions on `JsString` to avoid code repetition.
2 years ago · a12f10e335
6 changed files with 166 additions and 91 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -406,6 +406,7 @@ dependencies = [
 "fast-float",
 "float-cmp",
 "icu_calendar",
 "icu_casemapping",
 "icu_collator",
 "icu_datetime",
 "icu_list",
--- a/boa_engine/Cargo.toml
+++ b/boa_engine/Cargo.toml
@ -23,6 +23,7 @@ intl = [
    "dep:icu_provider",
    "dep:icu_calendar",
    "dep:icu_collator",
    "dep:icu_casemapping",
    "dep:icu_list",
    "dep:writeable",
    "dep:sys-locale",
@ -83,6 +84,7 @@ icu_collator = { version = "1.1.0", features = ["serde"], optional = true }
 icu_plurals = { version = "1.1.0", features = ["serde"], optional = true }
 icu_provider = { version = "1.1.0", optional = true }
 icu_list = { version = "1.1.0", features = ["serde"], optional = true }
 icu_casemapping = { version = "0.7.1", features = ["serde"], optional = true}
 writeable = { version = "0.5.2", optional = true }
 sys-locale = { version = "0.3.0", optional = true }
--- a/boa_engine/src/builtins/string/mod.rs
+++ b/boa_engine/src/builtins/string/mod.rs
@ -122,8 +122,10 @@ impl IntrinsicObject for String {
            .method(Self::pad_end, "padEnd", 1)
            .method(Self::pad_start, "padStart", 1)
            .method(Self::trim, "trim", 0)
-            .method(Self::to_lowercase, "toLowerCase", 0)
+            .method(Self::to_case::<false>, "toLowerCase", 0)
-            .method(Self::to_uppercase, "toUpperCase", 0)
+            .method(Self::to_case::<true>, "toUpperCase", 0)
            .method(Self::to_locale_case::<false>, "toLocaleLowerCase", 0)
            .method(Self::to_locale_case::<true>, "toLocaleUpperCase", 0)
            .method(Self::substring, "substring", 2)
            .method(Self::split, "split", 2)
            .method(Self::value_of, "valueOf", 0)
@ -1644,18 +1646,19 @@ impl String {
        Ok(js_string!(string.trim_end()).into())
    }
-    /// `String.prototype.toLowerCase()`
+    /// [`String.prototype.toUpperCase()`][upper] and [`String.prototype.toLowerCase()`][lower]
    ///
-    /// The `toLowerCase()` method returns the calling string value converted to lower case.
+    /// The case methods return the calling string value converted to uppercase or lowercase.
    ///
    /// The value will be **converted** to a string if it isn't one.
    ///
    /// More information:
    ///  - [ECMAScript reference][spec]
    ///  - [MDN documentation][mdn]
    ///
-    /// [spec]: https://tc39.es/ecma262/#sec-string.prototype.tolowercase
+    /// [upper]: https://tc39.es/ecma262/#sec-string.prototype.toUppercase
-    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toLowerCase
+    /// [lower]: https://tc39.es/ecma262/#sec-string.prototype.toLowercase
-    #[allow(clippy::wrong_self_convention)]
+    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toUpperCase
-    pub(crate) fn to_lowercase(
+    pub(crate) fn to_case<const UPPER: bool>(
        this: &JsValue,
        _: &[JsValue],
        context: &mut Context<'_>,
@ -1666,101 +1669,95 @@ impl String {
        // 2. Let S be ? ToString(O).
        let string = this.to_string(context)?;
        let mut code_points = string.code_points();
        let mut lower_text = Vec::with_capacity(string.len());
        let mut next_unpaired_surrogate = None;
        // 3. Let sText be ! StringToCodePoints(S).
-        // 4. Let lowerText be the result of toLowercase(sText), according to
+        // 4. Let upperText be the result of toUppercase(sText), according to
        // the Unicode Default Case Conversion algorithm.
-        loop {
+        let text = string.map_valid_segments(|s| {
-            let only_chars = code_points
+            if UPPER {
-                .by_ref()
+                s.to_uppercase()
                .map_while(|cpoint| match cpoint {
                    CodePoint::Unicode(c) => Some(c),
                    CodePoint::UnpairedSurrogate(s) => {
                        next_unpaired_surrogate = Some(s);
                        None
                    }
                })
                .collect::<std::string::String>()
                .to_lowercase();
            lower_text.extend(only_chars.encode_utf16());
            if let Some(surr) = next_unpaired_surrogate.take() {
                lower_text.push(surr);
            } else {
-                break;
+                s.to_lowercase()
            }
-        }
+        });
-        // 5. Let L be ! CodePointsToString(lowerText).
+        // 5. Let L be ! CodePointsToString(upperText).
        // 6. Return L.
-        Ok(js_string!(lower_text).into())
+        Ok(js_string!(text).into())
    }
-    /// `String.prototype.toUpperCase()`
+    /// [`String.prototype.toLocaleLowerCase ( [ locales ] )`][lower] and
-    ///
+    /// [`String.prototype.toLocaleUpperCase ( [ locales ] )`][upper]
    /// The `toUpperCase()` method returns the calling string value converted to uppercase.
    ///
    /// The value will be **converted** to a string if it isn't one
    ///
    /// More information:
    ///  - [ECMAScript reference][spec]
    ///  - [MDN documentation][mdn]
    ///
-    /// [spec]: https://tc39.es/ecma262/#sec-string.prototype.toUppercase
+    /// [lower]: https://tc39.es/ecma402/#sup-string.prototype.tolocalelowercase
-    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toUpperCase
+    /// [upper]: https://tc39.es/ecma402/#sup-string.prototype.tolocaleuppercase
-    #[allow(clippy::wrong_self_convention)]
+    pub(crate) fn to_locale_case<const UPPER: bool>(
    pub(crate) fn to_uppercase(
        this: &JsValue,
-        _: &[JsValue],
+        args: &[JsValue],
        context: &mut Context<'_>,
    ) -> JsResult<JsValue> {
-        // This function behaves in exactly the same way as `String.prototype.toLowerCase`, except that the String is
+        #[cfg(feature = "intl")]
-        // mapped using the toUppercase algorithm of the Unicode Default Case Conversion.
+        {
-
+            use super::intl::locale::{
-        // Comments below are an adaptation of the `String.prototype.toLowerCase` documentation.
+                best_available_locale, canonicalize_locale_list, default_locale,
-
+            };
-        // 1. Let O be ? RequireObjectCoercible(this value).
+            use icu_casemapping::{provider::CaseMappingV1Marker, CaseMapping};
-        let this = this.require_object_coercible()?;
+            use icu_locid::LanguageIdentifier;
-
+
-        // 2. Let S be ? ToString(O).
+            // 1. Let O be ? RequireObjectCoercible(this value).
-        let string = this.to_string(context)?;
+            let this = this.require_object_coercible()?;
-
+
-        let mut code_points = string.code_points();
+            // 2. Let S be ? ToString(O).
-        let mut upper_text = Vec::with_capacity(string.len());
+            let string = this.to_string(context)?;
-        let mut next_unpaired_surrogate = None;
+
-
+            // 3. Return ? TransformCase(S, locales, lower).
-        // 3. Let sText be ! StringToCodePoints(S).
+
-        // 4. Let upperText be the result of toUppercase(sText), according to
+            //  TransformCase ( S, locales, targetCase )
-        // the Unicode Default Case Conversion algorithm.
+            // https://tc39.es/ecma402/#sec-transform-case
-        loop {
+
-            let only_chars = code_points
+            // 1. Let requestedLocales be ? CanonicalizeLocaleList(locales).
-                .by_ref()
+            // 2. If requestedLocales is not an empty List, then
-                .map_while(|cpoint| match cpoint {
+            //     a. Let requestedLocale be requestedLocales[0].
-                    CodePoint::Unicode(c) => Some(c),
+            let lang = canonicalize_locale_list(args.get_or_undefined(0), context)?
-                    CodePoint::UnpairedSurrogate(s) => {
+                .into_iter()
-                        next_unpaired_surrogate = Some(s);
+                .next()
-                        None
+                // 3. Else,
-                    }
+                //     a. Let requestedLocale be ! DefaultLocale().
-                })
+                .unwrap_or_else(|| default_locale(context.icu().locale_canonicalizer()))
-                .collect::<std::string::String>()
+                .id;
-                .to_uppercase();
+            // 4. Let noExtensionsLocale be the String value that is requestedLocale with any Unicode locale extension sequences (6.2.1) removed.
-
+            // 5. Let availableLocales be a List with language tags that includes the languages for which the Unicode Character Database contains language sensitive case mappings. Implementations may add additional language tags if they support case mapping for additional locales.
-            upper_text.extend(only_chars.encode_utf16());
+            // 6. Let locale be ! BestAvailableLocale(availableLocales, noExtensionsLocale).
            // 7. If locale is undefined, set locale to "und".
            let lang =
                best_available_locale::<CaseMappingV1Marker>(lang, &context.icu().provider())
                    .unwrap_or(LanguageIdentifier::UND);
            let casemapper =
                CaseMapping::try_new_with_locale(&context.icu().provider(), &lang.into())
                    .map_err(|err| JsNativeError::typ().with_message(err.to_string()))?;
            // 8. Let codePoints be StringToCodePoints(S).
            let result = string.map_valid_segments(|segment| {
                if UPPER {
                    // 10. Else,
                    //     a. Assert: targetCase is upper.
                    //     b. Let newCodePoints be a List whose elements are the result of an uppercase transformation of codePoints according to an implementation-derived algorithm using locale or the Unicode Default Case Conversion algorithm.
                    casemapper.to_full_uppercase(&segment)
                } else {
                    // 9. If targetCase is lower, then
                    //     a. Let newCodePoints be a List whose elements are the result of a lowercase transformation of codePoints according to an implementation-derived algorithm using locale or the Unicode Default Case Conversion algorithm.
                    casemapper.to_full_lowercase(&segment)
                }
            });
-            if let Some(surr) = next_unpaired_surrogate.take() {
+            // 11. Return CodePointsToString(newCodePoints).
-                upper_text.push(surr);
+            Ok(result.into())
            } else {
                break;
            }
        }
-        // 5. Let L be ! CodePointsToString(upperText).
+        #[cfg(not(feature = "intl"))]
-        // 6. Return L.
+        {
-        Ok(js_string!(upper_text).into())
+            Self::to_case::<UPPER>(this, args, context)
        }
    }
    /// `String.prototype.substring( indexStart[, indexEnd] )`
--- a/boa_engine/src/string/mod.rs
+++ b/boa_engine/src/string/mod.rs
@ -37,6 +37,7 @@ use std::{
    cell::Cell,
    convert::Infallible,
    hash::{Hash, Hasher},
    iter::Peekable,
    ops::{Deref, Index},
    process::abort,
    ptr::{self, addr_of, addr_of_mut, NonNull},
@ -285,6 +286,74 @@ impl JsString {
        String::from_utf16(self)
    }
    /// Splits a [`JsString`] into an iterator of pieces: every maximal run of valid Unicode
    /// code points is yielded as `Ok(String)`, and every unpaired surrogate is yielded on its
    /// own as `Err(u16)`.
    pub fn to_std_string_with_surrogates(&self) -> impl Iterator<Item = Result<String, u16>> + '_ {
        /// Adapter that groups consecutive valid code points into owned strings.
        struct SegmentDecoder<I: Iterator> {
            source: Peekable<I>,
        }

        impl<I> Iterator for SegmentDecoder<I>
        where
            I: Iterator<Item = CodePoint>,
        {
            type Item = Result<String, u16>;

            fn next(&mut self) -> Option<Self::Item> {
                // An unpaired surrogate is always reported by itself, never merged into a run.
                let first = match self.source.next()? {
                    CodePoint::UnpairedSurrogate(unit) => return Some(Err(unit)),
                    CodePoint::Unicode(ch) => ch,
                };

                let mut segment = String::new();
                segment.push(first);

                // Consume code points only while the upcoming one is valid; a surrogate stays
                // in the underlying iterator so that the following call reports it.
                while let Some(CodePoint::Unicode(ch)) = self
                    .source
                    .next_if(|upcoming| matches!(upcoming, CodePoint::Unicode(_)))
                {
                    segment.push(ch);
                }

                Some(Ok(segment))
            }
        }

        SegmentDecoder {
            source: self.code_points().peekable(),
        }
    }
    /// Maps the valid segments of an UTF16 string and leaves the unpaired surrogates unchanged.
    ///
    /// The string is split with [`Self::to_std_string_with_surrogates`]. Each valid segment is
    /// handed to `f` as an owned [`String`] and the string returned by `f` is re-encoded as
    /// UTF-16; each unpaired surrogate is copied to the output as is. All pieces are
    /// concatenated in their original order to build the returned string.
    #[must_use]
    pub fn map_valid_segments<F>(&self, mut f: F) -> Self
    where
        F: FnMut(String) -> String,
    {
        // Reserve the input length (in UTF-16 code units) up front: this is only a capacity
        // hint, so the buffer still grows if `f` returns longer segments.
        let mut text = Vec::with_capacity(self.len());

        for part in self.to_std_string_with_surrogates() {
            match part {
                Ok(string) => text.extend(f(string).encode_utf16()),
                Err(surr) => text.push(surr),
            }
        }

        js_string!(text)
    }
    /// Gets an iterator of all the Unicode codepoints of a [`JsString`].
    pub fn code_points(&self) -> impl Iterator<Item = CodePoint> + '_ {
        char::decode_utf16(self.iter().copied()).map(|res| match res {
--- a/boa_icu_provider/data/icudata.postcard
+++ b/boa_icu_provider/data/icudata.postcard
--- a/boa_icu_provider/src/bin/datagen.rs
+++ b/boa_icu_provider/src/bin/datagen.rs
@ -6,7 +6,7 @@
 use std::{error::Error, fs::File};
 use boa_icu_provider::data_root;
-use icu_datagen::{all_keys, datagen, CldrLocaleSubset, Out, SourceData};
+use icu_datagen::{all_keys_with_experimental, datagen, CldrLocaleSubset, Out, SourceData};
 fn main() -> Result<(), Box<dyn Error>> {
    simple_logger::SimpleLogger::new()
@ -23,5 +23,11 @@ fn main() -> Result<(), Box<dyn Error>> {
        data_root().join("icudata.postcard"),
    )?));
-    datagen(None, &all_keys(), &source_data, [blob_out].into()).map_err(Into::into)
+    datagen(
        None,
        &all_keys_with_experimental(),
        &source_data,
        [blob_out].into(),
    )
    .map_err(Into::into)
 }