Browse Source

Implement `String.prototype.toLocaleUpper/LowerCase` (#2822)

This fixes some more ES5 tests that were failing because the functions hadn't been implemented.

It changes the following:

- Adds `String::to_locale_case`, which uses ICU4X to convert strings to uppercase or lowercase.
- Refactors `String::to_uppercase` and `String::to_lowercase` into a single `String::to_case` which uses a const generic to distinguish each case.
- Adds utility functions on `JsString` to avoid code repetition.
pull/2825/head
José Julián Espina 2 years ago
parent
commit
a12f10e335
  1. 1
      Cargo.lock
  2. 2
      boa_engine/Cargo.toml
  3. 175
      boa_engine/src/builtins/string/mod.rs
  4. 69
      boa_engine/src/string/mod.rs
  5. BIN
      boa_icu_provider/data/icudata.postcard
  6. 10
      boa_icu_provider/src/bin/datagen.rs

1
Cargo.lock generated

@ -406,6 +406,7 @@ dependencies = [
"fast-float", "fast-float",
"float-cmp", "float-cmp",
"icu_calendar", "icu_calendar",
"icu_casemapping",
"icu_collator", "icu_collator",
"icu_datetime", "icu_datetime",
"icu_list", "icu_list",

2
boa_engine/Cargo.toml

@ -23,6 +23,7 @@ intl = [
"dep:icu_provider", "dep:icu_provider",
"dep:icu_calendar", "dep:icu_calendar",
"dep:icu_collator", "dep:icu_collator",
"dep:icu_casemapping",
"dep:icu_list", "dep:icu_list",
"dep:writeable", "dep:writeable",
"dep:sys-locale", "dep:sys-locale",
@ -83,6 +84,7 @@ icu_collator = { version = "1.1.0", features = ["serde"], optional = true }
icu_plurals = { version = "1.1.0", features = ["serde"], optional = true } icu_plurals = { version = "1.1.0", features = ["serde"], optional = true }
icu_provider = { version = "1.1.0", optional = true } icu_provider = { version = "1.1.0", optional = true }
icu_list = { version = "1.1.0", features = ["serde"], optional = true } icu_list = { version = "1.1.0", features = ["serde"], optional = true }
icu_casemapping = { version = "0.7.1", features = ["serde"], optional = true}
writeable = { version = "0.5.2", optional = true } writeable = { version = "0.5.2", optional = true }
sys-locale = { version = "0.3.0", optional = true } sys-locale = { version = "0.3.0", optional = true }

175
boa_engine/src/builtins/string/mod.rs

@ -122,8 +122,10 @@ impl IntrinsicObject for String {
.method(Self::pad_end, "padEnd", 1) .method(Self::pad_end, "padEnd", 1)
.method(Self::pad_start, "padStart", 1) .method(Self::pad_start, "padStart", 1)
.method(Self::trim, "trim", 0) .method(Self::trim, "trim", 0)
.method(Self::to_lowercase, "toLowerCase", 0) .method(Self::to_case::<false>, "toLowerCase", 0)
.method(Self::to_uppercase, "toUpperCase", 0) .method(Self::to_case::<true>, "toUpperCase", 0)
.method(Self::to_locale_case::<false>, "toLocaleLowerCase", 0)
.method(Self::to_locale_case::<true>, "toLocaleUpperCase", 0)
.method(Self::substring, "substring", 2) .method(Self::substring, "substring", 2)
.method(Self::split, "split", 2) .method(Self::split, "split", 2)
.method(Self::value_of, "valueOf", 0) .method(Self::value_of, "valueOf", 0)
@ -1644,18 +1646,19 @@ impl String {
Ok(js_string!(string.trim_end()).into()) Ok(js_string!(string.trim_end()).into())
} }
/// `String.prototype.toLowerCase()` /// [`String.prototype.toUpperCase()`][upper] and [`String.prototype.toLowerCase()`][lower]
/// ///
/// The `toLowerCase()` method returns the calling string value converted to lower case. /// The case methods return the calling string value converted to uppercase or lowercase.
///
/// The value will be **converted** to a string if it isn't one.
/// ///
/// More information: /// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn] /// - [MDN documentation][mdn]
/// ///
/// [spec]: https://tc39.es/ecma262/#sec-string.prototype.tolowercase /// [upper]: https://tc39.es/ecma262/#sec-string.prototype.touppercase
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toLowerCase /// [lower]: https://tc39.es/ecma262/#sec-string.prototype.tolowercase
#[allow(clippy::wrong_self_convention)] /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toUpperCase
pub(crate) fn to_lowercase( pub(crate) fn to_case<const UPPER: bool>(
this: &JsValue, this: &JsValue,
_: &[JsValue], _: &[JsValue],
context: &mut Context<'_>, context: &mut Context<'_>,
@ -1666,101 +1669,95 @@ impl String {
// 2. Let S be ? ToString(O). // 2. Let S be ? ToString(O).
let string = this.to_string(context)?; let string = this.to_string(context)?;
let mut code_points = string.code_points();
let mut lower_text = Vec::with_capacity(string.len());
let mut next_unpaired_surrogate = None;
// 3. Let sText be ! StringToCodePoints(S). // 3. Let sText be ! StringToCodePoints(S).
// 4. Let lowerText be the result of toLowercase(sText), according to // 4. Let upperText be the result of toUppercase(sText), according to
// the Unicode Default Case Conversion algorithm. // the Unicode Default Case Conversion algorithm.
loop { let text = string.map_valid_segments(|s| {
let only_chars = code_points if UPPER {
.by_ref() s.to_uppercase()
.map_while(|cpoint| match cpoint {
CodePoint::Unicode(c) => Some(c),
CodePoint::UnpairedSurrogate(s) => {
next_unpaired_surrogate = Some(s);
None
}
})
.collect::<std::string::String>()
.to_lowercase();
lower_text.extend(only_chars.encode_utf16());
if let Some(surr) = next_unpaired_surrogate.take() {
lower_text.push(surr);
} else { } else {
break; s.to_lowercase()
} }
} });
// 5. Let L be ! CodePointsToString(lowerText). // 5. Let L be ! CodePointsToString(upperText).
// 6. Return L. // 6. Return L.
Ok(js_string!(lower_text).into()) Ok(js_string!(text).into())
} }
/// `String.prototype.toUpperCase()` /// [`String.prototype.toLocaleLowerCase ( [ locales ] )`][lower] and
/// /// [`String.prototype.toLocaleUpperCase ( [ locales ] )`][upper]
/// The `toUpperCase()` method returns the calling string value converted to uppercase.
///
/// The value will be **converted** to a string if it isn't one
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
/// ///
/// [spec]: https://tc39.es/ecma262/#sec-string.prototype.toUppercase /// [lower]: https://tc39.es/ecma402/#sup-string.prototype.tolocalelowercase
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toUpperCase /// [upper]: https://tc39.es/ecma402/#sup-string.prototype.tolocaleuppercase
#[allow(clippy::wrong_self_convention)] pub(crate) fn to_locale_case<const UPPER: bool>(
pub(crate) fn to_uppercase(
this: &JsValue, this: &JsValue,
_: &[JsValue], args: &[JsValue],
context: &mut Context<'_>, context: &mut Context<'_>,
) -> JsResult<JsValue> { ) -> JsResult<JsValue> {
// This function behaves in exactly the same way as `String.prototype.toLowerCase`, except that the String is #[cfg(feature = "intl")]
// mapped using the toUppercase algorithm of the Unicode Default Case Conversion. {
use super::intl::locale::{
// Comments below are an adaptation of the `String.prototype.toLowerCase` documentation. best_available_locale, canonicalize_locale_list, default_locale,
};
// 1. Let O be ? RequireObjectCoercible(this value). use icu_casemapping::{provider::CaseMappingV1Marker, CaseMapping};
let this = this.require_object_coercible()?; use icu_locid::LanguageIdentifier;
// 2. Let S be ? ToString(O). // 1. Let O be ? RequireObjectCoercible(this value).
let string = this.to_string(context)?; let this = this.require_object_coercible()?;
let mut code_points = string.code_points(); // 2. Let S be ? ToString(O).
let mut upper_text = Vec::with_capacity(string.len()); let string = this.to_string(context)?;
let mut next_unpaired_surrogate = None;
// 3. Return ? TransformCase(S, locales, lower).
// 3. Let sText be ! StringToCodePoints(S).
// 4. Let upperText be the result of toUppercase(sText), according to // TransformCase ( S, locales, targetCase )
// the Unicode Default Case Conversion algorithm. // https://tc39.es/ecma402/#sec-transform-case
loop {
let only_chars = code_points // 1. Let requestedLocales be ? CanonicalizeLocaleList(locales).
.by_ref() // 2. If requestedLocales is not an empty List, then
.map_while(|cpoint| match cpoint { // a. Let requestedLocale be requestedLocales[0].
CodePoint::Unicode(c) => Some(c), let lang = canonicalize_locale_list(args.get_or_undefined(0), context)?
CodePoint::UnpairedSurrogate(s) => { .into_iter()
next_unpaired_surrogate = Some(s); .next()
None // 3. Else,
} // a. Let requestedLocale be ! DefaultLocale().
}) .unwrap_or_else(|| default_locale(context.icu().locale_canonicalizer()))
.collect::<std::string::String>() .id;
.to_uppercase(); // 4. Let noExtensionsLocale be the String value that is requestedLocale with any Unicode locale extension sequences (6.2.1) removed.
// 5. Let availableLocales be a List with language tags that includes the languages for which the Unicode Character Database contains language sensitive case mappings. Implementations may add additional language tags if they support case mapping for additional locales.
upper_text.extend(only_chars.encode_utf16()); // 6. Let locale be ! BestAvailableLocale(availableLocales, noExtensionsLocale).
// 7. If locale is undefined, set locale to "und".
let lang =
best_available_locale::<CaseMappingV1Marker>(lang, &context.icu().provider())
.unwrap_or(LanguageIdentifier::UND);
let casemapper =
CaseMapping::try_new_with_locale(&context.icu().provider(), &lang.into())
.map_err(|err| JsNativeError::typ().with_message(err.to_string()))?;
// 8. Let codePoints be StringToCodePoints(S).
let result = string.map_valid_segments(|segment| {
if UPPER {
// 10. Else,
// a. Assert: targetCase is upper.
// b. Let newCodePoints be a List whose elements are the result of an uppercase transformation of codePoints according to an implementation-derived algorithm using locale or the Unicode Default Case Conversion algorithm.
casemapper.to_full_uppercase(&segment)
} else {
// 9. If targetCase is lower, then
// a. Let newCodePoints be a List whose elements are the result of a lowercase transformation of codePoints according to an implementation-derived algorithm using locale or the Unicode Default Case Conversion algorithm.
casemapper.to_full_lowercase(&segment)
}
});
if let Some(surr) = next_unpaired_surrogate.take() { // 11. Return CodePointsToString(newCodePoints).
upper_text.push(surr); Ok(result.into())
} else {
break;
}
} }
// 5. Let L be ! CodePointsToString(upperText). #[cfg(not(feature = "intl"))]
// 6. Return L. {
Ok(js_string!(upper_text).into()) Self::to_case::<UPPER>(this, args, context)
}
} }
/// `String.prototype.substring( indexStart[, indexEnd] )` /// `String.prototype.substring( indexStart[, indexEnd] )`

69
boa_engine/src/string/mod.rs

@ -37,6 +37,7 @@ use std::{
cell::Cell, cell::Cell,
convert::Infallible, convert::Infallible,
hash::{Hash, Hasher}, hash::{Hash, Hasher},
iter::Peekable,
ops::{Deref, Index}, ops::{Deref, Index},
process::abort, process::abort,
ptr::{self, addr_of, addr_of_mut, NonNull}, ptr::{self, addr_of, addr_of_mut, NonNull},
@ -285,6 +286,74 @@ impl JsString {
String::from_utf16(self) String::from_utf16(self)
} }
/// Splits a [`JsString`] into an iterator of [`Result<String, u16>`] items.
///
/// Each run of consecutive valid code points is yielded as a single `Ok(String)`, and each
/// unpaired surrogate is yielded on its own as an `Err(u16)`.
pub fn to_std_string_with_surrogates(&self) -> impl Iterator<Item = Result<String, u16>> + '_ {
    /// Adapter that groups a stream of [`CodePoint`]s into valid segments and lone surrogates.
    struct SegmentDecoder<I: Iterator> {
        /// Code points not yet yielded. Peeking lets a segment stop right before a surrogate
        /// without consuming it.
        remaining: Peekable<I>,
    }

    impl<I> Iterator for SegmentDecoder<I>
    where
        I: Iterator<Item = CodePoint>,
    {
        type Item = Result<String, u16>;

        fn next(&mut self) -> Option<Self::Item> {
            // An unpaired surrogate is reported immediately as its own item.
            let first = match self.remaining.next()? {
                CodePoint::UnpairedSurrogate(surrogate) => return Some(Err(surrogate)),
                CodePoint::Unicode(c) => c,
            };

            // Otherwise, gather every directly following valid code point into one segment.
            let mut segment = String::from(first);
            while let Some(&CodePoint::Unicode(c)) = self.remaining.peek() {
                segment.push(c);
                self.remaining
                    .next()
                    .expect("should exist by the check above");
            }

            Some(Ok(segment))
        }
    }

    SegmentDecoder {
        remaining: self.code_points().peekable(),
    }
}
/// Maps the valid segments of a UTF-16 string and leaves the unpaired surrogates unchanged.
///
/// The string is split with [`Self::to_std_string_with_surrogates`]. `f` is called once per
/// valid segment, in order, and its output is re-encoded as UTF-16. Unpaired surrogates are
/// copied to the result as-is, without being passed to `f`.
#[must_use]
pub fn map_valid_segments<F>(&self, mut f: F) -> Self
where
    F: FnMut(String) -> String,
{
    // Reserve space for the input length up front. `f` may grow or shrink a segment, so this
    // is only a starting capacity, not an exact size.
    let mut text = Vec::with_capacity(self.len());

    for part in self.to_std_string_with_surrogates() {
        match part {
            Ok(string) => text.extend(f(string).encode_utf16()),
            Err(surr) => text.push(surr),
        }
    }

    js_string!(text)
}
/// Gets an iterator of all the Unicode codepoints of a [`JsString`]. /// Gets an iterator of all the Unicode codepoints of a [`JsString`].
pub fn code_points(&self) -> impl Iterator<Item = CodePoint> + '_ { pub fn code_points(&self) -> impl Iterator<Item = CodePoint> + '_ {
char::decode_utf16(self.iter().copied()).map(|res| match res { char::decode_utf16(self.iter().copied()).map(|res| match res {

BIN
boa_icu_provider/data/icudata.postcard

Binary file not shown.

10
boa_icu_provider/src/bin/datagen.rs

@ -6,7 +6,7 @@
use std::{error::Error, fs::File}; use std::{error::Error, fs::File};
use boa_icu_provider::data_root; use boa_icu_provider::data_root;
use icu_datagen::{all_keys, datagen, CldrLocaleSubset, Out, SourceData}; use icu_datagen::{all_keys_with_experimental, datagen, CldrLocaleSubset, Out, SourceData};
fn main() -> Result<(), Box<dyn Error>> { fn main() -> Result<(), Box<dyn Error>> {
simple_logger::SimpleLogger::new() simple_logger::SimpleLogger::new()
@ -23,5 +23,11 @@ fn main() -> Result<(), Box<dyn Error>> {
data_root().join("icudata.postcard"), data_root().join("icudata.postcard"),
)?)); )?));
datagen(None, &all_keys(), &source_data, [blob_out].into()).map_err(Into::into) datagen(
None,
&all_keys_with_experimental(),
&source_data,
[blob_out].into(),
)
.map_err(Into::into)
} }

Loading…
Cancel
Save