Browse Source

Bump ICU4X to 1.5 and cleanup Intl (#3868)

pull/3869/head
José Julián Espina 6 months ago committed by GitHub
parent
commit
8c727f6d52
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 1407
      Cargo.lock
  2. 46
      Cargo.toml
  3. 1
      core/engine/Cargo.toml
  4. 15
      core/engine/src/builtins/intl/collator/mod.rs
  5. 8
      core/engine/src/builtins/intl/list_format/mod.rs
  6. 2
      core/engine/src/builtins/intl/locale/mod.rs
  7. 17
      core/engine/src/builtins/intl/locale/tests.rs
  8. 420
      core/engine/src/builtins/intl/locale/utils.rs
  9. 10
      core/engine/src/builtins/intl/mod.rs
  10. 8
      core/engine/src/builtins/intl/number_format/mod.rs
  11. 8
      core/engine/src/builtins/intl/plural_rules/mod.rs
  12. 19
      core/engine/src/builtins/intl/segmenter/mod.rs
  13. 34
      core/engine/src/builtins/string/mod.rs
  14. 22
      core/engine/src/context/mod.rs
  15. BIN
      core/icu_provider/data/icudata.postcard
  16. 2
      test262_config.toml
  17. 4
      tests/tester/src/edition.rs
  18. 18
      tools/gen-icu4x-data/Cargo.toml
  19. 124
      tools/gen-icu4x-data/src/main.rs

1407
Cargo.lock generated

File diff suppressed because it is too large Load Diff

46
Cargo.toml

@ -75,7 +75,7 @@ thin-vec = "0.2.13"
time = {version = "0.3.36", default-features = false, features = ["local-offset", "large-dates", "wasm-bindgen", "parsing", "formatting", "macros"]} time = {version = "0.3.36", default-features = false, features = ["local-offset", "large-dates", "wasm-bindgen", "parsing", "formatting", "macros"]}
tinystr = "0.7.5" tinystr = "0.7.5"
log = "0.4.21" log = "0.4.21"
simple_logger = "4.3.3" simple_logger = "5.0.0"
cargo_metadata = "0.18.1" cargo_metadata = "0.18.1"
trybuild = "1.0.95" trybuild = "1.0.95"
rayon = "1.10.0" rayon = "1.10.0"
@ -107,7 +107,7 @@ tap = "1.0.1"
thiserror = "1.0.60" thiserror = "1.0.60"
dashmap = "5.5.3" dashmap = "5.5.3"
num_enum = "0.7.2" num_enum = "0.7.2"
itertools = { version = "0.12.1", default-features = false } itertools = { version = "0.13.0", default-features = false }
portable-atomic = "1.6.0" portable-atomic = "1.6.0"
bytemuck = { version = "1.15.0", default-features = false } bytemuck = { version = "1.15.0", default-features = false }
arrayvec = "0.7.4" arrayvec = "0.7.4"
@ -115,7 +115,7 @@ intrusive-collections = "0.9.6"
cfg-if = "1.0.0" cfg-if = "1.0.0"
either = "1.12.0" either = "1.12.0"
sys-locale = "0.3.1" sys-locale = "0.3.1"
temporal_rs = "0.0.2" temporal_rs = { git = "https://github.com/boa-dev/temporal.git", rev = "61a05fbb7c72353deda72a3df0e6887d65b840d2" }
web-time = "1.1.0" web-time = "1.1.0"
criterion = "0.5.1" criterion = "0.5.1"
float-cmp = "0.9.0" float-cmp = "0.9.0"
@ -125,26 +125,26 @@ winapi = { version = "0.3.9", default-features = false }
# ICU4X # ICU4X
icu_provider = { version = "~1.4.0", default-features = false } icu_provider = { version = "~1.5.0", default-features = false }
icu_locid = { version = "~1.4.0", default-features = false } icu_locid = { version = "~1.5.0", default-features = false }
icu_locid_transform = { version = "~1.4.0", default-features = false } icu_locid_transform = { version = "~1.5.0", default-features = false }
icu_datetime = { version = "~1.4.0", default-features = false } icu_datetime = { version = "~1.5.0", default-features = false }
icu_calendar = { version = "~1.4.0", default-features = false } icu_calendar = { version = "~1.5.0", default-features = false }
icu_collator = { version = "~1.4.0", default-features = false } icu_collator = { version = "~1.5.0", default-features = false }
icu_plurals = { version = "~1.4.0", default-features = false } icu_plurals = { version = "~1.5.0", default-features = false }
icu_list = { version = "~1.4.0", default-features = false } icu_list = { version = "~1.5.0", default-features = false }
icu_casemap = { version = "~1.4.0", default-features = false } icu_casemap = { version = "~1.5.0", default-features = false }
icu_segmenter = { version = "~1.4.0", default-features = false } icu_segmenter = { version = "~1.5.0", default-features = false }
icu_datagen = { version = "~1.4.1", default-features = false } icu_datagen = { version = "~1.5.0", default-features = false }
icu_provider_adapters = { version = "~1.4.0", default-features = false } icu_provider_adapters = { version = "~1.5.0", default-features = false }
icu_provider_blob = { version = "~1.4.0", default-features = false } icu_provider_blob = { version = "~1.5.0", default-features = false }
icu_properties = { version = "~1.4.1", default-features = true } icu_properties = { version = "~1.5.0", default-features = true }
icu_normalizer = { version = "~1.4.2", default-features = false } icu_normalizer = { version = "~1.5.0", default-features = false }
icu_decimal = { version = "~1.4.0", default-features = false } icu_decimal = { version = "~1.5.0", default-features = false }
writeable = "~0.5.4" writeable = "~0.5.5"
yoke = "~0.7.3" yoke = "~0.7.4"
zerofrom = "~0.1.3" zerofrom = "~0.1.4"
fixed_decimal = "~0.5.5" fixed_decimal = "~0.5.6"
[workspace.metadata.workspaces] [workspace.metadata.workspaces]
allow_branch = "main" allow_branch = "main"

1
core/engine/Cargo.toml

@ -105,6 +105,7 @@ cfg-if.workspace = true
time.workspace = true time.workspace = true
hashbrown.workspace = true hashbrown.workspace = true
either = { workspace = true, optional = true } either = { workspace = true, optional = true }
static_assertions.workspace = true
# intl deps # intl deps
boa_icu_provider = { workspace = true, features = ["std"], optional = true } boa_icu_provider = { workspace = true, features = ["std"], optional = true }

15
core/engine/src/builtins/intl/collator/mod.rs

@ -2,8 +2,7 @@ use boa_gc::{custom_trace, Finalize, Trace};
use boa_macros::js_str; use boa_macros::js_str;
use boa_profiler::Profiler; use boa_profiler::Profiler;
use icu_collator::{ use icu_collator::{
provider::CollationMetadataV1Marker, AlternateHandling, CaseFirst, Collator as NativeCollator, provider::CollationMetadataV1Marker, AlternateHandling, CaseFirst, MaxVariable, Numeric,
MaxVariable, Numeric,
}; };
use icu_locid::{ use icu_locid::{
@ -35,7 +34,7 @@ use crate::{
}; };
use super::{ use super::{
locale::{canonicalize_locale_list, resolve_locale, supported_locales, validate_extension}, locale::{canonicalize_locale_list, filter_locales, resolve_locale, validate_extension},
options::{coerce_options_to_object, IntlOptions}, options::{coerce_options_to_object, IntlOptions},
Service, Service,
}; };
@ -53,7 +52,7 @@ pub(crate) struct Collator {
usage: Usage, usage: Usage,
sensitivity: Sensitivity, sensitivity: Sensitivity,
ignore_punctuation: bool, ignore_punctuation: bool,
collator: NativeCollator, collator: icu_collator::Collator,
bound_compare: Option<JsFunction>, bound_compare: Option<JsFunction>,
} }
@ -277,7 +276,7 @@ impl BuiltInConstructor for Collator {
// 18. Let relevantExtensionKeys be %Collator%.[[RelevantExtensionKeys]]. // 18. Let relevantExtensionKeys be %Collator%.[[RelevantExtensionKeys]].
// 19. Let r be ResolveLocale(%Collator%.[[AvailableLocales]], requestedLocales, opt, relevantExtensionKeys, localeData). // 19. Let r be ResolveLocale(%Collator%.[[AvailableLocales]], requestedLocales, opt, relevantExtensionKeys, localeData).
let mut locale = resolve_locale::<Self>( let mut locale = resolve_locale::<Self>(
&requested_locales, requested_locales,
&mut intl_options, &mut intl_options,
context.intl_provider(), context.intl_provider(),
); );
@ -337,7 +336,7 @@ impl BuiltInConstructor for Collator {
.unzip(); .unzip();
let collator = let collator =
NativeCollator::try_new_unstable(context.intl_provider(), &collator_locale, { icu_collator::Collator::try_new_unstable(context.intl_provider(), &collator_locale, {
let mut options = icu_collator::CollatorOptions::new(); let mut options = icu_collator::CollatorOptions::new();
options.strength = strength; options.strength = strength;
options.case_level = case_level; options.case_level = case_level;
@ -395,8 +394,8 @@ impl Collator {
// 2. Let requestedLocales be ? CanonicalizeLocaleList(locales). // 2. Let requestedLocales be ? CanonicalizeLocaleList(locales).
let requested_locales = canonicalize_locale_list(locales, context)?; let requested_locales = canonicalize_locale_list(locales, context)?;
// 3. Return ? SupportedLocales(availableLocales, requestedLocales, options). // 3. Return ? FilterLocales(availableLocales, requestedLocales, options).
supported_locales::<<Self as Service>::LangMarker>(&requested_locales, options, context) filter_locales::<<Self as Service>::LangMarker>(requested_locales, options, context)
.map(JsValue::from) .map(JsValue::from)
} }

8
core/engine/src/builtins/intl/list_format/mod.rs

@ -23,7 +23,7 @@ use crate::{
}; };
use super::{ use super::{
locale::{canonicalize_locale_list, resolve_locale, supported_locales}, locale::{canonicalize_locale_list, filter_locales, resolve_locale},
options::IntlOptions, options::IntlOptions,
Service, Service,
}; };
@ -122,7 +122,7 @@ impl BuiltInConstructor for ListFormat {
// 9. Let r be ResolveLocale(%ListFormat%.[[AvailableLocales]], requestedLocales, opt, %ListFormat%.[[RelevantExtensionKeys]], localeData). // 9. Let r be ResolveLocale(%ListFormat%.[[AvailableLocales]], requestedLocales, opt, %ListFormat%.[[RelevantExtensionKeys]], localeData).
// 10. Set listFormat.[[Locale]] to r.[[locale]]. // 10. Set listFormat.[[Locale]] to r.[[locale]].
let locale = resolve_locale::<Self>( let locale = resolve_locale::<Self>(
&requested_locales, requested_locales,
&mut IntlOptions { &mut IntlOptions {
matcher, matcher,
..Default::default() ..Default::default()
@ -204,8 +204,8 @@ impl ListFormat {
// 2. Let requestedLocales be ? CanonicalizeLocaleList(locales). // 2. Let requestedLocales be ? CanonicalizeLocaleList(locales).
let requested_locales = canonicalize_locale_list(locales, context)?; let requested_locales = canonicalize_locale_list(locales, context)?;
// 3. Return ? SupportedLocales(availableLocales, requestedLocales, options). // 3. Return ? FilterLocales(availableLocales, requestedLocales, options).
supported_locales::<<Self as Service>::LangMarker>(&requested_locales, options, context) filter_locales::<<Self as Service>::LangMarker>(requested_locales, options, context)
.map(JsValue::from) .map(JsValue::from)
} }

2
core/engine/src/builtins/intl/locale/mod.rs

@ -7,7 +7,7 @@ use icu_locid::{
extensions::unicode::Value, extensions_unicode_key as key, extensions_unicode_value as value, extensions::unicode::Value, extensions_unicode_key as key, extensions_unicode_value as value,
}; };
#[cfg(test)] #[cfg(all(test, feature = "intl_bundled"))]
mod tests; mod tests;
mod utils; mod utils;

17
core/engine/src/builtins/intl/locale/tests.rs

@ -11,7 +11,7 @@ use icu_provider::{DataLocale, DataProvider, DataRequest, DataRequestMetadata};
use crate::{ use crate::{
builtins::intl::{ builtins::intl::{
locale::{best_locale_for_provider, default_locale, resolve_locale}, locale::{default_locale, resolve_locale},
options::{IntlOptions, LocaleMatcher}, options::{IntlOptions, LocaleMatcher},
Service, Service,
}, },
@ -88,7 +88,7 @@ fn locale_resolution() {
hc: Some(HourCycle::H11), hc: Some(HourCycle::H11),
}, },
}; };
let locale = resolve_locale::<TestService>(&[], &mut options, &provider); let locale = resolve_locale::<TestService>([], &mut options, &provider);
assert_eq!(locale, default); assert_eq!(locale, default);
// test best fit // test best fit
@ -99,15 +99,8 @@ fn locale_resolution() {
}, },
}; };
let locale = resolve_locale::<TestService>(&[], &mut options, &provider); let locale = resolve_locale::<TestService>([], &mut options, &provider);
let best = best_locale_for_provider::<<TestService as Service>::LangMarker>( assert_eq!(locale, default);
default.id.clone(),
&provider,
)
.unwrap();
let mut best = Locale::from(best);
best.extensions = locale.extensions.clone();
assert_eq!(locale, best);
// requested: [es-ES] // requested: [es-ES]
let mut options = IntlOptions { let mut options = IntlOptions {
@ -115,6 +108,6 @@ fn locale_resolution() {
service_options: TestOptions { hc: None }, service_options: TestOptions { hc: None },
}; };
let locale = resolve_locale::<TestService>(&[locale!("es-AR")], &mut options, &provider); let locale = resolve_locale::<TestService>([locale!("es-AR")], &mut options, &provider);
assert_eq!(locale, "es-u-hc-h23".parse().unwrap()); assert_eq!(locale, "es-u-hc-h23".parse().unwrap());
} }

420
core/engine/src/builtins/intl/locale/utils.rs

@ -14,18 +14,13 @@ use crate::{
}; };
use boa_macros::js_str; use boa_macros::js_str;
use icu_collator::provider::CollationMetadataV1Marker;
use icu_locid::{ use icu_locid::{
extensions::unicode::{Key, Value}, extensions::unicode::{Key, Value},
subtags::Variants, subtags::Variants,
LanguageIdentifier, Locale, LanguageIdentifier, Locale,
}; };
use icu_locid_transform::LocaleCanonicalizer; use icu_locid_transform::LocaleCanonicalizer;
use icu_provider::{ use icu_provider::{DataLocale, DataProvider, DataRequest, DataRequestMetadata, KeyedDataMarker};
DataError, DataErrorKind, DataLocale, DataProvider, DataRequest, DataRequestMetadata,
KeyedDataMarker,
};
use icu_segmenter::provider::WordBreakDataV1Marker;
use indexmap::IndexSet; use indexmap::IndexSet;
use tap::TapOptional; use tap::TapOptional;
@ -153,36 +148,55 @@ pub(crate) fn canonicalize_locale_list(
Ok(seen.into_iter().collect()) Ok(seen.into_iter().collect())
} }
/// Abstract operation `BestAvailableLocale ( availableLocales, locale )` /// Abstract operation [`LookupMatchingLocaleByPrefix ( availableLocales, requestedLocales )`][prefix]
/// and [`LookupMatchingLocaleByBestFit ( availableLocales, requestedLocales )`][best]
/// ///
/// Compares the provided argument `locale`, which must be a String value with a /// Compares `requestedLocales`, which must be a `List` as returned by `CanonicalizeLocaleList`,
/// structurally valid and canonicalized Unicode BCP 47 locale identifier, against /// against the locales in `availableLocales` and determines the best available language to
/// the locales in `availableLocales` and returns either the longest non-empty prefix /// meet the request.
/// of `locale` that is an element of `availableLocales`, or undefined if there is no
/// such element.
/// ///
/// We only work with language identifiers, which have the same semantics /// # Notes
/// but are a bit easier to manipulate.
/// ///
/// More information: /// - This differs a bit from the spec, since we don't have an `[[AvailableLocales]]`
/// - [ECMAScript reference][spec] /// list to compare with. However, we can do data requests to a [`DataProvider`]
/// in order to see if a certain [`Locale`] is supported.
/// ///
/// [spec]: https://tc39.es/ecma402/#sec-bestavailablelocale /// - Calling this function with a singleton `KeyedDataMarker` will always return `None`.
pub(crate) fn best_available_locale<M: KeyedDataMarker>( ///
candidate: LanguageIdentifier, /// [prefix]: https://tc39.es/ecma402/#sec-lookupmatchinglocalebyprefix
provider: &(impl DataProvider<M> + ?Sized), /// [best]: https://tc39.es/ecma402/#sec-lookupmatchinglocalebybestfit
) -> Option<LanguageIdentifier> { pub(crate) fn lookup_matching_locale_by_prefix<M: KeyedDataMarker>(
// 1. Let candidate be locale. requested_locales: impl IntoIterator<Item = Locale>,
let mut candidate = candidate.into(); provider: &IntlProvider,
// 2. Repeat ) -> Option<Locale>
where
IntlProvider: DataProvider<M>,
{
// 1. For each element locale of requestedLocales, do
for locale in requested_locales {
// a. Let extension be empty.
// b. If locale contains a Unicode locale extension sequence, then
// i. Set extension to the Unicode locale extension sequence of locale.
// ii. Set locale to the String value that is locale with any Unicode locale extension sequences removed.
let mut locale = locale.clone();
let id = std::mem::take(&mut locale.id);
locale.extensions.transform.clear();
locale.extensions.private.clear();
// c. Let prefix be locale.
let mut prefix = id.into();
// d. Repeat, while prefix is not the empty String,
// We don't use a `while !prefix.is_und()` because it could be that prefix is und at the start,
// so we need to make the request at least once.
loop { loop {
// a. If availableLocales contains an element equal to candidate, return candidate. // i. If availableLocales contains prefix, return the Record { [[locale]]: prefix, [[extension]]: extension }.
// ICU4X requires doing data requests in order to check if a locale // ICU4X requires doing data requests in order to check if a locale
// is part of the set of supported locales. // is part of the set of supported locales.
let response = DataProvider::<M>::load( let response = DataProvider::<M>::load(
provider, provider,
DataRequest { DataRequest {
locale: &candidate, locale: &prefix,
metadata: { metadata: {
let mut metadata = DataRequestMetadata::default(); let mut metadata = DataRequestMetadata::default();
metadata.silent = true; metadata.silent = true;
@ -191,174 +205,51 @@ pub(crate) fn best_available_locale<M: KeyedDataMarker>(
}, },
); );
match response { if let Ok(req) = response {
Ok(req) => {
// `metadata.locale` returns None when the provider doesn't have a fallback mechanism, // `metadata.locale` returns None when the provider doesn't have a fallback mechanism,
// but supports the required locale. However, if the provider has a fallback mechanism, // but supports the required locale. However, if the provider has a fallback mechanism,
// this will return `Some(locale)`, where the locale is the used locale after applying // this will return `Some(locale)`, where the locale is the used locale after applying
// the fallback algorithm, even if the used locale is exactly the same as the required // the fallback algorithm, even if the used locale is exactly the same as the required
// locale. // locale.
match req.metadata.locale { match req.metadata.locale {
// TODO: ugly hack to accept locales that fallback to "und" in the collator/segmenter services Some(loc) if loc.get_langid() == prefix.get_langid() => {
Some(loc) locale.id = loc.into_locale().id;
if loc == candidate return Some(locale);
|| (loc.is_empty()
&& [
CollationMetadataV1Marker::KEY.path(),
WordBreakDataV1Marker::KEY.path(),
]
.contains(&M::KEY.path())) =>
{
return Some(candidate.into_locale().id)
}
None => return Some(candidate.into_locale().id),
_ => {}
} }
None => {
locale.id = prefix.into_locale().id;
return Some(locale);
} }
Err(DataError { _ => {}
kind: DataErrorKind::ExtraneousLocale,
..
}) => {
// This is essentially the same hack as above but for singleton keys
return Some(candidate.into_locale().id);
} }
Err(_) => {}
} }
// b. Let pos be the character index of the last occurrence of "-" (U+002D) within candidate. If that character does not occur, return undefined. // ii. If prefix contains "-" (code unit 0x002D HYPHEN-MINUS), let pos be the index into prefix of the last occurrence of "-"; else let pos be 0.
// c. If pos ≥ 2 and the character "-" occurs at index pos-2 of candidate, decrease pos by 2. // iii. Repeat, while pos ≥ 2 and the substring of prefix from pos - 2 to pos - 1 is "-",
// d. Let candidate be the substring of candidate from position 0, inclusive, to position pos, exclusive. // 1. Set pos to pos - 2.
// // iv. Set prefix to the substring of prefix from 0 to pos.
// Since the definition of `LanguageIdentifier` allows us to manipulate it // Since the definition of `LanguageIdentifier` allows us to manipulate it
// without using strings, we can replace these steps by a simpler // without using strings, we can replace these steps by a simpler
// algorithm. // algorithm.
if prefix.has_variants() {
if candidate.has_variants() { let mut variants = prefix.clear_variants().iter().copied().collect::<Vec<_>>();
let mut variants = candidate
.clear_variants()
.iter()
.copied()
.collect::<Vec<_>>();
variants.pop(); variants.pop();
candidate.set_variants(Variants::from_vec_unchecked(variants)); prefix.set_variants(Variants::from_vec_unchecked(variants));
} else if candidate.region().is_some() { } else if prefix.region().is_some() {
candidate.set_region(None); prefix.set_region(None);
} else if candidate.script().is_some() { } else if prefix.script().is_some() {
candidate.set_script(None); prefix.set_script(None);
} else { } else {
return None; break;
} }
} }
}
/// Returns the locale resolved by the `provider` after using the ICU4X fallback
/// algorithm with `candidate` (if the provider supports this), or None if the locale is not
/// supported.
pub(crate) fn best_locale_for_provider<M: KeyedDataMarker>(
candidate: LanguageIdentifier,
provider: &(impl DataProvider<M> + ?Sized),
) -> Option<LanguageIdentifier> {
// another hack to the list...
// This time is because markers like `WordBreakDataV1Marker` throw an error if they receive
// a request with a locale, because they don't really need it. In this case, we can
// check if the key is one of those kinds and return the candidate as it is.
if M::KEY.metadata().singleton {
return Some(candidate);
}
let response = DataProvider::<M>::load(
provider,
DataRequest {
locale: &DataLocale::from(&candidate),
metadata: {
let mut md = DataRequestMetadata::default();
md.silent = true;
md
},
},
)
.ok()?;
if candidate == LanguageIdentifier::UND {
return Some(LanguageIdentifier::UND);
}
response
.metadata
.locale
.map(|dl| {
// TODO: ugly hack to accept locales that fallback to "und" in the collator/segmenter services
if [
CollationMetadataV1Marker::KEY.path(),
WordBreakDataV1Marker::KEY.path(),
]
.contains(&M::KEY.path())
&& dl.is_empty()
{
candidate.clone()
} else {
dl.into_locale().id
}
})
.or(Some(candidate))
.filter(|loc| loc != &LanguageIdentifier::UND)
}
/// Abstract operation [`LookupMatcher ( availableLocales, requestedLocales )`][spec]
///
/// Compares `requestedLocales`, which must be a `List` as returned by `CanonicalizeLocaleList`,
/// against the locales in `availableLocales` and determines the best available language to
/// meet the request.
///
/// # Note
///
/// This differs a bit from the spec, since we don't have an `[[AvailableLocales]]`
/// list to compare with. However, we can do data requests to a [`DataProvider`]
/// in order to see if a certain [`Locale`] is supported.
///
/// [spec]: https://tc39.es/ecma402/#sec-lookupmatcher
fn lookup_matcher<M: KeyedDataMarker>(
requested_locales: &[Locale],
provider: &IntlProvider,
) -> Locale
where
IntlProvider: DataProvider<M>,
{
// 1. Let result be a new Record.
// 2. For each element locale of requestedLocales, do
for locale in requested_locales {
// a. Let noExtensionsLocale be the String value that is locale with any Unicode locale
// extension sequences removed.
let mut locale = locale.clone();
let id = std::mem::take(&mut locale.id);
locale.extensions.transform.clear();
locale.extensions.private.clear();
// b. Let availableLocale be ! BestAvailableLocale(availableLocales, noExtensionsLocale).
let available_locale = best_available_locale::<M>(id, provider);
// c. If availableLocale is not undefined, then
if let Some(available_locale) = available_locale {
// i. Set result.[[locale]] to availableLocale.
// Assignment deferred. See return statement below.
// ii. If locale and noExtensionsLocale are not the same String value, then
// 1. Let extension be the String value consisting of the substring of the Unicode
// locale extension sequence within locale.
// 2. Set result.[[extension]] to extension.
locale.id = available_locale;
// iii. Return result.
return locale;
}
} }
// 3. Let defLocale be ! DefaultLocale(). // 2. Return undefined.
// 4. Set result.[[locale]] to defLocale. None
// 5. Return result.
default_locale(provider.locale_canonicalizer())
} }
/// Abstract operation [`BestFitMatcher ( availableLocales, requestedLocales )`][spec] /// Abstract operation [`LookupMatchingLocaleByBestFit ( availableLocales, requestedLocales )`][spec]
/// ///
/// Compares `requestedLocales`, which must be a `List` as returned by `CanonicalizeLocaleList`, /// Compares `requestedLocales`, which must be a `List` as returned by `CanonicalizeLocaleList`,
/// against the locales in `availableLocales` and determines the best available language to /// against the locales in `availableLocales` and determines the best available language to
@ -367,31 +258,50 @@ where
/// produced by the `LookupMatcher` abstract operation. /// produced by the `LookupMatcher` abstract operation.
/// ///
/// [spec]: https://tc39.es/ecma402/#sec-bestfitmatcher /// [spec]: https://tc39.es/ecma402/#sec-bestfitmatcher
fn best_fit_matcher<M: KeyedDataMarker>( fn lookup_matching_locale_by_best_fit<M: KeyedDataMarker>(
requested_locales: &[Locale], requested_locales: impl IntoIterator<Item = Locale>,
provider: &IntlProvider, provider: &IntlProvider,
) -> Locale ) -> Option<Locale>
where where
IntlProvider: DataProvider<M>, IntlProvider: DataProvider<M>,
{ {
for mut locale in requested_locales for mut locale in requested_locales {
.iter()
.cloned()
.chain(std::iter::once_with(|| {
default_locale(provider.locale_canonicalizer())
}))
{
let id = std::mem::take(&mut locale.id); let id = std::mem::take(&mut locale.id);
// Only leave unicode extensions when returning the locale.
locale.extensions.transform.clear(); locale.extensions.transform.clear();
locale.extensions.private.clear(); locale.extensions.private.clear();
if let Some(available) = best_locale_for_provider(id, provider) { let Ok(response) = DataProvider::<M>::load(
locale.id = available; provider,
DataRequest {
locale: &DataLocale::from(&id),
metadata: {
let mut md = DataRequestMetadata::default();
md.silent = true;
md
},
},
) else {
continue;
};
return locale; if id == LanguageIdentifier::UND {
return Some(locale);
}
if let Some(id) = response
.metadata
.locale
.map(|dl| dl.into_locale().id)
.or(Some(id))
.filter(|loc| loc != &LanguageIdentifier::UND)
{
locale.id = id;
return Some(locale);
} }
} }
Locale::default() None
} }
/// Abstract operation `ResolveLocale ( availableLocales, requestedLocales, options, relevantExtensionKeys, localeData )` /// Abstract operation `ResolveLocale ( availableLocales, requestedLocales, options, relevantExtensionKeys, localeData )`
@ -406,7 +316,7 @@ where
/// ///
/// [spec]: https://tc39.es/ecma402/#sec-resolvelocale /// [spec]: https://tc39.es/ecma402/#sec-resolvelocale
pub(in crate::builtins::intl) fn resolve_locale<S>( pub(in crate::builtins::intl) fn resolve_locale<S>(
requested_locales: &[Locale], requested_locales: impl IntoIterator<Item = Locale>,
options: &mut IntlOptions<S::LocaleOptions>, options: &mut IntlOptions<S::LocaleOptions>,
provider: &IntlProvider, provider: &IntlProvider,
) -> Locale ) -> Locale
@ -416,15 +326,16 @@ where
{ {
// 1. Let matcher be options.[[localeMatcher]]. // 1. Let matcher be options.[[localeMatcher]].
// 2. If matcher is "lookup", then // 2. If matcher is "lookup", then
// a. Let r be ! LookupMatcher(availableLocales, requestedLocales). // a. Let r be LookupMatchingLocaleByPrefix(availableLocales, requestedLocales).
// 3. Else, // 3. Else,
// a. Let r be ! BestFitMatcher(availableLocales, requestedLocales). // a. Let r be LookupMatchingLocaleByBestFit(availableLocales, requestedLocales).
// 4. Let foundLocale be r.[[locale]]. // 4. If r is undefined, set r to the Record { [[locale]]: DefaultLocale(), [[extension]]: empty }.
let mut found_locale = if options.matcher == LocaleMatcher::Lookup { let mut found_locale = if options.matcher == LocaleMatcher::Lookup {
lookup_matcher::<S::LangMarker>(requested_locales, provider) lookup_matching_locale_by_prefix::<S::LangMarker>(requested_locales, provider)
} else { } else {
best_fit_matcher::<S::LangMarker>(requested_locales, provider) lookup_matching_locale_by_best_fit::<S::LangMarker>(requested_locales, provider)
}; }
.unwrap_or_else(|| default_locale(provider.locale_canonicalizer()));
// From here, the spec differs significantly from the implementation, // From here, the spec differs significantly from the implementation,
// since ICU4X allows us to skip some steps and modularize the // since ICU4X allows us to skip some steps and modularize the
@ -485,62 +396,18 @@ where
found_locale found_locale
} }
/// Abstract operation [`LookupSupportedLocales ( availableLocales, requestedLocales )`][spec] /// Abstract operation [`FilterLocales ( availableLocales, requestedLocales, options )`][spec]
/// ///
/// Returns the subset of the provided BCP 47 language priority list requestedLocales for which /// Returns the subset of the provided BCP 47 language priority list requestedLocales for which
/// `availableLocales` has a matching locale when using the BCP 47 Lookup algorithm. Locales appear /// availableLocales has a matching locale.
/// in the same order in the returned list as in `requestedLocales`.
/// ///
/// # Note /// # Note
/// ///
/// This differs a bit from the spec, since we don't have an `[[AvailableLocales]]` /// Calling this function with a singleton `KeyedDataMarker` will always return `None`.
/// list to compare with. However, we can do data requests to a [`DataProvider`]
/// in order to see if a certain [`Locale`] is supported.
///
/// [spec]: https://tc39.es/ecma402/#sec-lookupsupportedlocales
fn lookup_supported_locales<M: KeyedDataMarker>(
requested_locales: &[Locale],
provider: &(impl DataProvider<M> + ?Sized),
) -> Vec<Locale> {
// 1. Let subset be a new empty List.
// 2. For each element locale of requestedLocales, do
// a. Let noExtensionsLocale be the String value that is locale with any Unicode locale extension sequences removed.
// b. Let availableLocale be ! BestAvailableLocale(availableLocales, noExtensionsLocale).
// c. If availableLocale is not undefined, append locale to the end of subset.
// 3. Return subset.
requested_locales
.iter()
.filter(|loc| best_available_locale(loc.id.clone(), provider).is_some())
.cloned()
.collect()
}
/// Abstract operation [`BestFitSupportedLocales ( availableLocales, requestedLocales )`][spec]
///
/// Returns the subset of the provided BCP 47 language priority list `requestedLocales` for which
/// `availableLocales` has a matching locale when using the Best Fit Matcher algorithm. Locales appear
/// in the same order in the returned list as in requestedLocales.
///
/// [spec]: https://tc39.es/ecma402/#sec-bestfitsupportedlocales
fn best_fit_supported_locales<M: KeyedDataMarker>(
requested_locales: &[Locale],
provider: &(impl DataProvider<M> + ?Sized),
) -> Vec<Locale> {
requested_locales
.iter()
.filter(|loc| best_locale_for_provider(loc.id.clone(), provider).is_some())
.cloned()
.collect()
}
/// Abstract operation [`SupportedLocales ( availableLocales, requestedLocales, options )`][spec]
///
/// Returns the subset of the provided BCP 47 language priority list requestedLocales for which
/// availableLocales has a matching locale
/// ///
/// [spec]: https://tc39.es/ecma402/#sec-supportedlocales /// [spec]: https://tc39.es/ecma402/#sec-supportedlocales
pub(in crate::builtins::intl) fn supported_locales<M: KeyedDataMarker>( pub(in crate::builtins::intl) fn filter_locales<M: KeyedDataMarker>(
requested_locales: &[Locale], requested_locales: Vec<Locale>,
options: &JsValue, options: &JsValue,
context: &mut Context, context: &mut Context,
) -> JsResult<JsObject> ) -> JsResult<JsObject>
@ -553,22 +420,36 @@ where
// 2. Let matcher be ? GetOption(options, "localeMatcher", string, « "lookup", "best fit" », "best fit"). // 2. Let matcher be ? GetOption(options, "localeMatcher", string, « "lookup", "best fit" », "best fit").
let matcher = get_option(&options, js_str!("localeMatcher"), context)?.unwrap_or_default(); let matcher = get_option(&options, js_str!("localeMatcher"), context)?.unwrap_or_default();
let elements = match matcher { // 3. Let subset be a new empty List.
// 4. Else, let mut subset = Vec::with_capacity(requested_locales.len());
// a. Let supportedLocales be LookupSupportedLocales(availableLocales, requestedLocales).
// 4. For each element locale of requestedLocales, do
for locale in requested_locales {
// a. Let noExtensionsLocale be the String value that is locale with any Unicode locale extension sequences removed.
let mut no_ext_loc = locale.clone();
no_ext_loc.extensions.unicode.clear();
let loc_match = match matcher {
// b. If matcher is "lookup", then
// i. Let match be LookupMatchingLocaleByPrefix(availableLocales, noExtensionsLocale).
LocaleMatcher::Lookup => { LocaleMatcher::Lookup => {
lookup_supported_locales(requested_locales, context.intl_provider()) lookup_matching_locale_by_prefix([no_ext_loc], context.intl_provider())
} }
// 3. If matcher is "best fit", then // c. Else,
// a. Let supportedLocales be BestFitSupportedLocales(availableLocales, requestedLocales). // i. Let match be LookupMatchingLocaleByBestFit(availableLocales, noExtensionsLocale).
LocaleMatcher::BestFit => { LocaleMatcher::BestFit => {
best_fit_supported_locales(requested_locales, context.intl_provider()) lookup_matching_locale_by_best_fit([no_ext_loc], context.intl_provider())
} }
}; };
// 5. Return CreateArrayFromList(supportedLocales). // d. If match is not undefined, append locale to subset.
if loc_match.is_some() {
subset.push(locale);
}
}
// 5. Return CreateArrayFromList(subset).
Ok(Array::create_array_from_list( Ok(Array::create_array_from_list(
elements subset
.into_iter() .into_iter()
.map(|loc| js_string!(loc.to_string()).into()), .map(|loc| js_string!(loc.to_string()).into()),
context, context,
@ -577,6 +458,10 @@ where
/// Validates that the unicode extension `key` with `value` is a valid extension value for the /// Validates that the unicode extension `key` with `value` is a valid extension value for the
/// `language`. /// `language`.
///
/// # Note
///
/// Calling this function with a singleton `KeyedDataMarker` will always return `None`.
pub(in crate::builtins::intl) fn validate_extension<M: KeyedDataMarker>( pub(in crate::builtins::intl) fn validate_extension<M: KeyedDataMarker>(
language: LanguageIdentifier, language: LanguageIdentifier,
key: Key, key: Key,
@ -597,54 +482,47 @@ pub(in crate::builtins::intl) fn validate_extension<M: KeyedDataMarker>(
.is_some() .is_some()
} }
#[cfg(test)] #[cfg(all(test, feature = "intl_bundled"))]
mod tests { mod tests {
use icu_locid::{langid, locale, Locale}; use icu_locid::{langid, locale, Locale};
use icu_plurals::provider::CardinalV1Marker; use icu_plurals::provider::CardinalV1Marker;
use icu_provider::AsDeserializingBufferProvider;
use crate::{ use crate::{
builtins::intl::locale::utils::{ builtins::intl::locale::utils::{
best_available_locale, best_fit_matcher, default_locale, lookup_matcher, lookup_matching_locale_by_best_fit, lookup_matching_locale_by_prefix,
}, },
context::icu::IntlProvider, context::icu::IntlProvider,
}; };
#[test] #[test]
fn best_avail_loc() { fn best_fit() {
let provider = boa_icu_provider::buffer(); let icu = &IntlProvider::try_new_with_buffer_provider(boa_icu_provider::buffer()).unwrap();
let provider = provider.as_deserializing();
assert_eq!( assert_eq!(
best_available_locale::<CardinalV1Marker>(langid!("en"), &provider), lookup_matching_locale_by_best_fit::<CardinalV1Marker>([locale!("en")], icu),
Some(langid!("en")) Some(locale!("en"))
); );
assert_eq!( assert_eq!(
best_available_locale::<CardinalV1Marker>(langid!("es-ES"), &provider), lookup_matching_locale_by_best_fit::<CardinalV1Marker>([locale!("es-ES")], icu),
Some(langid!("es")) Some(locale!("es"))
); );
assert_eq!( assert_eq!(
best_available_locale::<CardinalV1Marker>(langid!("kr"), &provider), lookup_matching_locale_by_best_fit::<CardinalV1Marker>([locale!("kr")], icu),
None None
); );
} }
#[test] #[test]
fn lookup_match() { fn lookup_match() {
let icu = IntlProvider::try_new_with_buffer_provider(boa_icu_provider::buffer()).unwrap(); let icu = &IntlProvider::try_new_with_buffer_provider(boa_icu_provider::buffer()).unwrap();
// requested: []
let res = lookup_matcher::<CardinalV1Marker>(&[], &icu);
assert_eq!(res, default_locale(icu.locale_canonicalizer()));
assert!(res.extensions.is_empty());
// requested: [fr-FR-u-hc-h12] // requested: [fr-FR-u-hc-h12]
let requested: Locale = "fr-FR-u-hc-h12".parse().unwrap(); let requested: Locale = "fr-FR-u-hc-h12".parse().unwrap();
let result = lookup_matcher::<CardinalV1Marker>(&[requested.clone()], &icu); let result =
lookup_matching_locale_by_prefix::<CardinalV1Marker>([requested.clone()], icu).unwrap();
assert_eq!(result.id, langid!("fr")); assert_eq!(result.id, langid!("fr"));
assert_eq!(result.extensions, requested.extensions); assert_eq!(result.extensions, requested.extensions);
@ -655,7 +533,7 @@ mod tests {
let uz = locale!("uz-Cyrl"); let uz = locale!("uz-Cyrl");
let requested = vec![kr, gr, es.clone(), uz]; let requested = vec![kr, gr, es.clone(), uz];
let res = best_fit_matcher::<CardinalV1Marker>(&requested, &icu); let res = lookup_matching_locale_by_best_fit::<CardinalV1Marker>(requested, icu).unwrap();
assert_eq!(res.id, langid!("es")); assert_eq!(res.id, langid!("es"));
assert_eq!(res.extensions, es.extensions); assert_eq!(res.extensions, es.extensions);
} }

10
core/engine/src/builtins/intl/mod.rs

@ -28,6 +28,7 @@ use crate::{
use boa_gc::{Finalize, Trace}; use boa_gc::{Finalize, Trace};
use boa_profiler::Profiler; use boa_profiler::Profiler;
use icu_provider::KeyedDataMarker; use icu_provider::KeyedDataMarker;
use static_assertions::const_assert;
pub(crate) mod collator; pub(crate) mod collator;
pub(crate) mod date_time_format; pub(crate) mod date_time_format;
@ -44,6 +45,15 @@ pub(crate) use self::{
mod options; mod options;
// No singletons are allowed as lang markers.
// Hopefully, we'll be able to migrate this to the definition of `Service` in the future
// (https://github.com/rust-lang/rust/issues/76560)
//
// Each assertion below is checked at compile time: the build fails if the data key of a
// service's `LangMarker` reports `singleton` in its metadata.
// NOTE(review): presumably a singleton key carries no per-locale data and therefore cannot be
// used to decide which locales a service supports -- confirm against the locale utilities.
const_assert! {!<Collator as Service>::LangMarker::KEY.metadata().singleton}
const_assert! {!<ListFormat as Service>::LangMarker::KEY.metadata().singleton}
const_assert! {!<NumberFormat as Service>::LangMarker::KEY.metadata().singleton}
const_assert! {!<PluralRules as Service>::LangMarker::KEY.metadata().singleton}
const_assert! {!<Segmenter as Service>::LangMarker::KEY.metadata().singleton}
/// JavaScript `Intl` object. /// JavaScript `Intl` object.
#[derive(Debug, Clone, Trace, Finalize, JsData)] #[derive(Debug, Clone, Trace, Finalize, JsData)]
#[boa_gc(unsafe_empty_trace)] #[boa_gc(unsafe_empty_trace)]

8
core/engine/src/builtins/intl/number_format/mod.rs

@ -39,7 +39,7 @@ use crate::{
}; };
use super::{ use super::{
locale::{canonicalize_locale_list, resolve_locale, supported_locales, validate_extension}, locale::{canonicalize_locale_list, filter_locales, resolve_locale, validate_extension},
options::{coerce_options_to_object, IntlOptions}, options::{coerce_options_to_object, IntlOptions},
Service, Service,
}; };
@ -237,7 +237,7 @@ impl BuiltInConstructor for NumberFormat {
// 9. Let localeData be %Intl.NumberFormat%.[[LocaleData]]. // 9. Let localeData be %Intl.NumberFormat%.[[LocaleData]].
// 10. Let r be ResolveLocale(%Intl.NumberFormat%.[[AvailableLocales]], requestedLocales, opt, %Intl.NumberFormat%.[[RelevantExtensionKeys]], localeData). // 10. Let r be ResolveLocale(%Intl.NumberFormat%.[[AvailableLocales]], requestedLocales, opt, %Intl.NumberFormat%.[[RelevantExtensionKeys]], localeData).
let locale = resolve_locale::<Self>( let locale = resolve_locale::<Self>(
&requested_locales, requested_locales,
&mut intl_options, &mut intl_options,
context.intl_provider(), context.intl_provider(),
); );
@ -465,8 +465,8 @@ impl NumberFormat {
// 2. Let requestedLocales be ? CanonicalizeLocaleList(locales). // 2. Let requestedLocales be ? CanonicalizeLocaleList(locales).
let requested_locales = canonicalize_locale_list(locales, context)?; let requested_locales = canonicalize_locale_list(locales, context)?;
// 3. Return ? SupportedLocales(availableLocales, requestedLocales, options). // 3. Return ? FilterLocales(availableLocales, requestedLocales, options).
supported_locales::<<Self as Service>::LangMarker>(&requested_locales, options, context) filter_locales::<<Self as Service>::LangMarker>(requested_locales, options, context)
.map(JsValue::from) .map(JsValue::from)
} }

8
core/engine/src/builtins/intl/plural_rules/mod.rs

@ -26,7 +26,7 @@ use crate::{
}; };
use super::{ use super::{
locale::{canonicalize_locale_list, resolve_locale, supported_locales}, locale::{canonicalize_locale_list, filter_locales, resolve_locale},
number_format::{DigitFormatOptions, Extrema, NotationKind}, number_format::{DigitFormatOptions, Extrema, NotationKind},
options::{coerce_options_to_object, IntlOptions}, options::{coerce_options_to_object, IntlOptions},
Service, Service,
@ -136,7 +136,7 @@ impl BuiltInConstructor for PluralRules {
// 10. Let r be ResolveLocale(%PluralRules%.[[AvailableLocales]], requestedLocales, opt, %PluralRules%.[[RelevantExtensionKeys]], localeData). // 10. Let r be ResolveLocale(%PluralRules%.[[AvailableLocales]], requestedLocales, opt, %PluralRules%.[[RelevantExtensionKeys]], localeData).
// 11. Set pluralRules.[[Locale]] to r.[[locale]]. // 11. Set pluralRules.[[Locale]] to r.[[locale]].
let locale = resolve_locale::<Self>( let locale = resolve_locale::<Self>(
&requested_locales, requested_locales,
&mut IntlOptions { &mut IntlOptions {
matcher, matcher,
..Default::default() ..Default::default()
@ -292,8 +292,8 @@ impl PluralRules {
// 2. Let requestedLocales be ? CanonicalizeLocaleList(locales). // 2. Let requestedLocales be ? CanonicalizeLocaleList(locales).
let requested_locales = canonicalize_locale_list(locales, context)?; let requested_locales = canonicalize_locale_list(locales, context)?;
// 3. Return ? SupportedLocales(availableLocales, requestedLocales, options). // 3. Return ? FilterLocales(availableLocales, requestedLocales, options).
supported_locales::<<Self as Service>::LangMarker>(&requested_locales, options, context) filter_locales::<<Self as Service>::LangMarker>(requested_locales, options, context)
.map(JsValue::from) .map(JsValue::from)
} }

19
core/engine/src/builtins/intl/segmenter/mod.rs

@ -3,10 +3,9 @@ use std::ops::Range;
use boa_gc::{Finalize, Trace}; use boa_gc::{Finalize, Trace};
use boa_macros::js_str; use boa_macros::js_str;
use boa_profiler::Profiler; use boa_profiler::Profiler;
use icu_collator::provider::CollationDiacriticsV1Marker;
use icu_locid::Locale; use icu_locid::Locale;
use icu_segmenter::{ use icu_segmenter::{GraphemeClusterSegmenter, SentenceSegmenter, WordSegmenter};
provider::WordBreakDataV1Marker, GraphemeClusterSegmenter, SentenceSegmenter, WordSegmenter,
};
use crate::{ use crate::{
builtins::{ builtins::{
@ -30,7 +29,7 @@ pub(crate) use options::*;
pub(crate) use segments::*; pub(crate) use segments::*;
use super::{ use super::{
locale::{canonicalize_locale_list, resolve_locale, supported_locales}, locale::{canonicalize_locale_list, filter_locales, resolve_locale},
options::IntlOptions, options::IntlOptions,
Service, Service,
}; };
@ -79,7 +78,9 @@ impl NativeSegmenter {
} }
impl Service for Segmenter { impl Service for Segmenter {
type LangMarker = WordBreakDataV1Marker; // TODO: Track https://github.com/unicode-org/icu4x/issues/3284
// and replace when segmenters are locale-aware.
type LangMarker = CollationDiacriticsV1Marker;
type LocaleOptions = (); type LocaleOptions = ();
} }
@ -134,7 +135,7 @@ impl BuiltInConstructor for Segmenter {
let options = args.get_or_undefined(1); let options = args.get_or_undefined(1);
// 4. Let requestedLocales be ? CanonicalizeLocaleList(locales). // 4. Let requestedLocales be ? CanonicalizeLocaleList(locales).
let locales = canonicalize_locale_list(locales, context)?; let requested_locales = canonicalize_locale_list(locales, context)?;
// 5. Set options to ? GetOptionsObject(options). // 5. Set options to ? GetOptionsObject(options).
let options = get_options_object(options)?; let options = get_options_object(options)?;
@ -148,7 +149,7 @@ impl BuiltInConstructor for Segmenter {
// 10. Let r be ResolveLocale(%Segmenter%.[[AvailableLocales]], requestedLocales, opt, %Segmenter%.[[RelevantExtensionKeys]], localeData). // 10. Let r be ResolveLocale(%Segmenter%.[[AvailableLocales]], requestedLocales, opt, %Segmenter%.[[RelevantExtensionKeys]], localeData).
// 11. Set segmenter.[[Locale]] to r.[[locale]]. // 11. Set segmenter.[[Locale]] to r.[[locale]].
let locale = resolve_locale::<Self>( let locale = resolve_locale::<Self>(
&locales, requested_locales,
&mut IntlOptions { &mut IntlOptions {
matcher, matcher,
..Default::default() ..Default::default()
@ -214,8 +215,8 @@ impl Segmenter {
// 2. Let requestedLocales be ? CanonicalizeLocaleList(locales). // 2. Let requestedLocales be ? CanonicalizeLocaleList(locales).
let requested_locales = canonicalize_locale_list(locales, context)?; let requested_locales = canonicalize_locale_list(locales, context)?;
// 3. Return ? SupportedLocales(availableLocales, requestedLocales, options). // 3. Return ? FilterLocales(availableLocales, requestedLocales, options).
supported_locales::<<Self as Service>::LangMarker>(&requested_locales, options, context) filter_locales::<<Self as Service>::LangMarker>(requested_locales, options, context)
.map(JsValue::from) .map(JsValue::from)
} }

34
core/engine/src/builtins/string/mod.rs

@ -1743,10 +1743,12 @@ impl String {
#[cfg(feature = "intl")] #[cfg(feature = "intl")]
{ {
use super::intl::locale::{ use super::intl::locale::{
best_available_locale, canonicalize_locale_list, default_locale, canonicalize_locale_list, default_locale, lookup_matching_locale_by_prefix,
}; };
use icu_casemap::provider::CaseMapV1Marker; // TODO: Small hack to make lookups behave.
use icu_locid::LanguageIdentifier; // We would really like to be able to use `icu_casemap::provider::CaseMapV1Marker`
use icu_locid::Locale;
use icu_plurals::provider::OrdinalV1Marker;
// 1. Let O be ? RequireObjectCoercible(this value). // 1. Let O be ? RequireObjectCoercible(this value).
let this = this.require_object_coercible()?; let this = this.require_object_coercible()?;
@ -1762,19 +1764,25 @@ impl String {
// 1. Let requestedLocales be ? CanonicalizeLocaleList(locales). // 1. Let requestedLocales be ? CanonicalizeLocaleList(locales).
// 2. If requestedLocales is not an empty List, then // 2. If requestedLocales is not an empty List, then
// a. Let requestedLocale be requestedLocales[0]. // a. Let requestedLocale be requestedLocales[0].
let lang = canonicalize_locale_list(args.get_or_undefined(0), context)? let mut requested_locale = canonicalize_locale_list(args.get_or_undefined(0), context)?
.into_iter() .into_iter()
.next() .next()
// 3. Else, // 3. Else,
// a. Let requestedLocale be ! DefaultLocale(). // a. Let requestedLocale be ! DefaultLocale().
.unwrap_or_else(|| default_locale(context.intl_provider().locale_canonicalizer())) .unwrap_or_else(|| default_locale(context.intl_provider().locale_canonicalizer()));
.id;
// 4. Let noExtensionsLocale be the String value that is requestedLocale with any Unicode locale extension sequences (6.2.1) removed. // 4. Let noExtensionsLocale be the String value that is requestedLocale with any Unicode locale extension sequences (6.2.1) removed.
// 5. Let availableLocales be a List with language tags that includes the languages for which the Unicode Character Database contains language sensitive case mappings. Implementations may add additional language tags if they support case mapping for additional locales. requested_locale.extensions.unicode.clear();
// 6. Let locale be ! BestAvailableLocale(availableLocales, noExtensionsLocale).
// 7. If locale is undefined, set locale to "und". // 5. Let availableLocales be a List with language tags that includes the languages for which the Unicode
let lang = best_available_locale::<CaseMapV1Marker>(lang, context.intl_provider()) // Character Database contains language sensitive case mappings. Implementations may add additional
.unwrap_or(LanguageIdentifier::UND); // language tags if they support case mapping for additional locales.
// 6. Let match be LookupMatchingLocaleByPrefix(availableLocales, noExtensionsLocale).
// 7. If match is not undefined, let locale be match.[[locale]]; else let locale be "und".
let locale = lookup_matching_locale_by_prefix::<OrdinalV1Marker>(
[requested_locale],
context.intl_provider(),
)
.unwrap_or(Locale::UND);
let casemapper = context.intl_provider().case_mapper(); let casemapper = context.intl_provider().case_mapper();
@ -1784,11 +1792,11 @@ impl String {
// 10. Else, // 10. Else,
// a. Assert: targetCase is upper. // a. Assert: targetCase is upper.
// b. Let newCodePoints be a List whose elements are the result of an uppercase transformation of codePoints according to an implementation-derived algorithm using locale or the Unicode Default Case Conversion algorithm. // b. Let newCodePoints be a List whose elements are the result of an uppercase transformation of codePoints according to an implementation-derived algorithm using locale or the Unicode Default Case Conversion algorithm.
casemapper.uppercase_to_string(&segment, &lang) casemapper.uppercase_to_string(&segment, &locale.id)
} else { } else {
// 9. If targetCase is lower, then // 9. If targetCase is lower, then
// a. Let newCodePoints be a List whose elements are the result of a lowercase transformation of codePoints according to an implementation-derived algorithm using locale or the Unicode Default Case Conversion algorithm. // a. Let newCodePoints be a List whose elements are the result of a lowercase transformation of codePoints according to an implementation-derived algorithm using locale or the Unicode Default Case Conversion algorithm.
casemapper.lowercase_to_string(&segment, &lang) casemapper.lowercase_to_string(&segment, &locale.id)
} }
}); });

22
core/engine/src/context/mod.rs

@ -940,6 +940,14 @@ impl ContextBuilder {
/// ///
/// This function is only available if the `intl` feature is enabled. /// This function is only available if the `intl` feature is enabled.
/// ///
/// # Additional considerations
///
/// If the data was generated using `icu_datagen`, make sure that the deduplication strategy is
/// not set to [`Maximal`]. Otherwise, `icu_datagen` will delete base locales such as "en" from
/// the list of supported locales if the required data for "en" is the same as "und".
/// We recommend [`RetainBaseLanguages`] as a nice default, which will only deduplicate locales
/// if the deduplication target is not "und".
///
/// # Errors /// # Errors
/// ///
/// This returns `Err` if the provided provider doesn't have the required locale information /// This returns `Err` if the provided provider doesn't have the required locale information
@ -947,6 +955,9 @@ impl ContextBuilder {
/// mean that the provider will successfully construct all `Intl` services; that check is made /// mean that the provider will successfully construct all `Intl` services; that check is made
/// until the creation of an instance of a service. /// until the creation of an instance of a service.
/// ///
/// [`Maximal`]: https://docs.rs/icu_datagen/latest/icu_datagen/enum.DeduplicationStrategy.html#variant.Maximal
/// [`RetainBaseLanguages`]: https://docs.rs/icu_datagen/latest/icu_datagen/enum.DeduplicationStrategy.html#variant.RetainBaseLanguages
/// [`ResolveLocale`]: https://tc39.es/ecma402/#sec-resolvelocale
/// [`LocaleCanonicalizer`]: icu_locid_transform::LocaleCanonicalizer /// [`LocaleCanonicalizer`]: icu_locid_transform::LocaleCanonicalizer
/// [`LocaleExpander`]: icu_locid_transform::LocaleExpander /// [`LocaleExpander`]: icu_locid_transform::LocaleExpander
/// [`BufferProvider`]: icu_provider::BufferProvider /// [`BufferProvider`]: icu_provider::BufferProvider
@ -963,6 +974,14 @@ impl ContextBuilder {
/// ///
/// This function is only available if the `intl` feature is enabled. /// This function is only available if the `intl` feature is enabled.
/// ///
/// # Additional considerations
///
/// If the data was generated using `icu_datagen`, make sure that the deduplication strategy is
/// not set to [`Maximal`]. Otherwise, `icu_datagen` will delete base locales such as "en" from
/// the list of supported locales if the required data for "en" is the same as "und".
/// We recommend [`RetainBaseLanguages`] as a nice default, which will only deduplicate locales
/// if the deduplication target is not "und".
///
/// # Errors /// # Errors
/// ///
/// This returns `Err` if the provided provider doesn't have the required locale information /// This returns `Err` if the provided provider doesn't have the required locale information
@ -970,6 +989,9 @@ impl ContextBuilder {
/// mean that the provider will successfully construct all `Intl` services; that check is made /// mean that the provider will successfully construct all `Intl` services; that check is made
/// until the creation of an instance of a service. /// until the creation of an instance of a service.
/// ///
/// [`Maximal`]: https://docs.rs/icu_datagen/latest/icu_datagen/enum.DeduplicationStrategy.html#variant.Maximal
/// [`RetainBaseLanguages`]: https://docs.rs/icu_datagen/latest/icu_datagen/enum.DeduplicationStrategy.html#variant.RetainBaseLanguages
/// [`ResolveLocale`]: https://tc39.es/ecma402/#sec-resolvelocale
/// [`LocaleCanonicalizer`]: icu_locid_transform::LocaleCanonicalizer /// [`LocaleCanonicalizer`]: icu_locid_transform::LocaleCanonicalizer
/// [`LocaleExpander`]: icu_locid_transform::LocaleExpander /// [`LocaleExpander`]: icu_locid_transform::LocaleExpander
/// [`AnyProvider`]: icu_provider::AnyProvider /// [`AnyProvider`]: icu_provider::AnyProvider

BIN
core/icu_provider/data/icudata.postcard

Binary file not shown.

2
test262_config.toml

@ -1,4 +1,4 @@
commit = "b73f7d662d51584bfee6d3ed274b676d313b646a" commit = "c00830acef42bdb0e917b5fdec76ed9d399c0eea"
[ignored] [ignored]
# Not implemented yet: # Not implemented yet:

4
tests/tester/src/edition.rs

@ -81,6 +81,10 @@ static FEATURE_EDITION: phf::Map<&'static str, SpecEdition> = phf::phf_map! {
// https://github.com/tc39/proposal-iterator-helpers // https://github.com/tc39/proposal-iterator-helpers
"iterator-helpers" => SpecEdition::ESNext, "iterator-helpers" => SpecEdition::ESNext,
// Promise.try
// https://github.com/tc39/proposal-promise-try
"promise-try" => SpecEdition::ESNext,
// Set methods // Set methods
// https://github.com/tc39/proposal-set-methods // https://github.com/tc39/proposal-set-methods
"set-methods" => SpecEdition::ESNext, "set-methods" => SpecEdition::ESNext,

18
tools/gen-icu4x-data/Cargo.toml

@ -10,9 +10,15 @@ license.workspace = true
description.workspace = true description.workspace = true
[dependencies] [dependencies]
icu_provider = { workspace = true, features = ["datagen"] } icu_provider.workspace = true
icu_provider_blob = { workspace = true, features = ["export"] } icu_datagen = { workspace = true, features = [
icu_datagen = { workspace = true, features = ["networking", "use_wasm"] } "networking",
"use_wasm",
"provider",
"blob_exporter",
"experimental_components",
"rayon",
] }
log.workspace = true log.workspace = true
simple_logger.workspace = true simple_logger.workspace = true
@ -28,12 +34,6 @@ icu_normalizer = { workspace = true, features = ["datagen"] }
icu_plurals = { workspace = true, features = ["datagen", "experimental"] } icu_plurals = { workspace = true, features = ["datagen", "experimental"] }
icu_segmenter = { workspace = true, features = ["datagen"] } icu_segmenter = { workspace = true, features = ["datagen"] }
[target.'cfg(windows)'.dependencies]
# wasmer-wasi apparently has a wrong deps config...
# This dep patches that.
winapi = { workspace = true, features = ["sysinfoapi"] }
[lints] [lints]
workspace = true workspace = true

124
tools/gen-icu4x-data/src/main.rs

@ -2,86 +2,18 @@
use std::{error::Error, fs::File, path::Path}; use std::{error::Error, fs::File, path::Path};
use icu_datagen::{CoverageLevel, DatagenDriver, DatagenProvider}; use icu_datagen::blob_exporter::BlobExporter;
use icu_plurals::provider::{PluralRangesV1, PluralRangesV1Marker}; use icu_datagen::prelude::*;
use icu_provider::{ use icu_provider::data_key;
datagen::{ExportMarker, IterableDynamicDataProvider},
dynutil::UpcastDataPayload,
prelude::*,
};
use icu_provider_blob::export::BlobExporter;
#[cfg(target_os = "windows")] // wasmer-wasi is a really fun dependency to maintain :)
use winapi as _;
/// Hack that associates the `und` locale with an empty plural ranges data.
/// This enables the default behaviour for all locales without data.
///
/// Newtype over a [`DatagenProvider`]: its `load_data` implementations answer requests for the
/// plural ranges key on the `und` locale with an empty `PluralRangesV1`, and forward every
/// other request to the wrapped provider unchanged.
#[derive(Debug)]
struct PluralRangesFallbackHack(DatagenProvider);
// `Default::default()` is spelled out on purpose: naming the concrete type of `ranges` would
// mean importing extra dependencies just to call `T::default`.
#[allow(clippy::default_trait_access)]
impl DynamicDataProvider<AnyMarker> for PluralRangesFallbackHack {
    /// Loads `key` for the requested locale, synthesizing empty plural ranges for `und`.
    ///
    /// Any request that is not "plural ranges for the `und` locale" is delegated to the
    /// wrapped provider.
    fn load_data(
        &self,
        key: DataKey,
        req: DataRequest<'_>,
    ) -> Result<DataResponse<AnyMarker>, DataError> {
        let wants_und_plural_ranges =
            req.locale.is_und() && key.hashed() == PluralRangesV1Marker::KEY.hashed();
        if !wants_und_plural_ranges {
            return self.0.load_data(key, req);
        }

        // Build the empty payload and erase its type into `AnyMarker`.
        let empty_ranges = PluralRangesV1 {
            ranges: Default::default(),
        };
        let payload = <AnyMarker as UpcastDataPayload<PluralRangesV1Marker>>::upcast(
            DataPayload::from_owned(empty_ranges),
        );

        Ok(DataResponse {
            metadata: DataResponseMetadata::default(),
            payload: Some(payload),
        })
    }
}
#[allow(clippy::default_trait_access)] const KEYS_LEN: usize = 129;
impl DynamicDataProvider<ExportMarker> for PluralRangesFallbackHack {
    /// Export-side counterpart of the `AnyMarker` loader: returns empty plural ranges when the
    /// plural ranges key is requested for `und`, and defers to the wrapped provider otherwise.
    fn load_data(
        &self,
        key: DataKey,
        req: DataRequest<'_>,
    ) -> Result<DataResponse<ExportMarker>, DataError> {
        let is_plural_ranges_for_und =
            req.locale.is_und() && key.hashed() == PluralRangesV1Marker::KEY.hashed();
        if !is_plural_ranges_for_und {
            return self.0.load_data(key, req);
        }

        // Empty ranges, upcast into the marker type used by the exporter.
        let fallback = PluralRangesV1 {
            ranges: Default::default(),
        };
        let payload = <ExportMarker as UpcastDataPayload<PluralRangesV1Marker>>::upcast(
            DataPayload::from_owned(fallback),
        );

        Ok(DataResponse {
            metadata: DataResponseMetadata::default(),
            payload: Some(payload),
        })
    }
}
impl IterableDynamicDataProvider<ExportMarker> for PluralRangesFallbackHack {
    /// Lists the locales the wrapped provider supports for `key`, appending the default
    /// (`DataLocale::default()`) locale when `key` is the plural ranges key.
    fn supported_locales_for_key(&self, key: DataKey) -> Result<Vec<DataLocale>, DataError> {
        let mut locales = self.0.supported_locales_for_key(key)?;
        // Advertise the extra locale so the synthesized fallback payload gets exported too.
        if key.hashed() == PluralRangesV1Marker::KEY.hashed() {
            locales.push(DataLocale::default());
        }
        Ok(locales)
    }
}
/// List of keys used by `Intl` components. /// List of keys used by `Intl` components.
/// ///
/// This must be kept in sync with the list of implemented components of `Intl`. /// This must be kept in sync with the list of implemented components of `Intl`.
const KEYS: [&[DataKey]; 9] = [ const KEYS: [DataKey; KEYS_LEN] = {
const CENTINEL_KEY: DataKey = data_key!("centinel@1");
const SERVICES: [&[DataKey]; 9] = [
icu_casemap::provider::KEYS, icu_casemap::provider::KEYS,
icu_collator::provider::KEYS, icu_collator::provider::KEYS,
icu_datetime::provider::KEYS, icu_datetime::provider::KEYS,
@ -91,7 +23,29 @@ const KEYS: [&[DataKey]; 9] = [
icu_normalizer::provider::KEYS, icu_normalizer::provider::KEYS,
icu_plurals::provider::KEYS, icu_plurals::provider::KEYS,
icu_segmenter::provider::KEYS, icu_segmenter::provider::KEYS,
]; ];
let mut array = [CENTINEL_KEY; KEYS_LEN];
let mut offset = 0;
let mut service_idx = 0;
while service_idx < SERVICES.len() {
let service = SERVICES[service_idx];
let mut idx = 0;
while idx < service.len() {
array[offset + idx] = service[idx];
idx += 1;
}
offset += service.len();
service_idx += 1;
}
assert!(offset == array.len());
array
};
fn main() -> Result<(), Box<dyn Error>> { fn main() -> Result<(), Box<dyn Error>> {
simple_logger::SimpleLogger::new() simple_logger::SimpleLogger::new()
@ -106,18 +60,26 @@ fn main() -> Result<(), Box<dyn Error>> {
let _unused = std::fs::remove_dir_all(path); let _unused = std::fs::remove_dir_all(path);
std::fs::create_dir_all(path)?; std::fs::create_dir_all(path)?;
log::info!("Generating ICU4X data for keys: {:?}", KEYS); log::info!("Generating ICU4X data for keys: {:#?}", KEYS);
let provider = DatagenProvider::new_latest_tested(); let provider = DatagenProvider::new_latest_tested();
let locales = provider
.locales_for_coverage_levels([CoverageLevel::Modern])?
.into_iter()
.chain([langid!("en-US")]);
DatagenDriver::new() DatagenDriver::new()
.with_keys(KEYS.into_iter().flatten().copied()) .with_keys(KEYS)
.with_locales(provider.locales_for_coverage_levels([CoverageLevel::Modern])?) .with_locales_and_fallback(locales.map(LocaleFamily::with_descendants), {
let mut options = FallbackOptions::default();
options.deduplication_strategy = Some(DeduplicationStrategy::None);
options
})
.with_additional_collations([String::from("search*")]) .with_additional_collations([String::from("search*")])
.with_recommended_segmenter_models() .with_recommended_segmenter_models()
.export( .export(
&PluralRangesFallbackHack(provider), &provider,
BlobExporter::new_with_sink(Box::new(File::create(path.join("icudata.postcard"))?)), BlobExporter::new_v2_with_sink(Box::new(File::create(path.join("icudata.postcard"))?)),
)?; )?;
Ok(()) Ok(())

Loading…
Cancel
Save