Browse Source

Split default icu data into lazily deserialized parts (#3948)

* Split default icu data into lazily deserialized parts

* Fix no_std compilation

* Lazily load more ICU tools

* Fix regressions and use more stable constructors
pull/3798/merge
José Julián Espina 3 months ago committed by GitHub
parent
commit
00f8e00492
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 11
      Cargo.lock
  2. 33
      core/engine/src/builtins/intl/collator/mod.rs
  3. 44
      core/engine/src/builtins/intl/list_format/mod.rs
  4. 16
      core/engine/src/builtins/intl/locale/mod.rs
  5. 11
      core/engine/src/builtins/intl/locale/tests.rs
  6. 23
      core/engine/src/builtins/intl/locale/utils.rs
  7. 30
      core/engine/src/builtins/intl/number_format/mod.rs
  8. 28
      core/engine/src/builtins/intl/plural_rules/mod.rs
  9. 41
      core/engine/src/builtins/intl/segmenter/mod.rs
  10. 19
      core/engine/src/builtins/string/mod.rs
  11. 123
      core/engine/src/context/icu.rs
  12. 5
      core/engine/src/context/mod.rs
  13. 10
      core/icu_provider/Cargo.toml
  14. BIN
      core/icu_provider/data/icu_casemap.postcard
  15. BIN
      core/icu_provider/data/icu_collator.postcard
  16. BIN
      core/icu_provider/data/icu_datetime.postcard
  17. BIN
      core/icu_provider/data/icu_decimal.postcard
  18. BIN
      core/icu_provider/data/icu_list.postcard
  19. BIN
      core/icu_provider/data/icu_locid_transform.postcard
  20. BIN
      core/icu_provider/data/icu_normalizer.postcard
  21. BIN
      core/icu_provider/data/icu_plurals.postcard
  22. BIN
      core/icu_provider/data/icu_segmenter.postcard
  23. 97
      core/icu_provider/src/lib.rs
  24. 1
      tools/gen-icu4x-data/Cargo.toml
  25. 100
      tools/gen-icu4x-data/src/main.rs

11
Cargo.lock generated

@ -443,10 +443,20 @@ dependencies = [
name = "boa_icu_provider"
version = "0.19.0"
dependencies = [
"icu_casemap",
"icu_collator",
"icu_datetime",
"icu_decimal",
"icu_list",
"icu_locid_transform",
"icu_normalizer",
"icu_plurals",
"icu_provider",
"icu_provider_adapters",
"icu_provider_blob",
"icu_segmenter",
"once_cell",
"paste",
]
[[package]]
@ -1364,7 +1374,6 @@ dependencies = [
"icu_locid_transform",
"icu_normalizer",
"icu_plurals",
"icu_provider",
"icu_segmenter",
"log",
"simple_logger",

33
core/engine/src/builtins/intl/collator/mod.rs

@ -17,7 +17,7 @@ use crate::{
OrdinaryObject,
},
context::{
icu::IntlProvider,
icu::{ErasedProvider, IntlProvider},
intrinsics::{Intrinsics, StandardConstructor, StandardConstructors},
},
js_string,
@ -279,7 +279,7 @@ impl BuiltInConstructor for Collator {
requested_locales,
&mut intl_options,
context.intl_provider(),
);
)?;
let collator_locale = {
// `collator_locale` needs to be different from the resolved locale because ECMA402 doesn't
@ -335,18 +335,23 @@ impl BuiltInConstructor for Collator {
.then_some((AlternateHandling::Shifted, MaxVariable::Punctuation))
.unzip();
let collator =
icu_collator::Collator::try_new_unstable(context.intl_provider(), &collator_locale, {
let mut options = icu_collator::CollatorOptions::new();
options.strength = strength;
options.case_level = case_level;
options.case_first = case_first;
options.numeric = Some(if numeric { Numeric::On } else { Numeric::Off });
options.alternate_handling = alternate_handling;
options.max_variable = max_variable;
options
})
.map_err(|e| JsNativeError::typ().with_message(e.to_string()))?;
let mut options = icu_collator::CollatorOptions::new();
options.strength = strength;
options.case_level = case_level;
options.case_first = case_first;
options.numeric = Some(if numeric { Numeric::On } else { Numeric::Off });
options.alternate_handling = alternate_handling;
options.max_variable = max_variable;
let collator = match context.intl_provider().erased_provider() {
ErasedProvider::Any(a) => {
icu_collator::Collator::try_new_with_any_provider(a, &collator_locale, options)
}
ErasedProvider::Buffer(b) => {
icu_collator::Collator::try_new_with_buffer_provider(b, &collator_locale, options)
}
}
.map_err(|e| JsNativeError::typ().with_message(e.to_string()))?;
let prototype =
get_prototype_from_constructor(new_target, StandardConstructors::collator, context)?;

44
core/engine/src/builtins/intl/list_format/mod.rs

@ -12,7 +12,10 @@ use crate::{
options::{get_option, get_options_object},
Array, BuiltInBuilder, BuiltInConstructor, BuiltInObject, IntrinsicObject, OrdinaryObject,
},
context::intrinsics::{Intrinsics, StandardConstructor, StandardConstructors},
context::{
icu::ErasedProvider,
intrinsics::{Intrinsics, StandardConstructor, StandardConstructors},
},
js_string,
object::{internal_methods::get_prototype_from_constructor, JsObject},
property::Attribute,
@ -128,7 +131,7 @@ impl BuiltInConstructor for ListFormat {
..Default::default()
},
context.intl_provider(),
);
)?;
// 11. Let type be ? GetOption(options, "type", string, « "conjunction", "disjunction", "unit" », "conjunction").
// 12. Set listFormat.[[Type]] to type.
@ -142,23 +145,26 @@ impl BuiltInConstructor for ListFormat {
// 16. Let dataLocaleData be localeData.[[<dataLocale>]].
// 17. Let dataLocaleTypes be dataLocaleData.[[<type>]].
// 18. Set listFormat.[[Templates]] to dataLocaleTypes.[[<style>]].
let data_locale = DataLocale::from(&locale);
let formatter = match typ {
ListFormatType::Conjunction => ListFormatter::try_new_and_with_length_unstable(
context.intl_provider(),
&data_locale,
style,
),
ListFormatType::Disjunction => ListFormatter::try_new_or_with_length_unstable(
context.intl_provider(),
&data_locale,
style,
),
ListFormatType::Unit => ListFormatter::try_new_unit_with_length_unstable(
context.intl_provider(),
&data_locale,
style,
),
let data_locale = &DataLocale::from(&locale);
let formatter = match (typ, context.intl_provider().erased_provider()) {
(ListFormatType::Conjunction, ErasedProvider::Any(a)) => {
ListFormatter::try_new_and_with_length_with_any_provider(a, data_locale, style)
}
(ListFormatType::Disjunction, ErasedProvider::Any(a)) => {
ListFormatter::try_new_or_with_length_with_any_provider(a, data_locale, style)
}
(ListFormatType::Unit, ErasedProvider::Any(a)) => {
ListFormatter::try_new_unit_with_length_with_any_provider(a, data_locale, style)
}
(ListFormatType::Conjunction, ErasedProvider::Buffer(b)) => {
ListFormatter::try_new_and_with_length_with_buffer_provider(b, data_locale, style)
}
(ListFormatType::Disjunction, ErasedProvider::Buffer(b)) => {
ListFormatter::try_new_or_with_length_with_buffer_provider(b, data_locale, style)
}
(ListFormatType::Unit, ErasedProvider::Buffer(b)) => {
ListFormatter::try_new_unit_with_length_with_buffer_provider(b, data_locale, style)
}
}
.map_err(|e| JsNativeError::typ().with_message(e.to_string()))?;

16
core/engine/src/builtins/intl/locale/mod.rs

@ -248,7 +248,7 @@ impl BuiltInConstructor for Locale {
// 10. Set tag to ! CanonicalizeUnicodeLocaleId(tag).
context
.intl_provider()
.locale_canonicalizer()
.locale_canonicalizer()?
.canonicalize(&mut tag);
// Skipping some boilerplate since this is easier to do using the `Locale` type, but putting the
@ -282,7 +282,7 @@ impl BuiltInConstructor for Locale {
// 17. Return ! CanonicalizeUnicodeLocaleId(tag).
context
.intl_provider()
.locale_canonicalizer()
.locale_canonicalizer()?
.canonicalize(&mut tag);
}
@ -368,7 +368,7 @@ impl BuiltInConstructor for Locale {
context
.intl_provider()
.locale_canonicalizer()
.locale_canonicalizer()?
.canonicalize(&mut tag);
// 6. Let locale be ? OrdinaryCreateFromConstructor(NewTarget, "%Locale.prototype%", internalSlotsList).
@ -409,7 +409,10 @@ impl Locale {
.clone();
// 3. Let maximal be the result of the Add Likely Subtags algorithm applied to loc.[[Locale]]. If an error is signaled, set maximal to loc.[[Locale]].
context.intl_provider().locale_expander().maximize(&mut loc);
context
.intl_provider()
.locale_expander()?
.maximize(&mut loc);
// 4. Return ! Construct(%Locale%, maximal).
let prototype = context.intrinsics().constructors().locale().prototype();
@ -445,7 +448,10 @@ impl Locale {
.clone();
// 3. Let minimal be the result of the Remove Likely Subtags algorithm applied to loc.[[Locale]]. If an error is signaled, set minimal to loc.[[Locale]].
context.intl_provider().locale_expander().minimize(&mut loc);
context
.intl_provider()
.locale_expander()?
.minimize(&mut loc);
// 4. Return ! Construct(%Locale%, minimal).
let prototype = context.intrinsics().constructors().locale().prototype();

11
core/engine/src/builtins/intl/locale/tests.rs

@ -73,8 +73,8 @@ impl Service for TestService {
#[test]
fn locale_resolution() {
let provider = IntlProvider::try_new_with_buffer_provider(boa_icu_provider::buffer()).unwrap();
let mut default = default_locale(provider.locale_canonicalizer());
let provider = IntlProvider::try_new_with_buffer_provider(boa_icu_provider::buffer());
let mut default = default_locale(provider.locale_canonicalizer().unwrap());
default
.extensions
.unicode
@ -88,7 +88,7 @@ fn locale_resolution() {
hc: Some(HourCycle::H11),
},
};
let locale = resolve_locale::<TestService>([], &mut options, &provider);
let locale = resolve_locale::<TestService>([], &mut options, &provider).unwrap();
assert_eq!(locale, default);
// test best fit
@ -99,7 +99,7 @@ fn locale_resolution() {
},
};
let locale = resolve_locale::<TestService>([], &mut options, &provider);
let locale = resolve_locale::<TestService>([], &mut options, &provider).unwrap();
assert_eq!(locale, default);
// requested: [es-ES]
@ -108,6 +108,7 @@ fn locale_resolution() {
service_options: TestOptions { hc: None },
};
let locale = resolve_locale::<TestService>([locale!("es-AR")], &mut options, &provider);
let locale =
resolve_locale::<TestService>([locale!("es-AR")], &mut options, &provider).unwrap();
assert_eq!(locale, "es-u-hc-h23".parse().unwrap());
}

23
core/engine/src/builtins/intl/locale/utils.rs

@ -132,7 +132,7 @@ pub(crate) fn canonicalize_locale_list(
// vi. Let canonicalizedTag be CanonicalizeUnicodeLocaleId(tag).
context
.intl_provider()
.locale_canonicalizer()
.locale_canonicalizer()?
.canonicalize(&mut tag);
// vii. If canonicalizedTag is not an element of seen, append canonicalizedTag as the last element of seen.
@ -316,7 +316,7 @@ pub(in crate::builtins::intl) fn resolve_locale<S>(
requested_locales: impl IntoIterator<Item = Locale>,
options: &mut IntlOptions<S::LocaleOptions>,
provider: &IntlProvider,
) -> Locale
) -> JsResult<Locale>
where
S: Service,
IntlProvider: DataProvider<S::LangMarker>,
@ -327,12 +327,17 @@ where
// 3. Else,
// a. Let r be LookupMatchingLocaleByBestFit(availableLocales, requestedLocales).
// 4. If r is undefined, set r to the Record { [[locale]]: DefaultLocale(), [[extension]]: empty }.
let mut found_locale = if options.matcher == LocaleMatcher::Lookup {
let found_locale = if options.matcher == LocaleMatcher::Lookup {
lookup_matching_locale_by_prefix::<S::LangMarker>(requested_locales, provider)
} else {
lookup_matching_locale_by_best_fit::<S::LangMarker>(requested_locales, provider)
}
.unwrap_or_else(|| default_locale(provider.locale_canonicalizer()));
};
let mut found_locale = if let Some(loc) = found_locale {
loc
} else {
default_locale(provider.locale_canonicalizer()?)
};
// From here, the spec differs significantly from the implementation,
// since ICU4X allows us to skip some steps and modularize the
@ -388,9 +393,9 @@ where
// 12. Return result.
S::resolve(&mut found_locale, &mut options.service_options, provider);
provider
.locale_canonicalizer()
.locale_canonicalizer()?
.canonicalize(&mut found_locale);
found_locale
Ok(found_locale)
}
/// Abstract operation [`FilterLocales ( availableLocales, requestedLocales, options )`][spec]
@ -493,7 +498,7 @@ mod tests {
#[test]
fn best_fit() {
let icu = &IntlProvider::try_new_with_buffer_provider(boa_icu_provider::buffer()).unwrap();
let icu = &IntlProvider::try_new_with_buffer_provider(boa_icu_provider::buffer());
assert_eq!(
lookup_matching_locale_by_best_fit::<CardinalV1Marker>([locale!("en")], icu),
@ -513,7 +518,7 @@ mod tests {
#[test]
fn lookup_match() {
let icu = &IntlProvider::try_new_with_buffer_provider(boa_icu_provider::buffer()).unwrap();
let icu = &IntlProvider::try_new_with_buffer_provider(boa_icu_provider::buffer());
// requested: [fr-FR-u-hc-h12]
let requested: Locale = "fr-FR-u-hc-h12".parse().unwrap();

30
core/engine/src/builtins/intl/number_format/mod.rs

@ -15,6 +15,7 @@ use icu_locid::{
extensions::unicode::{key, Value},
Locale,
};
use icu_provider::DataLocale;
use num_bigint::BigInt;
use num_traits::Num;
pub(crate) use options::*;
@ -24,7 +25,10 @@ use crate::{
builder::BuiltInBuilder, options::get_option, string::is_trimmable_whitespace,
BuiltInConstructor, BuiltInObject, IntrinsicObject,
},
context::intrinsics::{Intrinsics, StandardConstructor, StandardConstructors},
context::{
icu::ErasedProvider,
intrinsics::{Intrinsics, StandardConstructor, StandardConstructors},
},
js_string,
object::{
internal_methods::get_prototype_from_constructor, FunctionObjectBuilder, JsFunction,
@ -240,7 +244,7 @@ impl BuiltInConstructor for NumberFormat {
requested_locales,
&mut intl_options,
context.intl_provider(),
);
)?;
// 11. Set numberFormat.[[Locale]] to r.[[locale]].
// 12. Set numberFormat.[[DataLocale]] to r.[[dataLocale]].
@ -365,15 +369,19 @@ impl BuiltInConstructor for NumberFormat {
let sign_display =
get_option(&options, js_str!("signDisplay"), context)?.unwrap_or(SignDisplay::Auto);
let formatter = FixedDecimalFormatter::try_new_unstable(
context.intl_provider(),
&locale.clone().into(),
{
let mut options = FixedDecimalFormatterOptions::default();
options.grouping_strategy = use_grouping;
options
},
)
let mut options = FixedDecimalFormatterOptions::default();
options.grouping_strategy = use_grouping;
let data_locale = &DataLocale::from(&locale);
let formatter = match context.intl_provider().erased_provider() {
ErasedProvider::Any(a) => {
FixedDecimalFormatter::try_new_with_any_provider(a, data_locale, options)
}
ErasedProvider::Buffer(b) => {
FixedDecimalFormatter::try_new_with_buffer_provider(b, data_locale, options)
}
}
.map_err(|err| JsNativeError::typ().with_message(err.to_string()))?;
let number_format = JsObject::from_proto_and_data_with_shared_shape(

28
core/engine/src/builtins/intl/plural_rules/mod.rs

@ -16,7 +16,10 @@ use crate::{
options::get_option, Array, BuiltInBuilder, BuiltInConstructor, BuiltInObject,
IntrinsicObject,
},
context::intrinsics::{Intrinsics, StandardConstructor, StandardConstructors},
context::{
icu::ErasedProvider,
intrinsics::{Intrinsics, StandardConstructor, StandardConstructors},
},
js_string,
object::{internal_methods::get_prototype_from_constructor, ObjectInitializer},
property::Attribute,
@ -142,21 +145,16 @@ impl BuiltInConstructor for PluralRules {
..Default::default()
},
context.intl_provider(),
);
)?;
let native = match rule_type {
PluralRuleType::Cardinal => PluralRulesWithRanges::try_new_cardinal_unstable(
context.intl_provider(),
&DataLocale::from(&locale),
),
PluralRuleType::Ordinal => PluralRulesWithRanges::try_new_ordinal_unstable(
context.intl_provider(),
&DataLocale::from(&locale),
),
_ => {
return Err(JsNativeError::typ()
.with_message("unimplemented plural rule type")
.into())
let data_locale = &DataLocale::from(&locale);
let native = match context.intl_provider().erased_provider() {
ErasedProvider::Any(a) => {
PluralRulesWithRanges::try_new_with_any_provider(a, data_locale, rule_type)
}
ErasedProvider::Buffer(b) => {
PluralRulesWithRanges::try_new_with_buffer_provider(b, data_locale, rule_type)
}
}
.map_err(|e| JsNativeError::typ().with_message(e.to_string()))?;

41
core/engine/src/builtins/intl/segmenter/mod.rs

@ -12,7 +12,10 @@ use crate::{
options::{get_option, get_options_object},
BuiltInBuilder, BuiltInConstructor, BuiltInObject, IntrinsicObject,
},
context::intrinsics::{Intrinsics, StandardConstructor, StandardConstructors},
context::{
icu::ErasedProvider,
intrinsics::{Intrinsics, StandardConstructor, StandardConstructors},
},
js_string,
object::{internal_methods::get_prototype_from_constructor, JsObject, ObjectInitializer},
property::Attribute,
@ -155,24 +158,38 @@ impl BuiltInConstructor for Segmenter {
..Default::default()
},
context.intl_provider(),
);
)?;
// 12. Let granularity be ? GetOption(options, "granularity", string, « "grapheme", "word", "sentence" », "grapheme").
let granularity =
get_option(&options, js_str!("granularity"), context)?.unwrap_or_default();
// 13. Set segmenter.[[SegmenterGranularity]] to granularity.
let native = match granularity {
Granularity::Grapheme => {
GraphemeClusterSegmenter::try_new_unstable(context.intl_provider())
// 13. Set segmenter.[[SegmenterGranularity]] to granularity.
let native = match (granularity, context.intl_provider().erased_provider()) {
(Granularity::Grapheme, ErasedProvider::Any(a)) => {
GraphemeClusterSegmenter::try_new_with_any_provider(a)
.map(|s| NativeSegmenter::Grapheme(Box::new(s)))
}
Granularity::Word => WordSegmenter::try_new_auto_unstable(context.intl_provider())
.map(|s| NativeSegmenter::Word(Box::new(s))),
Granularity::Sentence => SentenceSegmenter::try_new_unstable(context.intl_provider())
.map(|s| NativeSegmenter::Sentence(Box::new(s))),
(Granularity::Word, ErasedProvider::Any(a)) => {
WordSegmenter::try_new_auto_with_any_provider(a)
.map(|s| NativeSegmenter::Word(Box::new(s)))
}
(Granularity::Sentence, ErasedProvider::Any(a)) => {
SentenceSegmenter::try_new_with_any_provider(a)
.map(|s| NativeSegmenter::Sentence(Box::new(s)))
}
(Granularity::Grapheme, ErasedProvider::Buffer(b)) => {
GraphemeClusterSegmenter::try_new_with_buffer_provider(b)
.map(|s| NativeSegmenter::Grapheme(Box::new(s)))
}
(Granularity::Word, ErasedProvider::Buffer(b)) => {
WordSegmenter::try_new_auto_with_buffer_provider(b)
.map(|s| NativeSegmenter::Word(Box::new(s)))
}
(Granularity::Sentence, ErasedProvider::Buffer(b)) => {
SentenceSegmenter::try_new_with_buffer_provider(b)
.map(|s| NativeSegmenter::Sentence(Box::new(s)))
}
}
.map_err(|err| JsNativeError::typ().with_message(err.to_string()))?;

19
core/engine/src/builtins/string/mod.rs

@ -1763,13 +1763,18 @@ impl String {
// 1. Let requestedLocales be ? CanonicalizeLocaleList(locales).
// 2. If requestedLocales is not an empty List, then
// a. Let requestedLocale be requestedLocales[0].
let mut requested_locale = canonicalize_locale_list(args.get_or_undefined(0), context)?
.into_iter()
.next()
let mut requested_locale = if let Some(locale) =
canonicalize_locale_list(args.get_or_undefined(0), context)?
.into_iter()
.next()
{
// a. Let requestedLocale be requestedLocales[0].
locale
} else {
// 3. Else,
// a. Let requestedLocale be ! DefaultLocale().
.unwrap_or_else(|| default_locale(context.intl_provider().locale_canonicalizer()));
default_locale(context.intl_provider().locale_canonicalizer()?)
};
// 4. Let noExtensionsLocale be the String value that is requestedLocale with any Unicode locale extension sequences (6.2.1) removed.
requested_locale.extensions.unicode.clear();
@ -1784,7 +1789,7 @@ impl String {
)
.unwrap_or(Locale::UND);
let casemapper = context.intl_provider().case_mapper();
let casemapper = context.intl_provider().case_mapper()?;
// 8. Let codePoints be StringToCodePoints(S).
let result = string.map_valid_segments(|segment| {
@ -2165,7 +2170,7 @@ impl String {
}
#[cfg(feature = "intl")]
{
context.intl_provider().string_normalizers()
context.intl_provider().string_normalizers()?
}
};

123
core/engine/src/context/icu.rs

@ -1,4 +1,4 @@
use std::fmt::Debug;
use std::{cell::OnceCell, fmt::Debug};
use icu_casemap::CaseMapper;
use icu_locid_transform::{LocaleCanonicalizer, LocaleExpander, LocaleTransformError};
@ -12,10 +12,10 @@ use thiserror::Error;
use yoke::{trait_hack::YokeTraitHack, Yokeable};
use zerofrom::ZeroFrom;
use crate::builtins::string::StringNormalizers;
use crate::{builtins::string::StringNormalizers, JsError, JsNativeError};
/// A [`DataProvider`] that can be either a [`BufferProvider`] or an [`AnyProvider`].
enum ErasedProvider {
pub(crate) enum ErasedProvider {
Any(Box<dyn AnyProvider>),
Buffer(Box<dyn BufferProvider>),
}
@ -34,13 +34,25 @@ pub enum IcuError {
CaseMap(#[from] DataError),
}
impl From<IcuError> for JsNativeError {
fn from(value: IcuError) -> Self {
JsNativeError::typ().with_message(value.to_string())
}
}
impl From<IcuError> for JsError {
fn from(value: IcuError) -> Self {
JsNativeError::from(value).into()
}
}
/// Custom [`DataProvider`] for `Intl` that caches some utilities.
pub(crate) struct IntlProvider {
inner_provider: ErasedProvider,
locale_canonicalizer: LocaleCanonicalizer,
locale_expander: LocaleExpander,
string_normalizers: StringNormalizers,
case_mapper: CaseMapper,
locale_canonicalizer: OnceCell<LocaleCanonicalizer>,
locale_expander: OnceCell<LocaleExpander>,
string_normalizers: OnceCell<StringNormalizers>,
case_mapper: OnceCell<CaseMapper>,
}
impl<M> DataProvider<M> for IntlProvider
@ -76,19 +88,14 @@ impl IntlProvider {
/// Returns an error if any of the tools required cannot be constructed.
pub(crate) fn try_new_with_buffer_provider(
provider: (impl BufferProvider + 'static),
) -> Result<IntlProvider, IcuError> {
Ok(Self {
locale_canonicalizer: LocaleCanonicalizer::try_new_with_buffer_provider(&provider)?,
locale_expander: LocaleExpander::try_new_with_buffer_provider(&provider)?,
string_normalizers: StringNormalizers {
nfc: ComposingNormalizer::try_new_nfc_with_buffer_provider(&provider)?,
nfkc: ComposingNormalizer::try_new_nfkc_with_buffer_provider(&provider)?,
nfd: DecomposingNormalizer::try_new_nfd_with_buffer_provider(&provider)?,
nfkd: DecomposingNormalizer::try_new_nfkd_with_buffer_provider(&provider)?,
},
case_mapper: CaseMapper::try_new_with_buffer_provider(&provider)?,
) -> IntlProvider {
Self {
locale_canonicalizer: OnceCell::new(),
locale_expander: OnceCell::new(),
string_normalizers: OnceCell::new(),
case_mapper: OnceCell::new(),
inner_provider: ErasedProvider::Buffer(Box::new(provider)),
})
}
}
/// Creates a new [`IntlProvider`] from an [`AnyProvider`].
@ -98,38 +105,76 @@ impl IntlProvider {
/// Returns an error if any of the tools required cannot be constructed.
pub(crate) fn try_new_with_any_provider(
provider: (impl AnyProvider + 'static),
) -> Result<IntlProvider, IcuError> {
Ok(Self {
locale_canonicalizer: LocaleCanonicalizer::try_new_with_any_provider(&provider)?,
locale_expander: LocaleExpander::try_new_extended_with_any_provider(&provider)?,
string_normalizers: StringNormalizers {
nfc: ComposingNormalizer::try_new_nfc_with_any_provider(&provider)?,
nfkc: ComposingNormalizer::try_new_nfkc_with_any_provider(&provider)?,
nfd: DecomposingNormalizer::try_new_nfd_with_any_provider(&provider)?,
nfkd: DecomposingNormalizer::try_new_nfkd_with_any_provider(&provider)?,
},
case_mapper: CaseMapper::try_new_with_any_provider(&provider)?,
) -> IntlProvider {
Self {
locale_canonicalizer: OnceCell::new(),
locale_expander: OnceCell::new(),
string_normalizers: OnceCell::new(),
case_mapper: OnceCell::new(),
inner_provider: ErasedProvider::Any(Box::new(provider)),
})
}
}
/// Gets the [`LocaleCanonicalizer`] tool.
pub(crate) const fn locale_canonicalizer(&self) -> &LocaleCanonicalizer {
&self.locale_canonicalizer
/// Gets the [`LocaleCanonicalizer`] tool, constructing and caching it on first use.
///
/// # Errors
///
/// Returns an error if the canonicalizer cannot be constructed from the inner
/// provider. Nothing is cached in that case, so a later call tries again.
pub(crate) fn locale_canonicalizer(&self) -> Result<&LocaleCanonicalizer, IcuError> {
    match self.locale_canonicalizer.get() {
        // Already built by an earlier call.
        Some(cached) => Ok(cached),
        None => {
            // Construct outside of `get_or_init` so failures can be propagated with `?`.
            let built = match &self.inner_provider {
                ErasedProvider::Any(any) => LocaleCanonicalizer::try_new_with_any_provider(any)?,
                ErasedProvider::Buffer(buffer) => {
                    LocaleCanonicalizer::try_new_with_buffer_provider(buffer)?
                }
            };
            Ok(self.locale_canonicalizer.get_or_init(|| built))
        }
    }
}
/// Gets the [`LocaleExpander`] tool.
pub(crate) const fn locale_expander(&self) -> &LocaleExpander {
&self.locale_expander
/// Gets the [`LocaleExpander`] tool, constructing and caching it on first use.
///
/// # Errors
///
/// Returns an error if the expander cannot be constructed from the inner
/// provider. Nothing is cached in that case, so a later call tries again.
pub(crate) fn locale_expander(&self) -> Result<&LocaleExpander, IcuError> {
    match self.locale_expander.get() {
        // Already built by an earlier call.
        Some(cached) => Ok(cached),
        None => {
            // Construct outside of `get_or_init` so failures can be propagated with `?`.
            let built = match &self.inner_provider {
                ErasedProvider::Any(any) => LocaleExpander::try_new_with_any_provider(any)?,
                ErasedProvider::Buffer(buffer) => {
                    LocaleExpander::try_new_with_buffer_provider(buffer)?
                }
            };
            Ok(self.locale_expander.get_or_init(|| built))
        }
    }
}
/// Gets the [`StringNormalizers`] tools.
pub(crate) const fn string_normalizers(&self) -> &StringNormalizers {
&self.string_normalizers
/// Gets the [`StringNormalizers`] tools, constructing and caching all four
/// normalizers (NFC, NFKC, NFD, NFKD) on first use.
///
/// # Errors
///
/// Returns an error if any of the normalizers cannot be constructed from the
/// inner provider. Nothing is cached in that case, so a later call tries again.
pub(crate) fn string_normalizers(&self) -> Result<&StringNormalizers, IcuError> {
    match self.string_normalizers.get() {
        // Already built by an earlier call.
        Some(cached) => Ok(cached),
        None => {
            // Construct outside of `get_or_init` so failures can be propagated with `?`.
            let built = match &self.inner_provider {
                ErasedProvider::Any(any) => StringNormalizers {
                    nfc: ComposingNormalizer::try_new_nfc_with_any_provider(any)?,
                    nfkc: ComposingNormalizer::try_new_nfkc_with_any_provider(any)?,
                    nfd: DecomposingNormalizer::try_new_nfd_with_any_provider(any)?,
                    nfkd: DecomposingNormalizer::try_new_nfkd_with_any_provider(any)?,
                },
                ErasedProvider::Buffer(buffer) => StringNormalizers {
                    nfc: ComposingNormalizer::try_new_nfc_with_buffer_provider(buffer)?,
                    nfkc: ComposingNormalizer::try_new_nfkc_with_buffer_provider(buffer)?,
                    nfd: DecomposingNormalizer::try_new_nfd_with_buffer_provider(buffer)?,
                    nfkd: DecomposingNormalizer::try_new_nfkd_with_buffer_provider(buffer)?,
                },
            };
            Ok(self.string_normalizers.get_or_init(|| built))
        }
    }
}
/// Gets the [`CaseMapper`] tool.
pub(crate) const fn case_mapper(&self) -> &CaseMapper {
&self.case_mapper
/// Gets the [`CaseMapper`] tool, constructing and caching it on first use.
///
/// # Errors
///
/// Returns an error if the case mapper cannot be constructed from the inner
/// provider. Nothing is cached in that case, so a later call tries again.
pub(crate) fn case_mapper(&self) -> Result<&CaseMapper, IcuError> {
    match self.case_mapper.get() {
        // Already built by an earlier call.
        Some(cached) => Ok(cached),
        None => {
            // Construct outside of `get_or_init` so failures can be propagated with `?`.
            let built = match &self.inner_provider {
                ErasedProvider::Any(any) => CaseMapper::try_new_with_any_provider(any)?,
                ErasedProvider::Buffer(buffer) => CaseMapper::try_new_with_buffer_provider(buffer)?,
            };
            Ok(self.case_mapper.get_or_init(|| built))
        }
    }
}
/// Gets the inner erased provider.
pub(crate) fn erased_provider(&self) -> &ErasedProvider {
&self.inner_provider
}
}

5
core/engine/src/context/mod.rs

@ -958,7 +958,7 @@ impl ContextBuilder {
mut self,
provider: T,
) -> Result<Self, IcuError> {
self.icu = Some(icu::IntlProvider::try_new_with_buffer_provider(provider)?);
self.icu = Some(icu::IntlProvider::try_new_with_buffer_provider(provider));
Ok(self)
}
@ -992,7 +992,7 @@ impl ContextBuilder {
mut self,
provider: T,
) -> Result<Self, IcuError> {
self.icu = Some(icu::IntlProvider::try_new_with_any_provider(provider)?);
self.icu = Some(icu::IntlProvider::try_new_with_any_provider(provider));
Ok(self)
}
@ -1093,7 +1093,6 @@ impl ContextBuilder {
cfg_if::cfg_if! {
if #[cfg(feature = "intl_bundled")] {
icu::IntlProvider::try_new_with_buffer_provider(boa_icu_provider::buffer())
.expect("Failed to initialize default icu data.")
} else {
return Err(JsNativeError::typ()
.with_message("missing Intl provider for context")

10
core/icu_provider/Cargo.toml

@ -14,7 +14,17 @@ rust-version.workspace = true
icu_provider = { workspace = true, features = ["sync"] }
icu_provider_blob.workspace = true
icu_provider_adapters = { workspace = true, features = ["serde"] }
icu_casemap = { workspace = true, features = ["serde", "datagen"] }
icu_collator = { workspace = true, features = ["serde", "datagen"] }
icu_datetime = { workspace = true, features = ["serde", "datagen"] }
icu_decimal = { workspace = true, features = ["serde", "datagen"] }
icu_list = { workspace = true, features = ["serde", "datagen"] }
icu_locid_transform = { workspace = true, features = ["serde", "datagen"] }
icu_normalizer = { workspace = true, features = ["serde", "datagen"] }
icu_plurals = { workspace = true, features = ["serde", "datagen", "experimental"] }
icu_segmenter = { workspace = true, features = ["serde", "datagen"] }
once_cell = { workspace = true, default-features = false, features = ["critical-section"] }
paste.workspace = true
[features]
default = ["std"]

BIN
core/icu_provider/data/icu_casemap.postcard

Binary file not shown.

BIN
core/icu_provider/data/icu_collator.postcard

Binary file not shown.

BIN
core/icu_provider/data/icudata.postcard → core/icu_provider/data/icu_datetime.postcard

Binary file not shown.

BIN
core/icu_provider/data/icu_decimal.postcard

Binary file not shown.

BIN
core/icu_provider/data/icu_list.postcard

Binary file not shown.

BIN
core/icu_provider/data/icu_locid_transform.postcard

Binary file not shown.

BIN
core/icu_provider/data/icu_normalizer.postcard

Binary file not shown.

BIN
core/icu_provider/data/icu_plurals.postcard

Binary file not shown.

BIN
core/icu_provider/data/icu_segmenter.postcard

Binary file not shown.

97
core/icu_provider/src/lib.rs

@ -21,24 +21,95 @@
)]
#![cfg_attr(not(feature = "std"), no_std)]
use icu_provider_adapters::fallback::LocaleFallbackProvider;
extern crate alloc;
use core::fmt::Debug;
use icu_provider::{BufferMarker, BufferProvider, DataError, DataErrorKind, DataKey, DataResponse};
use icu_provider_adapters::{fallback::LocaleFallbackProvider, fork::MultiForkByKeyProvider};
use icu_provider_blob::BlobDataProvider;
use once_cell::sync::Lazy;
use once_cell::sync::{Lazy, OnceCell};
/// Gets the default data provider stored as a [`BufferProvider`].
/// A buffer provider that is lazily deserialized at the first data request.
///
/// [`BufferProvider`]: icu_provider::BufferProvider
#[must_use]
pub fn buffer() -> &'static impl icu_provider::BufferProvider {
static PROVIDER: Lazy<LocaleFallbackProvider<BlobDataProvider>> = Lazy::new(|| {
let blob = BlobDataProvider::try_new_from_static_blob(include_bytes!(concat!(
env!("CARGO_MANIFEST_DIR"),
"/data/icudata.postcard"
)))
.expect("The statically compiled data file should be valid.");
LocaleFallbackProvider::try_new_with_buffer_provider(blob)
/// The provider must specify the list of keys it supports, to avoid deserializing the
/// buffer for unknown keys.
struct LazyBufferProvider {
/// Blob provider built from `bytes`; stays empty until the first request for a
/// key listed in `valid_keys`.
provider: OnceCell<BlobDataProvider>,
/// Serialized blob that `provider` is constructed from.
bytes: &'static [u8],
/// Keys this provider answers for; requests for any other key are rejected
/// with `MissingDataKey` without touching `bytes`.
valid_keys: &'static [DataKey],
}
/// Debug output that shows the provider state and the supported keys, but
/// prints a fixed `"[...]"` placeholder instead of the raw blob bytes.
impl Debug for LazyBufferProvider {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let Self {
            provider,
            valid_keys,
            ..
        } = self;

        let mut repr = f.debug_struct("LazyBufferProvider");
        repr.field("provider", provider);
        // The blob itself is deliberately elided from the output.
        repr.field("bytes", &"[...]");
        repr.field("valid_keys", valid_keys);
        repr.finish()
    }
}
/// Serves buffers from the wrapped blob, building the [`BlobDataProvider`] on
/// the first request for a supported key.
impl BufferProvider for LazyBufferProvider {
    fn load_buffer(
        &self,
        key: DataKey,
        req: icu_provider::DataRequest<'_>,
    ) -> Result<DataResponse<BufferMarker>, DataError> {
        // Reject unsupported keys up front so the blob is never parsed for them.
        let supported = self.valid_keys.contains(&key);
        if !supported {
            return Err(DataErrorKind::MissingDataKey.with_key(key));
        }

        // Build the blob provider on first use. A construction failure is
        // reported as a custom error and leaves the cell empty.
        self.provider
            .get_or_try_init(|| BlobDataProvider::try_new_from_static_blob(self.bytes))
            .map_err(|_| DataErrorKind::Custom.with_str_context("invalid blob data provider"))?
            .load_buffer(key, req)
    }
}
/// A macro that creates a [`LazyBufferProvider`] from an icu4x crate.
///
/// Given a crate path such as `icu_list`, the expansion:
/// - embeds `data/<crate>.postcard` (relative to `CARGO_MANIFEST_DIR`) as the blob bytes,
/// - uses `<crate>::provider::KEYS` as the list of supported keys,
/// - leaves the inner provider cell empty so the blob is only parsed on demand.
macro_rules! provider_from_icu_crate {
($service:path) => {
// NOTE(review): presumably wrapped in `paste!` so that `$service::provider::KEYS`
// parses after a `path` fragment — confirm before removing the wrapper.
paste::paste! {
LazyBufferProvider {
provider: OnceCell::new(),
// The file name is derived from the crate path itself, so the data file
// must be named exactly like the crate.
bytes: include_bytes!(concat!(
env!("CARGO_MANIFEST_DIR"),
"/data/",
stringify!($service),
".postcard",
)),
valid_keys: $service::provider::KEYS,
}
}
};
}
/// Boa's default buffer provider.
///
/// Built on first access: one [`LazyBufferProvider`] per ICU4X component crate,
/// combined by key through a [`MultiForkByKeyProvider`] and wrapped in a
/// [`LocaleFallbackProvider`].
static PROVIDER: Lazy<LocaleFallbackProvider<MultiForkByKeyProvider<LazyBufferProvider>>> =
    Lazy::new(|| {
        // Each entry only embeds its blob; nothing is deserialized here.
        let per_service = alloc::vec![
            provider_from_icu_crate!(icu_casemap),
            provider_from_icu_crate!(icu_collator),
            provider_from_icu_crate!(icu_datetime),
            provider_from_icu_crate!(icu_decimal),
            provider_from_icu_crate!(icu_list),
            provider_from_icu_crate!(icu_locid_transform),
            provider_from_icu_crate!(icu_normalizer),
            provider_from_icu_crate!(icu_plurals),
            provider_from_icu_crate!(icu_segmenter),
        ];
        let forked = MultiForkByKeyProvider::new(per_service);

        LocaleFallbackProvider::try_new_with_buffer_provider(forked)
            .expect("The statically compiled data file should be valid.")
    });
/// Gets the default data provider stored as a [`BufferProvider`].
///
/// [`BufferProvider`]: icu_provider::BufferProvider
#[must_use]
pub fn buffer() -> &'static impl BufferProvider {
&*PROVIDER
}

1
tools/gen-icu4x-data/Cargo.toml

@ -10,7 +10,6 @@ license.workspace = true
description.workspace = true
[dependencies]
icu_provider.workspace = true
icu_datagen = { workspace = true, features = [
"networking",
"use_wasm",

100
tools/gen-icu4x-data/src/main.rs

@ -1,51 +1,50 @@
#![allow(missing_docs, rustdoc::missing_crate_level_docs)]
use std::{error::Error, fs::File, path::Path};
use std::path::Path;
use std::{error::Error, fs::File};
use icu_datagen::blob_exporter::BlobExporter;
use icu_datagen::prelude::*;
use icu_provider::data_key;
const KEYS_LEN: usize = 129;
/// Path to the directory where the exported data lives.
const EXPORT_PATH: &str = "core/icu_provider/data";
/// List of keys used by `Intl` components.
/// List of services used by `Intl` components.
///
/// This must be kept in sync with the list of implemented components of `Intl`.
const KEYS: [DataKey; KEYS_LEN] = {
const CENTINEL_KEY: DataKey = data_key!("centinel@1");
const SERVICES: [&[DataKey]; 9] = [
icu_casemap::provider::KEYS,
icu_collator::provider::KEYS,
icu_datetime::provider::KEYS,
icu_decimal::provider::KEYS,
icu_list::provider::KEYS,
icu_locid_transform::provider::KEYS,
icu_normalizer::provider::KEYS,
icu_plurals::provider::KEYS,
icu_segmenter::provider::KEYS,
];
/// This must be kept in sync with the list of implemented services of `Intl`.
// Each entry is `(service name, data keys)`. The name becomes the stem of the
// exported `<name>.postcard` file, and the keys are the only ones exported
// into that file.
const SERVICES: &[(&str, &[DataKey])] = &[
("icu_casemap", icu_casemap::provider::KEYS),
("icu_collator", icu_collator::provider::KEYS),
("icu_datetime", icu_datetime::provider::KEYS),
("icu_decimal", icu_decimal::provider::KEYS),
("icu_list", icu_list::provider::KEYS),
("icu_locid_transform", icu_locid_transform::provider::KEYS),
("icu_normalizer", icu_normalizer::provider::KEYS),
("icu_plurals", icu_plurals::provider::KEYS),
("icu_segmenter", icu_segmenter::provider::KEYS),
];
fn export_for_service(
service: &str,
keys: &[DataKey],
provider: &DatagenProvider,
driver: DatagenDriver,
) -> Result<(), Box<dyn Error>> {
log::info!(
"Generating ICU4X data for service `{service}` with keys: {:#?}",
keys
);
let export_path = Path::new(EXPORT_PATH);
let export_file = export_path.join(format!("{service}.postcard"));
driver.with_keys(keys.iter().copied()).export(
provider,
BlobExporter::new_v2_with_sink(Box::new(File::create(export_file)?)),
)?;
let mut array = [CENTINEL_KEY; KEYS_LEN];
let mut offset = 0;
let mut service_idx = 0;
while service_idx < SERVICES.len() {
let service = SERVICES[service_idx];
let mut idx = 0;
while idx < service.len() {
array[offset + idx] = service[idx];
idx += 1;
}
offset += service.len();
service_idx += 1;
}
assert!(offset == array.len());
array
};
Ok(())
}
fn main() -> Result<(), Box<dyn Error>> {
simple_logger::SimpleLogger::new()
@ -53,34 +52,29 @@ fn main() -> Result<(), Box<dyn Error>> {
.with_level(log::LevelFilter::Info)
.init()?;
let path = Path::new("core/icu_provider/data");
// Removal will throw an error if the directory doesn't exist, hence
// why we can ignore the error.
let _unused = std::fs::remove_dir_all(path);
std::fs::create_dir_all(path)?;
let _unused = std::fs::remove_dir_all(EXPORT_PATH);
std::fs::create_dir_all(EXPORT_PATH)?;
log::info!("Generating ICU4X data for keys: {:#?}", KEYS);
let provider = DatagenProvider::new_latest_tested();
let provider = &DatagenProvider::new_latest_tested();
let locales = provider
.locales_for_coverage_levels([CoverageLevel::Modern])?
.into_iter()
.chain([langid!("en-US")]);
DatagenDriver::new()
.with_keys(KEYS)
let driver = DatagenDriver::new()
.with_locales_and_fallback(locales.map(LocaleFamily::with_descendants), {
let mut options = FallbackOptions::default();
options.deduplication_strategy = Some(DeduplicationStrategy::None);
options
})
.with_additional_collations([String::from("search*")])
.with_recommended_segmenter_models()
.export(
&provider,
BlobExporter::new_v2_with_sink(Box::new(File::create(path.join("icudata.postcard"))?)),
)?;
.with_recommended_segmenter_models();
for (service, keys) in SERVICES {
export_for_service(service, keys, provider, driver.clone())?;
}
Ok(())
}

Loading…
Cancel
Save