mirror of https://github.com/boa-dev/boa.git
Browse Source
The new ICU4X release stabilized the `icu_segmenter` component, so this PR implements `Intl.Segmenter` using that as a base. Also, I opted for importing `itertools` instead of copy-pasting the implementation of `TupleWindows` because its design is a lot more complex than `Intersperse`, which we copy-pasted previously. Though, I disabled all `std` features of `itertools` to make it a lot more lightweight, so it shouldn't make much difference in compilation times.pull/2852/head
José Julián Espina
2 years ago
14 changed files with 828 additions and 157 deletions
@ -0,0 +1,152 @@ |
|||||||
|
use boa_gc::{Finalize, Trace}; |
||||||
|
use boa_profiler::Profiler; |
||||||
|
use icu_segmenter::{ |
||||||
|
GraphemeClusterBreakIteratorUtf16, SentenceBreakIteratorUtf16, WordBreakIteratorUtf16, |
||||||
|
}; |
||||||
|
|
||||||
|
use crate::{ |
||||||
|
builtins::{iterable::create_iter_result_object, BuiltInBuilder, IntrinsicObject}, |
||||||
|
context::intrinsics::Intrinsics, |
||||||
|
js_string, |
||||||
|
object::ObjectData, |
||||||
|
property::Attribute, |
||||||
|
realm::Realm, |
||||||
|
Context, JsNativeError, JsObject, JsResult, JsString, JsSymbol, JsValue, |
||||||
|
}; |
||||||
|
|
||||||
|
use super::create_segment_data_object; |
||||||
|
|
||||||
|
pub(crate) enum NativeSegmentIterator<'l, 's> { |
||||||
|
Grapheme(GraphemeClusterBreakIteratorUtf16<'l, 's>), |
||||||
|
Word(WordBreakIteratorUtf16<'l, 's>), |
||||||
|
Sentence(SentenceBreakIteratorUtf16<'l, 's>), |
||||||
|
} |
||||||
|
|
||||||
|
impl Iterator for NativeSegmentIterator<'_, '_> { |
||||||
|
type Item = usize; |
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> { |
||||||
|
match self { |
||||||
|
NativeSegmentIterator::Grapheme(g) => g.next(), |
||||||
|
NativeSegmentIterator::Word(w) => w.next(), |
||||||
|
NativeSegmentIterator::Sentence(s) => s.next(), |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
impl NativeSegmentIterator<'_, '_> { |
||||||
|
/// If the iterator is a word break iterator, returns `Some(true)` when the segment preceding
|
||||||
|
/// the current boundary is word-like.
|
||||||
|
pub(crate) fn is_word_like(&self) -> Option<bool> { |
||||||
|
if let Self::Word(w) = self { |
||||||
|
Some(w.is_word_like()) |
||||||
|
} else { |
||||||
|
None |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
#[derive(Debug, Trace, Finalize)] |
||||||
|
pub struct SegmentIterator { |
||||||
|
segmenter: JsObject, |
||||||
|
string: JsString, |
||||||
|
next_segment_index: usize, |
||||||
|
} |
||||||
|
|
||||||
|
impl IntrinsicObject for SegmentIterator { |
||||||
|
fn init(realm: &Realm) { |
||||||
|
let _timer = Profiler::global().start_event("%SegmentIteratorPrototype%", "init"); |
||||||
|
|
||||||
|
BuiltInBuilder::with_intrinsic::<Self>(realm) |
||||||
|
.static_property( |
||||||
|
JsSymbol::to_string_tag(), |
||||||
|
js_string!("Segmenter String Iterator"), |
||||||
|
Attribute::CONFIGURABLE, |
||||||
|
) |
||||||
|
.static_method(Self::next, js_string!("next"), 0) |
||||||
|
.build(); |
||||||
|
} |
||||||
|
|
||||||
|
fn get(intrinsics: &Intrinsics) -> JsObject { |
||||||
|
intrinsics.objects().iterator_prototypes().segment() |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
impl SegmentIterator { |
||||||
|
/// [`CreateSegmentIterator ( segmenter, string )`][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma402/#sec-createsegmentiterator
|
||||||
|
pub(crate) fn create( |
||||||
|
segmenter: JsObject, |
||||||
|
string: JsString, |
||||||
|
context: &mut Context<'_>, |
||||||
|
) -> JsObject { |
||||||
|
// 1. Let internalSlotsList be « [[IteratingSegmenter]], [[IteratedString]], [[IteratedStringNextSegmentCodeUnitIndex]] ».
|
||||||
|
// 2. Let iterator be OrdinaryObjectCreate(%SegmentIteratorPrototype%, internalSlotsList).
|
||||||
|
// 3. Set iterator.[[IteratingSegmenter]] to segmenter.
|
||||||
|
// 4. Set iterator.[[IteratedString]] to string.
|
||||||
|
// 5. Set iterator.[[IteratedStringNextSegmentCodeUnitIndex]] to 0.
|
||||||
|
// 6. Return iterator.
|
||||||
|
JsObject::from_proto_and_data( |
||||||
|
context |
||||||
|
.intrinsics() |
||||||
|
.objects() |
||||||
|
.iterator_prototypes() |
||||||
|
.segment(), |
||||||
|
ObjectData::segment_iterator(Self { |
||||||
|
segmenter, |
||||||
|
string, |
||||||
|
next_segment_index: 0, |
||||||
|
}), |
||||||
|
) |
||||||
|
} |
||||||
|
/// [`%SegmentIteratorPrototype%.next ( )`][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma402/#sec-%segmentiteratorprototype%.next
|
||||||
|
fn next(this: &JsValue, _: &[JsValue], context: &mut Context<'_>) -> JsResult<JsValue> { |
||||||
|
// 1. Let iterator be the this value.
|
||||||
|
// 2. Perform ? RequireInternalSlot(iterator, [[IteratingSegmenter]]).
|
||||||
|
let mut iter = this.as_object().map(JsObject::borrow_mut).ok_or_else(|| { |
||||||
|
JsNativeError::typ() |
||||||
|
.with_message("`next` can only be called on a `Segment Iterator` object") |
||||||
|
})?; |
||||||
|
let iter = iter.as_segment_iterator_mut().ok_or_else(|| { |
||||||
|
JsNativeError::typ() |
||||||
|
.with_message("`next` can only be called on a `Segment Iterator` object") |
||||||
|
})?; |
||||||
|
|
||||||
|
// 5. Let startIndex be iterator.[[IteratedStringNextSegmentCodeUnitIndex]].
|
||||||
|
let start = iter.next_segment_index; |
||||||
|
|
||||||
|
// 4. Let string be iterator.[[IteratedString]].
|
||||||
|
// 6. Let endIndex be ! FindBoundary(segmenter, string, startIndex, after).
|
||||||
|
let Some((end, is_word_like)) = iter.string.get(start..).and_then(|string| { |
||||||
|
// 3. Let segmenter be iterator.[[IteratingSegmenter]].
|
||||||
|
let segmenter = iter.segmenter.borrow(); |
||||||
|
let segmenter = segmenter |
||||||
|
.as_segmenter() |
||||||
|
.expect("segment iterator object should contain a segmenter"); |
||||||
|
let mut segments = segmenter.native.segment(string); |
||||||
|
// the first elem is always 0.
|
||||||
|
segments.next(); |
||||||
|
segments.next().map(|end| (start + end, segments.is_word_like())) |
||||||
|
}) else { |
||||||
|
// 7. If endIndex is not finite, then
|
||||||
|
// a. Return CreateIterResultObject(undefined, true).
|
||||||
|
return Ok(create_iter_result_object(JsValue::undefined(), true, context)); |
||||||
|
}; |
||||||
|
// 8. Set iterator.[[IteratedStringNextSegmentCodeUnitIndex]] to endIndex.
|
||||||
|
iter.next_segment_index = end; |
||||||
|
|
||||||
|
// 9. Let segmentData be ! CreateSegmentDataObject(segmenter, string, startIndex, endIndex).
|
||||||
|
let segment_data = |
||||||
|
create_segment_data_object(iter.string.clone(), start..end, is_word_like, context); |
||||||
|
|
||||||
|
// 10. Return CreateIterResultObject(segmentData, false).
|
||||||
|
Ok(create_iter_result_object( |
||||||
|
segment_data.into(), |
||||||
|
false, |
||||||
|
context, |
||||||
|
)) |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,142 @@ |
|||||||
|
use boa_gc::{Finalize, Trace}; |
||||||
|
use boa_profiler::Profiler; |
||||||
|
use itertools::Itertools; |
||||||
|
|
||||||
|
use crate::{ |
||||||
|
builtins::{BuiltInBuilder, IntrinsicObject}, |
||||||
|
context::intrinsics::Intrinsics, |
||||||
|
js_string, |
||||||
|
object::ObjectData, |
||||||
|
realm::Realm, |
||||||
|
Context, JsArgs, JsNativeError, JsObject, JsResult, JsString, JsSymbol, JsValue, |
||||||
|
}; |
||||||
|
|
||||||
|
use super::{create_segment_data_object, SegmentIterator}; |
||||||
|
|
||||||
|
#[derive(Debug, Trace, Finalize)] |
||||||
|
pub struct Segments { |
||||||
|
segmenter: JsObject, |
||||||
|
string: JsString, |
||||||
|
} |
||||||
|
|
||||||
|
impl IntrinsicObject for Segments { |
||||||
|
fn init(realm: &Realm) { |
||||||
|
let _timer = Profiler::global().start_event("%SegmentsPrototype%", "init"); |
||||||
|
|
||||||
|
BuiltInBuilder::with_intrinsic::<Self>(realm) |
||||||
|
.static_method(Self::containing, "containing", 1) |
||||||
|
.static_method( |
||||||
|
Self::iterator, |
||||||
|
(JsSymbol::iterator(), js_string!("[Symbol.iterator]")), |
||||||
|
0, |
||||||
|
) |
||||||
|
.build(); |
||||||
|
} |
||||||
|
|
||||||
|
fn get(intrinsics: &Intrinsics) -> JsObject { |
||||||
|
intrinsics.objects().segments_prototype() |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
impl Segments { |
||||||
|
/// [`CreateSegmentsObject ( segmenter, string )`][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma402/#sec-createsegmentsobject
|
||||||
|
pub(crate) fn create( |
||||||
|
segmenter: JsObject, |
||||||
|
string: JsString, |
||||||
|
context: &mut Context<'_>, |
||||||
|
) -> JsObject { |
||||||
|
// 1. Let internalSlotsList be « [[SegmentsSegmenter]], [[SegmentsString]] ».
|
||||||
|
// 2. Let segments be OrdinaryObjectCreate(%SegmentsPrototype%, internalSlotsList).
|
||||||
|
// 3. Set segments.[[SegmentsSegmenter]] to segmenter.
|
||||||
|
// 4. Set segments.[[SegmentsString]] to string.
|
||||||
|
// 5. Return segments.
|
||||||
|
JsObject::from_proto_and_data( |
||||||
|
context.intrinsics().objects().segments_prototype(), |
||||||
|
ObjectData::segments(Segments { segmenter, string }), |
||||||
|
) |
||||||
|
} |
||||||
|
|
||||||
|
/// [`%SegmentsPrototype%.containing ( index )`][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma402/#sec-%segmentsprototype%.containing
|
||||||
|
fn containing( |
||||||
|
this: &JsValue, |
||||||
|
args: &[JsValue], |
||||||
|
context: &mut Context<'_>, |
||||||
|
) -> JsResult<JsValue> { |
||||||
|
// 1. Let segments be the this value.
|
||||||
|
// 2. Perform ? RequireInternalSlot(segments, [[SegmentsSegmenter]]).
|
||||||
|
let segments = this.as_object().map(JsObject::borrow).ok_or_else(|| { |
||||||
|
JsNativeError::typ() |
||||||
|
.with_message("`containing` can only be called on a `Segments` object") |
||||||
|
})?; |
||||||
|
let segments = segments.as_segments().ok_or_else(|| { |
||||||
|
JsNativeError::typ() |
||||||
|
.with_message("`containing` can only be called on a `Segments` object") |
||||||
|
})?; |
||||||
|
|
||||||
|
// 3. Let segmenter be segments.[[SegmentsSegmenter]].
|
||||||
|
let segmenter = segments.segmenter.borrow(); |
||||||
|
let segmenter = segmenter |
||||||
|
.as_segmenter() |
||||||
|
.expect("segments object should contain a segmenter"); |
||||||
|
|
||||||
|
// 4. Let string be segments.[[SegmentsString]].
|
||||||
|
// 5. Let len be the length of string.
|
||||||
|
let len = segments.string.len() as i64; |
||||||
|
|
||||||
|
// 6. Let n be ? ToIntegerOrInfinity(index).
|
||||||
|
let Some(n) = args |
||||||
|
.get_or_undefined(0) |
||||||
|
.to_integer_or_infinity(context)? |
||||||
|
.as_integer() |
||||||
|
// 7. If n < 0 or n ≥ len, return undefined.
|
||||||
|
.filter(|i| (0..len).contains(i)) |
||||||
|
.map(|n| n as usize) else { |
||||||
|
return Ok(JsValue::undefined()); |
||||||
|
}; |
||||||
|
|
||||||
|
// 8. Let startIndex be ! FindBoundary(segmenter, string, n, before).
|
||||||
|
// 9. Let endIndex be ! FindBoundary(segmenter, string, n, after).
|
||||||
|
let (range, is_word_like) = { |
||||||
|
let mut segments = segmenter.native.segment(&segments.string); |
||||||
|
std::iter::from_fn(|| segments.next().map(|i| (i, segments.is_word_like()))) |
||||||
|
.tuple_windows() |
||||||
|
.find(|((i, _), (j, _))| (*i..*j).contains(&n)) |
||||||
|
.map(|((i, _), (j, word))| ((i..j), word)) |
||||||
|
.expect("string should have at least a length of 1, and `n` must be in range") |
||||||
|
}; |
||||||
|
|
||||||
|
// 10. Return ! CreateSegmentDataObject(segmenter, string, startIndex, endIndex).
|
||||||
|
Ok( |
||||||
|
create_segment_data_object(segments.string.clone(), range, is_word_like, context) |
||||||
|
.into(), |
||||||
|
) |
||||||
|
} |
||||||
|
|
||||||
|
/// [`%SegmentsPrototype% [ @@iterator ] ( )`][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma402/#sec-%segmentsprototype%-@@iterator
|
||||||
|
fn iterator(this: &JsValue, _: &[JsValue], context: &mut Context<'_>) -> JsResult<JsValue> { |
||||||
|
// 1. Let segments be the this value.
|
||||||
|
// 2. Perform ? RequireInternalSlot(segments, [[SegmentsSegmenter]]).
|
||||||
|
let segments = this.as_object().map(JsObject::borrow).ok_or_else(|| { |
||||||
|
JsNativeError::typ() |
||||||
|
.with_message("`containing` can only be called on a `Segments` object") |
||||||
|
})?; |
||||||
|
let segments = segments.as_segments().ok_or_else(|| { |
||||||
|
JsNativeError::typ() |
||||||
|
.with_message("`containing` can only be called on a `Segments` object") |
||||||
|
})?; |
||||||
|
|
||||||
|
// 3. Let segmenter be segments.[[SegmentsSegmenter]].
|
||||||
|
// 4. Let string be segments.[[SegmentsString]].
|
||||||
|
// 5. Return ! CreateSegmentIterator(segmenter, string).
|
||||||
|
Ok( |
||||||
|
SegmentIterator::create(segments.segmenter.clone(), segments.string.clone(), context) |
||||||
|
.into(), |
||||||
|
) |
||||||
|
} |
||||||
|
} |
Loading…
Reference in new issue