Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use script fallback in collator and transliterator #5403

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions components/collator/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,16 +117,11 @@ fn data_ce_to_primary(data_ce: u64, c: char) -> u32 {
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(
marker(
CollationRootV1Marker,
"collator/root@1",
singleton,
),
marker(CollationRootV1Marker, "collator/root@1", singleton,),
marker(
CollationTailoringV1Marker,
"collator/tailoring@1",
// TODO(#3867): Use script fallback
fallback_by = "language",
fallback_by = "script",
attributes_domain = "collator",
)
)]
Expand Down
39 changes: 28 additions & 11 deletions components/experimental/src/transliterate/compile/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use alloc::format;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::cell::RefCell;
use icu_locale::provider::*;
use icu_locale_core::Locale;
use icu_normalizer::provider::*;
use icu_properties::{provider::*, sets};
Expand Down Expand Up @@ -135,24 +136,30 @@ impl RuleCollection {
#[cfg(feature = "compiled_data")]
pub fn as_provider(
&self,
) -> RuleCollectionProvider<'_, icu_properties::provider::Baked, icu_normalizer::provider::Baked>
{
) -> RuleCollectionProvider<
'_,
icu_properties::provider::Baked,
icu_normalizer::provider::Baked,
icu_locale::provider::Baked,
> {
RuleCollectionProvider {
collection: self,
properties_provider: &icu_properties::provider::Baked,
normalizer_provider: &icu_normalizer::provider::Baked,
locale_provider: &icu_locale::provider::Baked,
xid_start: sets::xid_start().static_to_owned(),
xid_continue: sets::xid_continue().static_to_owned(),
pat_ws: sets::pattern_white_space().static_to_owned(),
}
}

#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::as_provider)]
pub fn as_provider_unstable<'a, PP, NP>(
pub fn as_provider_unstable<'a, PP, NP, LP>(
&'a self,
properties_provider: &'a PP,
normalizer_provider: &'a NP,
) -> Result<RuleCollectionProvider<'a, PP, NP>, DataError>
locale_provider: &'a LP,
) -> Result<RuleCollectionProvider<'a, PP, NP, LP>, DataError>
where
PP: ?Sized
+ DataProvider<AsciiHexDigitV1Marker>
Expand Down Expand Up @@ -227,6 +234,7 @@ impl RuleCollection {
collection: self,
properties_provider,
normalizer_provider,
locale_provider,
xid_start: sets::load_xid_start(properties_provider)?,
xid_continue: sets::load_xid_continue(properties_provider)?,
pat_ws: sets::load_pattern_white_space(properties_provider)?,
Expand All @@ -236,16 +244,18 @@ impl RuleCollection {

/// A provider that is usable by [`Transliterator::try_new_unstable`](crate::Transliterator::try_new_unstable).
#[derive(Debug)]
pub struct RuleCollectionProvider<'a, PP: ?Sized, NP: ?Sized> {
pub struct RuleCollectionProvider<'a, PP: ?Sized, NP: ?Sized, LP: ?Sized> {
collection: &'a RuleCollection,
properties_provider: &'a PP,
normalizer_provider: &'a NP,
locale_provider: &'a LP,
xid_start: sets::CodePointSetData,
xid_continue: sets::CodePointSetData,
pat_ws: sets::CodePointSetData,
}

impl<PP, NP> DataProvider<TransliteratorRulesV1Marker> for RuleCollectionProvider<'_, PP, NP>
impl<PP, NP, LP> DataProvider<TransliteratorRulesV1Marker>
for RuleCollectionProvider<'_, PP, NP, LP>
where
PP: ?Sized
+ DataProvider<AsciiHexDigitV1Marker>
Expand Down Expand Up @@ -385,28 +395,35 @@ where
}

macro_rules! redirect {
($($marker:ty),*) => {
($field:ident, $($marker:ty),*) => {
$(
impl<PP: ?Sized, NP: ?Sized + DataProvider<$marker>> DataProvider<$marker> for RuleCollectionProvider<'_, PP, NP> {
impl<PP: ?Sized, NP: ?Sized + DataProvider<$marker>, LP: ?Sized + DataProvider<$marker>> DataProvider<$marker> for RuleCollectionProvider<'_, PP, NP, LP> {
fn load(&self, req: DataRequest) -> Result<DataResponse<$marker>, DataError> {
self.normalizer_provider.load(req)
self.$field.load(req)
}
}
)*
}
}

// Forward each normalizer data marker listed below to the wrapped
// `normalizer_provider` field: the `redirect!` macro generates one
// `DataProvider<Marker>` impl per marker whose `load` delegates to that field.
redirect!(
normalizer_provider,
CanonicalDecompositionDataV1Marker,
CompatibilityDecompositionSupplementV1Marker,
CanonicalDecompositionTablesV1Marker,
CompatibilityDecompositionTablesV1Marker,
CanonicalCompositionsV1Marker
);

// Forward the locale data markers listed below to the wrapped `locale_provider`
// field, so that a `RuleCollectionProvider` can also serve the data requested
// when a `LocaleFallbacker` is built from it.
//
// NOTE(review): the `redirect!` arm bounds *both* `NP` and `LP` on
// `DataProvider<$marker>` even though only `self.$field` is used; confirm the
// normalizer provider is really expected to implement these locale markers
// (and the locale provider the normalizer ones), otherwise the impl will not
// apply to the provider returned by `as_provider`.
// NOTE(review): the transliterator constructors bound their provider on
// `LikelySubtagsForLanguageV1Marker`, whereas this list redirects
// `LikelySubtagsExtendedV1Marker` — verify which marker
// `LocaleFallbacker::try_new_unstable` actually requires.
redirect!(
locale_provider,
ParentsV1Marker,
LikelySubtagsExtendedV1Marker
);

#[cfg(feature = "datagen")]
impl<PP, NP> IterableDataProvider<TransliteratorRulesV1Marker>
for RuleCollectionProvider<'_, PP, NP>
impl<PP, NP, LP> IterableDataProvider<TransliteratorRulesV1Marker>
for RuleCollectionProvider<'_, PP, NP, LP>
where
PP: ?Sized
+ DataProvider<AsciiHexDigitV1Marker>
Expand Down
101 changes: 93 additions & 8 deletions components/experimental/src/transliterate/transliterator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ use core::fmt::Debug;
use core::ops::Range;
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
use icu_locale::fallback::{LocaleFallbackConfig, LocaleFallbackPriority, LocaleFallbacker};
use icu_locale::provider::*;
use icu_locale_core::Locale;
use icu_normalizer::provider::*;
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
Expand Down Expand Up @@ -250,13 +252,16 @@ impl Transliterator {
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<CompatibilityDecompositionTablesV1Marker>
+ DataProvider<CanonicalCompositionsV1Marker>
+ DataProvider<ParentsV1Marker>
+ DataProvider<LikelySubtagsForLanguageV1Marker>
+ ?Sized,
{
Self::internal_try_new_with_override_unstable(
locale,
None::<&fn(&Locale) -> Option<Box<dyn CustomTransliterator>>>,
provider,
provider,
|| LocaleFallbacker::try_new_unstable(provider),
)
}

Expand Down Expand Up @@ -306,17 +311,26 @@ impl Transliterator {
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<CompatibilityDecompositionTablesV1Marker>
+ DataProvider<CanonicalCompositionsV1Marker>
+ DataProvider<ParentsV1Marker>
+ DataProvider<LikelySubtagsForLanguageV1Marker>
+ ?Sized,
F: Fn(&Locale) -> Option<Box<dyn CustomTransliterator>>,
{
Self::internal_try_new_with_override_unstable(locale, Some(&lookup), provider, provider)
Self::internal_try_new_with_override_unstable(
locale,
Some(&lookup),
provider,
provider,
|| LocaleFallbacker::try_new_unstable(provider),
)
}

fn internal_try_new_with_override_unstable<PN, PT, F>(
locale: Locale,
lookup: Option<&F>,
transliterator_provider: &PT,
normalizer_provider: &PN,
fallbacker: impl Fn() -> Result<LocaleFallbacker, DataError>,
) -> Result<Transliterator, DataError>
where
PT: DataProvider<TransliteratorRulesV1Marker> + ?Sized,
Expand All @@ -328,19 +342,72 @@ impl Transliterator {
+ ?Sized,
F: Fn(&Locale) -> Option<Box<dyn CustomTransliterator>>,
{
let payload = Transliterator::load_rbt(
// TODO(#3950): How is fallback handled with special parts?

// First, try to load a transliterator for the exact requested locale; only fall back if that fails.
let transliterator = if let Ok(transliterator) = Self::load_rbt(
#[allow(clippy::unwrap_used)] // infallible
DataMarkerAttributes::try_from_str(&locale.to_string().to_ascii_lowercase()).unwrap(),
transliterator_provider,
)?;
let rbt = payload.get();
) {
transliterator
} else {
let fallbacker = fallbacker()?;
let mut fallback_config = LocaleFallbackConfig::default();
fallback_config.priority = LocaleFallbackPriority::Script;
let fallbacker = fallbacker.for_config(fallback_config);

let mut transform_extensions = locale.extensions.transform;
let source_id = transform_extensions.lang.take().unwrap_or_default();
let target_id = locale.id;

let mut source_iterator = fallbacker.fallback_for(source_id.into());
let mut target_iterator = fallbacker.fallback_for(target_id.into());

'target: loop {
if target_iterator.get().is_default() {
Err(DataErrorKind::IdentifierNotFound
.with_marker(TransliteratorRulesV1Marker::INFO))?;
}
'source: loop {
if source_iterator.get().is_default() {
break 'source;
}
let mut candidate = target_iterator.get().clone().into_locale();
candidate.extensions.transform = transform_extensions.clone();
candidate.extensions.transform.lang =
Some(icu_locale_core::LanguageIdentifier {
language: source_iterator.get().language,
script: source_iterator.get().script,
region: source_iterator.get().region,
variants: source_iterator
.get()
.variant
.map(icu_locale_core::subtags::Variants::from_variant)
.unwrap_or_default(),
});
if let Ok(t) = Self::load_rbt(
#[allow(clippy::unwrap_used)] // infallible
DataMarkerAttributes::try_from_str(
&candidate.to_string().to_ascii_lowercase(),
)
.unwrap(),
transliterator_provider,
) {
break 'target t;
}
source_iterator.step();
}
target_iterator.step();
}
};
let rbt = transliterator.get();

if !rbt.visibility {
// transliterator is internal
return Err(DataError::custom("internal only transliterator"));
}
let mut env = LiteMap::new();
// Avoid recursive load
env.insert(locale.to_string(), InternalTransliterator::Null);
Transliterator::load_dependencies_recursive(
rbt,
&mut env,
Expand All @@ -349,7 +416,7 @@ impl Transliterator {
normalizer_provider,
)?;
Ok(Transliterator {
transliterator: payload,
transliterator,
env,
})
}
Expand Down Expand Up @@ -476,9 +543,11 @@ impl Transliterator {
where
P: DataProvider<TransliteratorRulesV1Marker> + ?Sized,
{
let mut metadata = DataRequestMetadata::default();
metadata.silent = true;
let req = DataRequest {
id: DataIdentifierBorrowed::for_marker_attributes(marker_attributes),
..Default::default()
metadata,
};
let payload = provider.load(req)?.payload;
let rbt = payload.get();
Expand Down Expand Up @@ -1404,6 +1473,22 @@ mod tests {
assert_eq!(t.transliterate(input.to_string()), output);
}

/// Requests `de-t-fr-ch-d0-ascii` and expects the German-to-ASCII rules to be applied.
///
/// The transliterator that actually exists has the source `und-Latn`, so construction
/// only succeeds if the fallback chain starting at `fr-CH` eventually reaches `und-Latn`.
#[test]
fn test_de_ascii_fallback() {
    let requested = "de-t-fr-ch-d0-ascii".parse().unwrap();
    let t = Transliterator::try_new_unstable(requested, &TestingProvider).unwrap();

    // The input mixes precomposed umlauts with a decomposed one (`a` + U+0308).
    let input =
        "Über ältere Lügner lästern ist sehr a\u{0308}rgerlich. Ja, SEHR ÄRGERLICH! - ꜵ";
    let output =
        "Ueber aeltere Luegner laestern ist sehr aergerlich. Ja, SEHR AERGERLICH! - ao";

    let actual = t.transliterate(input.to_string());
    assert_eq!(actual, output);
}

#[test]
fn test_override() {
#[derive(Debug)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ struct TestingProvider;

const _: () = {
use icu_normalizer_data::*;
use icu_locale_data::*;
mod icu {
pub(super) use super::icu_experimental as experimental;
pub(super) use icu_normalizer as normalizer;
pub(super) use icu_collections as collections;
pub(super) use icu_locale as locale;
}
self::make_provider!(TestingProvider);
impl_canonical_compositions_v1_marker!(TestingProvider);
Expand All @@ -17,5 +19,7 @@ const _: () = {
impl_compatibility_decomposition_supplement_v1_marker!(TestingProvider);
impl_compatibility_decomposition_tables_v1_marker!(TestingProvider);
impl_uts46_decomposition_supplement_v1_marker!(TestingProvider);
impl_parents_v1_marker!(TestingProvider);
impl_likely_subtags_for_language_v1_marker!(TestingProvider);
impl_transliterator_rules_v1!(TestingProvider);
};
4 changes: 2 additions & 2 deletions provider/source/src/transforms/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ impl DataProvider<TransliteratorRulesV1Marker> for SourceDataProvider {
.transforms()?
.lock()
.expect("poison")
.as_provider_unstable(self, self)?
.as_provider_unstable(self, self, self)?
.load(req)
}
}
Expand All @@ -135,7 +135,7 @@ impl crate::IterableDataProviderCached<TransliteratorRulesV1Marker> for SourceDa
.transforms()?
.lock()
.expect("poison")
.as_provider_unstable(self, self)?
.as_provider_unstable(self, self, self)?
.iter_ids()?
.into_iter()
.map(|id| id.as_borrowed().into_owned())
Expand Down
Loading