Skip to content

Commit 80526e0

Browse files
committed
ScriptExtension now can represent combinations of scripts and Inherited/Common
After finding that the iterator yielded only a single element after a union with Common, I overhauled the representation and semantics of `ScriptExtension`. This is a breaking change for most APIs.

Summary of improvements to `ScriptExtension`:
* Improved the representation so it can track multiple scripts as well as Inherited/Common
* "Inherited" and "Common" no longer intersect with everything and have no subset/superset relationship between them
* `for_str` is a union, not an intersection, of all chars
* Added `is_subset_or_equal()` for easier comparison of unions and intersections
* Changed the `Debug` impl to a vanilla derive to allow comparing hex bits
* Fixed the `Display` impl to properly show each script, separated by pluses
* New test for the iterator
1 parent 1f84c2e commit 80526e0

File tree

1 file changed

+129
-85
lines changed

1 file changed

+129
-85
lines changed

src/lib.rs

Lines changed: 129 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,10 @@ impl From<Script> for ScriptExtension {
8383
impl TryFrom<ScriptExtension> for Script {
8484
type Error = ();
8585
fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
86-
if ext.is_common_or_inherited() {
87-
if ext.common {
88-
Ok(Script::Common)
89-
} else {
90-
Ok(Script::Inherited)
91-
}
86+
if ext.is_common() {
87+
Ok(Script::Common)
88+
} else if ext.is_inherited() {
89+
Ok(Script::Inherited)
9290
} else if ext.is_empty() {
9391
Ok(Script::Unknown)
9492
} else {
@@ -131,94 +129,88 @@ impl fmt::Display for Script {
131129
}
132130
}
133131

134-
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
132+
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
135133
#[non_exhaustive]
136134
/// A value for the `Script_Extension` property
137135
///
138136
/// [`ScriptExtension`] is one or more [`Script`]
139137
///
140138
/// This is essentially an optimized version of `Vec<Script>` that uses bitfields
141139
pub struct ScriptExtension {
142-
// A bitset for the first 64 scripts
140+
// A bitset for the first 64 scripts [0..64]
143141
first: u64,
144-
// A bitset for the scripts 65-128
142+
// A bitset for the scripts [64..128]
145143
second: u64,
146-
// A bitset for scripts after 128
144+
// A bitset for the scripts [128..NEXT_SCRIPT]
145+
// The top 2 bits indicate whether Common and Inherited are included
146+
// * Bit 62 (the 63rd bit, `1 << 62`) indicates whether it includes Common
147+
// * Bit 63 (the 64th bit, `1 << 63`) indicates whether it includes Inherited
147148
third: u64,
148-
// Both Common and Inherited are represented by all used bits being set,
149-
// this flag lets us distinguish the two.
150-
common: bool,
151149
}
152150

153151
impl ScriptExtension {
154152
// We don't use the complete u64 of `third`, so the "all" value is not just u64::MAX
155153
// Instead, we take the number of the next (unused) script bit, subtract 128 to bring
156154
// it in the range of `third`, create a u64 with just that bit set, and subtract 1
157155
// to create one with all the lower bits set.
158-
const THIRD_MAX: u64 = ((1 << (NEXT_SCRIPT - 128)) - 1);
156+
const _CHECK: () = assert!(NEXT_SCRIPT - 128 < 63);
157+
const COMMON_MASK: u64 = (1 << 62); // 63rd bit
158+
const INHERITED_MASK: u64 = (1 << 63); // 64th bit
159159

160160
pub(crate) const fn new(first: u64, second: u64, third: u64) -> Self {
161161
ScriptExtension {
162162
first,
163163
second,
164164
third,
165-
common: false,
166165
}
167166
}
168167

168+
/// Returns a ScriptExtension containing only Common.
169169
pub(crate) const fn new_common() -> Self {
170170
ScriptExtension {
171-
first: u64::MAX,
172-
second: u64::MAX,
173-
third: Self::THIRD_MAX,
174-
common: true,
171+
first: 0,
172+
second: 0,
173+
third: Self::COMMON_MASK,
175174
}
176175
}
177176

177+
/// Returns a ScriptExtension containing only Inherited.
178178
pub(crate) const fn new_inherited() -> Self {
179179
ScriptExtension {
180-
first: u64::MAX,
181-
second: u64::MAX,
182-
third: Self::THIRD_MAX,
183-
common: false,
180+
first: 0,
181+
second: 0,
182+
third: Self::INHERITED_MASK,
184183
}
185184
}
186185

186+
/// Returns an empty ScriptExtension
187187
pub(crate) const fn new_unknown() -> Self {
188188
ScriptExtension {
189189
first: 0,
190190
second: 0,
191191
third: 0,
192-
common: false,
193192
}
194193
}
195194

196-
const fn is_common_or_inherited(self) -> bool {
197-
(self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX)
198-
}
199-
200195
/// Checks if the script extension is Common
201196
pub const fn is_common(self) -> bool {
202-
self.is_common_or_inherited() & self.common
197+
(self.third & Self::COMMON_MASK) != 0
203198
}
204199

205200
/// Checks if the script extension is Inherited
206201
pub const fn is_inherited(self) -> bool {
207-
self.is_common_or_inherited() & !self.common
202+
(self.third & Self::INHERITED_MASK) != 0
208203
}
209204

210205
/// Checks if the script extension is empty (unknown)
211206
pub const fn is_empty(self) -> bool {
212207
(self.first == 0) & (self.second == 0) & (self.third == 0)
213208
}
214209

215-
/// Returns the number of scripts in the script extension
210+
/// Returns the number of scripts in the script extension. Common and
211+
/// Inherited, if present, are included and counted independently in the return value.
216212
pub fn len(self) -> usize {
217-
if self.is_common_or_inherited() {
218-
1
219-
} else {
220-
(self.first.count_ones() + self.second.count_ones() + self.third.count_ones()) as usize
221-
}
213+
(self.first.count_ones() + self.second.count_ones() + self.third.count_ones()) as usize
222214
}
223215

224216
/// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
@@ -233,54 +225,47 @@ impl ScriptExtension {
233225

234226
/// Find the intersection between two ScriptExtensions. Returns Unknown if things
235227
/// do not intersect.
236-
///
237-
/// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
238-
/// everything, the intersection of `Common` and `Inherited` is `Inherited`
239228
pub const fn intersection(self, other: Self) -> Self {
240229
let first = self.first & other.first;
241230
let second = self.second & other.second;
242231
let third = self.third & other.third;
243-
let common = self.common & other.common;
244232
ScriptExtension {
245233
first,
246234
second,
247235
third,
248-
common,
249236
}
250237
}
251238

252239
/// Find the union between two ScriptExtensions.
253-
///
254-
/// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
255-
/// everything, the union of `Common` and `Inherited` is `Common`
256240
pub const fn union(self, other: Self) -> Self {
257241
let first = self.first | other.first;
258242
let second = self.second | other.second;
259243
let third = self.third | other.third;
260-
let common = self.common | other.common;
261244
ScriptExtension {
262245
first,
263246
second,
264247
third,
265-
common,
266248
}
267249
}
268250

251+
/// Returns true if and only if all members of `self` are present in `other`.
252+
pub fn is_subset_or_equal(self, other: Self) -> bool {
253+
self.intersection(other) == self && self.union(other) == other
254+
}
255+
269256
/// Check if this ScriptExtension contains the given script
270-
///
271-
/// Should be used with specific scripts only, this will
272-
/// return `true` if `self` is not `Unknown` and `script` is
273-
/// `Common` or `Inherited`
274257
pub fn contains_script(self, script: Script) -> bool {
275258
!self.intersection(script.into()).is_empty()
276259
}
277260

278-
/// Get the intersection of script extensions of all characters
279-
/// in a string.
261+
/// Get the script extension representing the union of all scripts for
262+
/// the characters in a string.
263+
///
264+
/// This is likely to decay to Unknown. You probably want to use `for_str_union()` instead.
280265
pub fn for_str(x: &str) -> Self {
281-
let mut ext = ScriptExtension::default();
266+
let mut ext = ScriptExtension::new_unknown();
282267
for ch in x.chars() {
283-
ext.intersect_with(ch.into());
268+
ext = ext.union(ch.into());
284269
}
285270
ext
286271
}
@@ -311,33 +296,23 @@ impl From<&'_ str> for ScriptExtension {
311296
}
312297
}
313298

314-
impl fmt::Debug for ScriptExtension {
315-
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
316-
write!(f, "ScriptExtension(")?;
317-
fmt::Display::fmt(self, f)?;
318-
write!(f, ")")
319-
}
320-
}
321-
322299
impl fmt::Display for ScriptExtension {
323300
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
324-
if self.is_common() {
325-
write!(f, "Common")?;
326-
} else if self.is_inherited() {
327-
write!(f, "Inherited")?;
328-
} else if self.is_empty() {
301+
write!(f, "ScriptExtension(")?;
302+
if self.is_empty() {
329303
write!(f, "Unknown")?;
330304
} else {
331305
let mut first = true;
332306
for script in self.iter() {
333-
if !first {
334-
write!(f, " + ")?;
307+
if first {
335308
first = false;
309+
} else {
310+
write!(f, " + ")?;
336311
}
337312
script.full_name().fmt(f)?;
338313
}
339314
}
340-
Ok(())
315+
write!(f, ")")
341316
}
342317
}
343318

@@ -361,7 +336,7 @@ impl UnicodeScript for char {
361336

362337
/// Iterator over scripts in a [ScriptExtension].
363338
///
364-
/// Can be obtained ia [ScriptExtension::iter()]
339+
/// Can be obtained via [ScriptExtension::iter()]
365340
pub struct ScriptIterator {
366341
ext: ScriptExtension,
367342
}
@@ -370,26 +345,31 @@ impl Iterator for ScriptIterator {
370345
type Item = Script;
371346

372347
fn next(&mut self) -> Option<Script> {
373-
if self.ext.is_common_or_inherited() {
374-
let common = self.ext.common;
375-
self.ext = ScriptExtension::new_unknown();
376-
if common {
377-
Some(Script::Common)
378-
} else {
379-
Some(Script::Inherited)
380-
}
348+
if self.ext.is_inherited() {
349+
// If `self.ext` is both Inherited and Common, this
350+
// temporarily constructs an invalid ScriptExtension. We don't
351+
// use `self.ext` for anything other than iterating over bits,
352+
// so this is okay.
353+
self.ext.third &= !ScriptExtension::INHERITED_MASK;
354+
Some(Script::Inherited)
355+
} else if self.ext.is_common() {
356+
self.ext.third &= !ScriptExtension::COMMON_MASK;
357+
Some(Script::Common)
358+
381359
// Are there bits left in the first chunk?
382360
} else if self.ext.first != 0 {
383361
// Find the next bit
384362
let bit = self.ext.first.trailing_zeros();
385363
// unset just that bit
386364
self.ext.first &= !(1 << bit);
387365
Some(Script::for_integer(bit as u8))
366+
388367
// Are there bits left in the second chunk?
389368
} else if self.ext.second != 0 {
390369
let bit = self.ext.second.trailing_zeros();
391370
self.ext.second &= !(1 << bit);
392371
Some(Script::for_integer(64 + bit as u8))
372+
393373
// Are there bits left in the third chunk?
394374
} else if self.ext.third != 0 {
395375
let bit = self.ext.third.trailing_zeros();
@@ -429,8 +409,8 @@ mod tests {
429409
seen_scripts.insert(script);
430410
seen_exts.insert(ext);
431411
assert_eq!(script as u8, bit);
432-
assert!(!ScriptExtension::new_common().intersection(ext).is_empty());
433-
assert!(!ScriptExtension::new_inherited()
412+
assert!(ScriptExtension::new_common().intersection(ext).is_empty());
413+
assert!(ScriptExtension::new_inherited()
434414
.intersection(ext)
435415
.is_empty());
436416
assert!(ScriptExtension::new_unknown().intersection(ext).is_empty());
@@ -443,13 +423,13 @@ mod tests {
443423
fn test_specific() {
444424
let s = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";
445425
let ext = ScriptExtension::for_str(s);
446-
assert_eq!(ext, script_extensions::DEVA);
426+
assert!(script_extensions::DEVA.is_subset_or_equal(ext));
447427
println!(
448-
"{:?}",
428+
"{}",
449429
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
450430
);
451431
println!(
452-
"{:?}",
432+
"{}",
453433
ext.intersection(
454434
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
455435
)
@@ -461,7 +441,9 @@ mod tests {
461441
let u = ext.union(Script::Dogra.into());
462442
assert_eq!(
463443
u.intersection(
464-
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
444+
script_extensions::COMMON.union(
445+
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
446+
)
465447
),
466448
u
467449
);
@@ -499,6 +481,68 @@ mod tests {
499481
assert!(scr.is_err());
500482
}
501483

484+
#[test]
485+
fn test_subsets_and_iter() {
486+
let cases: &[(ScriptExtension, &[Script])] = &[
487+
(ScriptExtension::new_inherited(), &[Script::Inherited]),
488+
(ScriptExtension::new_common(), &[Script::Common]),
489+
(
490+
ScriptExtension::new_inherited().union(script_extensions::COMMON),
491+
&[Script::Inherited, Script::Common],
492+
),
493+
(
494+
ScriptExtension::new_inherited()
495+
.union(script_extensions::COMMON)
496+
.union(script_extensions::LATIN),
497+
&[Script::Inherited, Script::Common, Script::Latin],
498+
),
499+
(
500+
ScriptExtension::new_inherited()
501+
.union(script_extensions::COMMON)
502+
.union(script_extensions::LATIN)
503+
.union(script_extensions::CYRILLIC),
504+
&[
505+
Script::Inherited,
506+
Script::Common,
507+
Script::Cyrillic,
508+
Script::Latin,
509+
],
510+
),
511+
];
512+
for &(full_extension, component_scripts) in cases {
513+
for &script in component_scripts.iter() {
514+
assert!(full_extension.contains_script(script));
515+
let cur = script.into();
516+
let intersect = full_extension.intersection(cur);
517+
let union = full_extension.union(cur);
518+
assert_eq!(intersect, cur);
519+
assert_eq!(union, full_extension);
520+
521+
assert!(cur.is_subset_or_equal(cur));
522+
assert!(cur.is_subset_or_equal(intersect));
523+
assert!(cur.is_subset_or_equal(full_extension));
524+
assert!(cur.is_subset_or_equal(union));
525+
if component_scripts.len() > 1 {
526+
assert!(!full_extension.is_subset_or_equal(cur));
527+
assert!(!union.is_subset_or_equal(cur));
528+
}
529+
530+
assert!(intersect.is_subset_or_equal(intersect));
531+
assert!(intersect.is_subset_or_equal(full_extension));
532+
assert!(intersect.is_subset_or_equal(union));
533+
if component_scripts.len() > 1 {
534+
assert!(!full_extension.is_subset_or_equal(intersect));
535+
assert!(!union.is_subset_or_equal(intersect));
536+
}
537+
538+
assert!(union.is_subset_or_equal(union));
539+
}
540+
let scripts = component_scripts.iter().cloned().collect::<Vec<_>>();
541+
let scripts_iterated = full_extension.iter().collect::<Vec<_>>();
542+
assert_eq!(scripts, scripts_iterated);
543+
}
544+
}
545+
502546
#[cfg(feature = "bench")]
503547
#[bench]
504548
fn bench_script_intersection(b: &mut Bencher) {

0 commit comments

Comments
 (0)