@@ -83,12 +83,10 @@ impl From<Script> for ScriptExtension {
83
83
impl TryFrom < ScriptExtension > for Script {
84
84
type Error = ( ) ;
85
85
fn try_from ( ext : ScriptExtension ) -> Result < Self , ( ) > {
86
- if ext. is_common_or_inherited ( ) {
87
- if ext. common {
88
- Ok ( Script :: Common )
89
- } else {
90
- Ok ( Script :: Inherited )
91
- }
86
+ if ext. is_common ( ) {
87
+ Ok ( Script :: Common )
88
+ } else if ext. is_inherited ( ) {
89
+ Ok ( Script :: Inherited )
92
90
} else if ext. is_empty ( ) {
93
91
Ok ( Script :: Unknown )
94
92
} else {
@@ -131,94 +129,88 @@ impl fmt::Display for Script {
131
129
}
132
130
}
133
131
134
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
#[non_exhaustive]
/// A value for the `Script_Extension` property
///
/// [`ScriptExtension`] is one or more [`Script`]
///
/// This is essentially an optimized version of `Vec<Script>` that uses bitfields
pub struct ScriptExtension {
    // Bitset for scripts 0..=63: bit `n` stands for script number `n`.
    first: u64,
    // Bitset for scripts 64..=127: bit `n` stands for script number `64 + n`.
    second: u64,
    // Bitset for scripts 128..NEXT_SCRIPT, stored in the low bits.
    // The two most significant bits are flags rather than script numbers:
    //   * bit 62 (`1 << 62`, the 63rd bit) is set when Common is included
    //   * bit 63 (`1 << 63`, the 64th bit) is set when Inherited is included
    third: u64,
}
152
150
153
151
impl ScriptExtension {
154
152
// We don't use the complete u64 of `third`, so the "all" value is not just u32::MAX
155
153
// Instead, we take the number of the next (unused) script bit, subtract 128 to bring
156
154
// it in the range of `third`, create a u64 with just that bit set, and subtract 1
157
155
// to create one with all the lower bits set.
158
- const THIRD_MAX : u64 = ( ( 1 << ( NEXT_SCRIPT - 128 ) ) - 1 ) ;
156
+ const _CHECK: ( ) = assert ! ( NEXT_SCRIPT - 128 < 63 ) ;
157
+ const COMMON_MASK : u64 = ( 1 << 62 ) ; // 63rd bit
158
+ const INHERITED_MASK : u64 = ( 1 << 63 ) ; // 64th bit
159
159
160
160
pub ( crate ) const fn new ( first : u64 , second : u64 , third : u64 ) -> Self {
161
161
ScriptExtension {
162
162
first,
163
163
second,
164
164
third,
165
- common : false ,
166
165
}
167
166
}
168
167
168
+ /// Returns a ScriptExtension containing only Common.
169
169
pub ( crate ) const fn new_common ( ) -> Self {
170
170
ScriptExtension {
171
- first : u64:: MAX ,
172
- second : u64:: MAX ,
173
- third : Self :: THIRD_MAX ,
174
- common : true ,
171
+ first : 0 ,
172
+ second : 0 ,
173
+ third : Self :: COMMON_MASK ,
175
174
}
176
175
}
177
176
177
+ /// Returns a ScriptExtension containing only Inherited.
178
178
pub ( crate ) const fn new_inherited ( ) -> Self {
179
179
ScriptExtension {
180
- first : u64:: MAX ,
181
- second : u64:: MAX ,
182
- third : Self :: THIRD_MAX ,
183
- common : false ,
180
+ first : 0 ,
181
+ second : 0 ,
182
+ third : Self :: INHERITED_MASK ,
184
183
}
185
184
}
186
185
186
+ /// Returns an empty ScriptExtension
187
187
pub ( crate ) const fn new_unknown ( ) -> Self {
188
188
ScriptExtension {
189
189
first : 0 ,
190
190
second : 0 ,
191
191
third : 0 ,
192
- common : false ,
193
192
}
194
193
}
195
194
196
- const fn is_common_or_inherited ( self ) -> bool {
197
- ( self . first == u64:: MAX ) & ( self . second == u64:: MAX ) & ( self . third == Self :: THIRD_MAX )
198
- }
199
-
200
195
/// Checks if the script extension is Common
201
196
pub const fn is_common ( self ) -> bool {
202
- self . is_common_or_inherited ( ) & self . common
197
+ ( self . third & Self :: COMMON_MASK ) != 0
203
198
}
204
199
205
200
/// Checks if the script extension is Inherited
206
201
pub const fn is_inherited ( self ) -> bool {
207
- self . is_common_or_inherited ( ) & ! self . common
202
+ ( self . third & Self :: INHERITED_MASK ) != 0
208
203
}
209
204
210
205
/// Checks if the script extension is empty (unknown)
211
206
pub const fn is_empty ( self ) -> bool {
212
207
( self . first == 0 ) & ( self . second == 0 ) & ( self . third == 0 )
213
208
}
214
209
215
- /// Returns the number of scripts in the script extension
210
+ /// Returns the number of scripts in the script extension. Common and
211
+ /// Inherited, if present, are included and counted independently in the return value.
216
212
pub fn len ( self ) -> usize {
217
- if self . is_common_or_inherited ( ) {
218
- 1
219
- } else {
220
- ( self . first . count_ones ( ) + self . second . count_ones ( ) + self . third . count_ones ( ) ) as usize
221
- }
213
+ ( self . first . count_ones ( ) + self . second . count_ones ( ) + self . third . count_ones ( ) ) as usize
222
214
}
223
215
224
216
/// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
@@ -233,54 +225,47 @@ impl ScriptExtension {
233
225
234
226
/// Find the intersection between two ScriptExtensions. Returns Unknown if things
235
227
/// do not intersect.
236
- ///
237
- /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
238
- /// everything, the intersection of `Common` and `Inherited` is `Inherited`
239
228
pub const fn intersection ( self , other : Self ) -> Self {
240
229
let first = self . first & other. first ;
241
230
let second = self . second & other. second ;
242
231
let third = self . third & other. third ;
243
- let common = self . common & other. common ;
244
232
ScriptExtension {
245
233
first,
246
234
second,
247
235
third,
248
- common,
249
236
}
250
237
}
251
238
252
239
/// Find the union between two ScriptExtensions.
253
- ///
254
- /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
255
- /// everything, the union of `Common` and `Inherited` is `Common`
256
240
pub const fn union ( self , other : Self ) -> Self {
257
241
let first = self . first | other. first ;
258
242
let second = self . second | other. second ;
259
243
let third = self . third | other. third ;
260
- let common = self . common | other. common ;
261
244
ScriptExtension {
262
245
first,
263
246
second,
264
247
third,
265
- common,
266
248
}
267
249
}
268
250
251
+ /// Returns true if and only if all members of `self` are present in `other`.
252
+ pub fn is_subset_or_equal ( self , other : Self ) -> bool {
253
+ self . intersection ( other) == self && self . union ( other) == other
254
+ }
255
+
269
256
/// Check if this ScriptExtension contains the given script
270
- ///
271
- /// Should be used with specific scripts only, this will
272
- /// return `true` if `self` is not `Unknown` and `script` is
273
- /// `Common` or `Inherited`
274
257
pub fn contains_script ( self , script : Script ) -> bool {
275
258
!self . intersection ( script. into ( ) ) . is_empty ( )
276
259
}
277
260
278
- /// Get the intersection of script extensions of all characters
279
- /// in a string.
261
+ /// Get the script extension representing the union of all scripts for
262
+ /// the characters in a string.
263
+ ///
264
+ /// This is likely to decay to Unknown. You probably want to use `for_str_union()` instead.
280
265
pub fn for_str ( x : & str ) -> Self {
281
- let mut ext = ScriptExtension :: default ( ) ;
266
+ let mut ext = ScriptExtension :: new_unknown ( ) ;
282
267
for ch in x. chars ( ) {
283
- ext. intersect_with ( ch. into ( ) ) ;
268
+ ext = ext . union ( ch. into ( ) ) ;
284
269
}
285
270
ext
286
271
}
@@ -311,33 +296,23 @@ impl From<&'_ str> for ScriptExtension {
311
296
}
312
297
}
313
298
314
- impl fmt:: Debug for ScriptExtension {
315
- fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
316
- write ! ( f, "ScriptExtension(" ) ?;
317
- fmt:: Display :: fmt ( self , f) ?;
318
- write ! ( f, ")" )
319
- }
320
- }
321
-
322
299
impl fmt:: Display for ScriptExtension {
323
300
fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
324
- if self . is_common ( ) {
325
- write ! ( f, "Common" ) ?;
326
- } else if self . is_inherited ( ) {
327
- write ! ( f, "Inherited" ) ?;
328
- } else if self . is_empty ( ) {
301
+ write ! ( f, "ScriptExtension(" ) ?;
302
+ if self . is_empty ( ) {
329
303
write ! ( f, "Unknown" ) ?;
330
304
} else {
331
305
let mut first = true ;
332
306
for script in self . iter ( ) {
333
- if !first {
334
- write ! ( f, " + " ) ?;
307
+ if first {
335
308
first = false ;
309
+ } else {
310
+ write ! ( f, " + " ) ?;
336
311
}
337
312
script. full_name ( ) . fmt ( f) ?;
338
313
}
339
314
}
340
- Ok ( ( ) )
315
+ write ! ( f , ")" )
341
316
}
342
317
}
343
318
@@ -361,7 +336,7 @@ impl UnicodeScript for char {
361
336
362
337
/// Iterator over scripts in a [ScriptExtension].
///
/// Can be obtained via [ScriptExtension::iter()]
pub struct ScriptIterator {
    // Members not yet yielded; `next()` clears the bit of each script it returns.
    ext: ScriptExtension,
}
@@ -370,26 +345,31 @@ impl Iterator for ScriptIterator {
370
345
type Item = Script ;
371
346
372
347
fn next ( & mut self ) -> Option < Script > {
373
- if self . ext . is_common_or_inherited ( ) {
374
- let common = self . ext . common ;
375
- self . ext = ScriptExtension :: new_unknown ( ) ;
376
- if common {
377
- Some ( Script :: Common )
378
- } else {
379
- Some ( Script :: Inherited )
380
- }
348
+ if self . ext . is_inherited ( ) {
349
+ // If `self.ext` is both Inherited and Common, this
350
+ // temporarily constructs an invalid ScriptExtension. We don't
351
+ // use `self.ext` for anything other than iterating over bits,
352
+ // so this is okay.
353
+ self . ext . third &= !ScriptExtension :: INHERITED_MASK ;
354
+ Some ( Script :: Inherited )
355
+ } else if self . ext . is_common ( ) {
356
+ self . ext . third &= !ScriptExtension :: COMMON_MASK ;
357
+ Some ( Script :: Common )
358
+
381
359
// Are there bits left in the first chunk?
382
360
} else if self . ext . first != 0 {
383
361
// Find the next bit
384
362
let bit = self . ext . first . trailing_zeros ( ) ;
385
363
// unset just that bit
386
364
self . ext . first &= !( 1 << bit) ;
387
365
Some ( Script :: for_integer ( bit as u8 ) )
366
+
388
367
// Are there bits left in the second chunk?
389
368
} else if self . ext . second != 0 {
390
369
let bit = self . ext . second . trailing_zeros ( ) ;
391
370
self . ext . second &= !( 1 << bit) ;
392
371
Some ( Script :: for_integer ( 64 + bit as u8 ) )
372
+
393
373
// Are there bits left in the third chunk?
394
374
} else if self . ext . third != 0 {
395
375
let bit = self . ext . third . trailing_zeros ( ) ;
@@ -429,8 +409,8 @@ mod tests {
429
409
seen_scripts. insert ( script) ;
430
410
seen_exts. insert ( ext) ;
431
411
assert_eq ! ( script as u8 , bit) ;
432
- assert ! ( ! ScriptExtension :: new_common( ) . intersection( ext) . is_empty( ) ) ;
433
- assert ! ( ! ScriptExtension :: new_inherited( )
412
+ assert ! ( ScriptExtension :: new_common( ) . intersection( ext) . is_empty( ) ) ;
413
+ assert ! ( ScriptExtension :: new_inherited( )
434
414
. intersection( ext)
435
415
. is_empty( ) ) ;
436
416
assert ! ( ScriptExtension :: new_unknown( ) . intersection( ext) . is_empty( ) ) ;
@@ -443,13 +423,13 @@ mod tests {
443
423
fn test_specific ( ) {
444
424
let s = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे." ;
445
425
let ext = ScriptExtension :: for_str ( s) ;
446
- assert_eq ! ( ext , script_extensions:: DEVA ) ;
426
+ assert ! ( script_extensions:: DEVA . is_subset_or_equal ( ext ) ) ;
447
427
println ! (
448
- "{:? }" ,
428
+ "{}" ,
449
429
script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
450
430
) ;
451
431
println ! (
452
- "{:? }" ,
432
+ "{}" ,
453
433
ext. intersection(
454
434
script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
455
435
)
@@ -461,7 +441,9 @@ mod tests {
461
441
let u = ext. union ( Script :: Dogra . into ( ) ) ;
462
442
assert_eq ! (
463
443
u. intersection(
464
- script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
444
+ script_extensions:: COMMON . union (
445
+ script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
446
+ )
465
447
) ,
466
448
u
467
449
) ;
@@ -499,6 +481,68 @@ mod tests {
499
481
assert ! ( scr. is_err( ) ) ;
500
482
}
501
483
484
#[test]
fn test_subsets_and_iter() {
    // Each case pairs an extension with the scripts it was built from, listed
    // in the order `iter()` is expected to yield them.
    let inherited_common = ScriptExtension::new_inherited().union(script_extensions::COMMON);
    let plus_latin = inherited_common.union(script_extensions::LATIN);
    let plus_cyrillic = plus_latin.union(script_extensions::CYRILLIC);
    let cases: &[(ScriptExtension, &[Script])] = &[
        (ScriptExtension::new_inherited(), &[Script::Inherited]),
        (ScriptExtension::new_common(), &[Script::Common]),
        (inherited_common, &[Script::Inherited, Script::Common]),
        (
            plus_latin,
            &[Script::Inherited, Script::Common, Script::Latin],
        ),
        (
            plus_cyrillic,
            &[
                Script::Inherited,
                Script::Common,
                Script::Cyrillic,
                Script::Latin,
            ],
        ),
    ];
    for &(full_extension, component_scripts) in cases {
        // Strict-subset assertions only make sense when the extension has
        // more than one member.
        let has_several = component_scripts.len() > 1;
        for &script in component_scripts {
            assert!(full_extension.contains_script(script));
            let single: ScriptExtension = script.into();
            let intersect = full_extension.intersection(single);
            let union = full_extension.union(single);
            assert_eq!(intersect, single);
            assert_eq!(union, full_extension);

            assert!(single.is_subset_or_equal(single));
            assert!(single.is_subset_or_equal(intersect));
            assert!(single.is_subset_or_equal(full_extension));
            assert!(single.is_subset_or_equal(union));
            if has_several {
                assert!(!full_extension.is_subset_or_equal(single));
                assert!(!union.is_subset_or_equal(single));
            }

            assert!(intersect.is_subset_or_equal(intersect));
            assert!(intersect.is_subset_or_equal(full_extension));
            assert!(intersect.is_subset_or_equal(union));
            if has_several {
                assert!(!full_extension.is_subset_or_equal(intersect));
                assert!(!union.is_subset_or_equal(intersect));
            }

            assert!(union.is_subset_or_equal(union));
        }
        let scripts = component_scripts.to_vec();
        let scripts_iterated: Vec<_> = full_extension.iter().collect();
        assert_eq!(scripts, scripts_iterated);
    }
}
545
+
502
546
#[ cfg( feature = "bench" ) ]
503
547
#[ bench]
504
548
fn bench_script_intersection ( b : & mut Bencher ) {
0 commit comments