1use std::collections::BTreeSet;
18use std::fs;
19use std::io::{BufRead, BufReader};
20use std::iter::FusedIterator;
21use std::ops::Range;
22use std::path::Path;
23
24use thiserror::Error;
25
26use crate::dic::category_type::CategoryType;
27use crate::prelude::*;
28
29#[derive(Error, Debug, Eq, PartialEq)]
31#[non_exhaustive]
32pub enum Error {
33 #[error("Invalid format at line {0}")]
34 InvalidFormat(usize),
35
36 #[error("Invalid type {1} at line {0}")]
37 InvalidCategoryType(usize, String),
38
39 #[error("Multiple definition for type {1} at line {0}")]
40 MultipleTypeDefinition(usize, String),
41
42 #[error("Invalid character {0:X} at line {1}")]
43 InvalidChar(u32, usize),
44}
45
46#[derive(Debug, Clone, PartialEq, Eq)]
47struct CatRange {
48 begin: u32,
49 end: u32,
50 categories: CategoryType,
51}
52
53#[derive(Debug, Clone)]
55pub struct CharacterCategory {
56 boundaries: Vec<u32>,
64
65 categories: Vec<CategoryType>,
70}
71
72impl Default for CharacterCategory {
73 fn default() -> Self {
74 CharacterCategory {
75 boundaries: Vec::new(),
76 categories: vec![CategoryType::DEFAULT],
77 }
78 }
79}
80
81impl CharacterCategory {
82 pub fn from_file(path: &Path) -> SudachiResult<CharacterCategory> {
84 let reader = BufReader::new(fs::File::open(path)?);
85 Self::from_reader(reader)
86 }
87
88 pub fn from_bytes(bytes: &[u8]) -> SudachiResult<CharacterCategory> {
89 let reader = BufReader::new(bytes);
90 Self::from_reader(reader)
91 }
92
93 pub fn from_reader<T: BufRead>(data: T) -> SudachiResult<CharacterCategory> {
94 let ranges = Self::read_character_definition(data)?;
95 Ok(Self::compile(&ranges))
96 }
97
98 fn read_character_definition<T: BufRead>(reader: T) -> SudachiResult<Vec<CatRange>> {
112 let mut ranges: Vec<CatRange> = Vec::new();
113 for (i, line) in reader.lines().enumerate() {
114 let line = line?;
115 let line = line.trim();
116 if line.is_empty() || line.starts_with('#') || !line.starts_with("0x") {
117 continue;
118 }
119
120 let cols: Vec<_> = line.split_whitespace().collect();
121 if cols.len() < 2 {
122 return Err(SudachiError::InvalidCharacterCategory(
123 Error::InvalidFormat(i),
124 ));
125 }
126
127 let r: Vec<_> = cols[0].split("..").collect();
128 let begin = u32::from_str_radix(String::from(r[0]).trim_start_matches("0x"), 16)?;
129 let end = if r.len() > 1 {
130 u32::from_str_radix(String::from(r[1]).trim_start_matches("0x"), 16)? + 1
131 } else {
132 begin + 1
133 };
134 if begin >= end {
135 return Err(SudachiError::InvalidCharacterCategory(
136 Error::InvalidFormat(i),
137 ));
138 }
139 if char::from_u32(begin).is_none() {
140 return Err(SudachiError::InvalidCharacterCategory(Error::InvalidChar(
141 begin, i,
142 )));
143 }
144
145 if char::from_u32(end).is_none() {
146 return Err(SudachiError::InvalidCharacterCategory(Error::InvalidChar(
147 end, i,
148 )));
149 }
150
151 let mut categories = CategoryType::empty();
152 for elem in cols[1..].iter().take_while(|elem| !elem.starts_with('#')) {
153 categories.insert(match elem.parse() {
154 Ok(t) => t,
155 Err(_) => {
156 return Err(SudachiError::InvalidCharacterCategory(
157 Error::InvalidCategoryType(i, elem.to_string()),
158 ))
159 }
160 });
161 }
162
163 ranges.push(CatRange {
164 begin,
165 end,
166 categories,
167 });
168 }
169
170 Ok(ranges)
171 }
172
173 fn compile(ranges: &Vec<CatRange>) -> CharacterCategory {
178 if ranges.is_empty() {
179 return CharacterCategory::default();
180 }
181
182 let boundaries = Self::collect_boundaries(ranges);
183 let mut categories = vec![CategoryType::empty(); boundaries.len()];
184
185 for range in ranges {
186 let start_idx = match boundaries.binary_search(&range.begin) {
187 Ok(i) => i + 1,
188 Err(_) => panic!("there can not be not found boundaries"),
189 };
190 for i in start_idx..boundaries.len() {
192 if boundaries[i] > range.end {
193 break;
194 }
195 categories[i] |= range.categories;
196 }
197 }
198
199 debug_assert_eq!(categories[0], CategoryType::empty());
201 categories[0] = CategoryType::DEFAULT;
202 let mut final_boundaries = Vec::with_capacity(boundaries.len());
204 let mut final_categories = Vec::with_capacity(categories.len());
205
206 let mut last_category = categories[0];
207 let mut last_boundary = boundaries[0];
208 for i in 1..categories.len() {
209 if categories[i] == last_category {
210 last_boundary = boundaries[i];
211 continue;
212 }
213 final_boundaries.push(last_boundary);
214 final_categories.push(last_category);
215 last_category = categories[i];
216 last_boundary = boundaries[i];
217 }
218
219 final_categories.push(last_category);
220 final_boundaries.push(last_boundary);
221
222 for cat in final_categories.iter_mut() {
224 if cat.is_empty() {
225 *cat = CategoryType::DEFAULT;
226 }
227 }
228
229 final_categories.push(CategoryType::DEFAULT);
231
232 final_boundaries.shrink_to_fit();
233 final_categories.shrink_to_fit();
234
235 CharacterCategory {
236 boundaries: final_boundaries,
237 categories: final_categories,
238 }
239 }
240
241 fn collect_boundaries(data: &Vec<CatRange>) -> Vec<u32> {
243 let mut boundaries = BTreeSet::new();
244 for i in data {
245 boundaries.insert(i.begin);
246 boundaries.insert(i.end);
247 }
248 boundaries.into_iter().collect()
249 }
250
251 pub fn get_category_types(&self, c: char) -> CategoryType {
253 if self.boundaries.is_empty() {
254 return CategoryType::DEFAULT;
255 }
256 let cint = c as u32;
257 match self.boundaries.binary_search(&cint) {
258 Ok(idx) => self.categories[idx + 1],
260 Err(idx) => self.categories[idx],
262 }
263 }
264
265 pub fn iter(&self) -> CharCategoryIter {
266 CharCategoryIter {
267 categories: self,
268 current: 0,
269 }
270 }
271}
272
273pub struct CharCategoryIter<'a> {
274 categories: &'a CharacterCategory,
275 current: usize,
276}
277
278impl Iterator for CharCategoryIter<'_> {
279 type Item = (Range<char>, CategoryType);
280
281 fn next(&mut self) -> Option<Self::Item> {
282 if self.current == self.categories.boundaries.len() + 1 {
283 return None;
284 }
285
286 let range = if self.current == self.categories.boundaries.len() {
288 let left = char::from_u32(*self.categories.boundaries.last().unwrap()).unwrap();
289 (left..char::MAX, *self.categories.categories.last().unwrap())
290 } else if self.current == 0 {
291 let right = char::from_u32(*self.categories.boundaries.first().unwrap()).unwrap();
292 let r = (0 as char)..right;
293 (r, self.categories.categories[0])
294 } else {
295 let left = char::from_u32(self.categories.boundaries[self.current - 1]).unwrap();
296 let right = char::from_u32(self.categories.boundaries[self.current]).unwrap();
297 let cat = self.categories.categories[self.current];
298 (left..right, cat)
299 };
300
301 self.current += 1;
302 Some(range)
303 }
304}
305
306impl FusedIterator for CharCategoryIter<'_> {}
307
308#[cfg(test)]
309mod tests {
310 use super::*;
311 use claim::assert_matches;
312 use std::path::PathBuf;
313
314 const TEST_RESOURCE_DIR: &str = "./tests/resources/";
315 const TEST_CHAR_DEF_FILE: &str = "char.def";
316
317 #[test]
318 fn get_category_types() {
319 let path = PathBuf::from(TEST_RESOURCE_DIR).join(TEST_CHAR_DEF_FILE);
320 let cat = CharacterCategory::from_file(&path).expect("failed to load char.def for test");
321 let cats = cat.get_category_types('熙');
322 assert_eq!(1, cats.count());
323 assert!(cats.contains(CategoryType::KANJI));
324 }
325
326 fn read_categories(data: &str) -> CharacterCategory {
327 let ranges = CharacterCategory::read_character_definition(data.as_bytes())
328 .expect("error when parsing character categories");
329 CharacterCategory::compile(&ranges)
330 }
331
332 type CT = CategoryType;
333
334 #[test]
335 fn read_cdef_1() {
336 let cat = read_categories(
337 "
338 0x0030..0x0039 NUMERIC
339 0x0032 KANJI",
340 );
341 assert_eq!(cat.get_category_types('\u{0030}'), CT::NUMERIC);
342 assert_eq!(cat.get_category_types('\u{0031}'), CT::NUMERIC);
343 assert_eq!(cat.get_category_types('\u{0032}'), CT::NUMERIC | CT::KANJI);
344 assert_eq!(cat.get_category_types('\u{0033}'), CT::NUMERIC);
345 assert_eq!(cat.get_category_types('\u{0039}'), CT::NUMERIC);
346 }
347
348 #[test]
349 fn read_cdef_2() {
350 let cat = read_categories(
351 "
352 0x0030..0x0039 NUMERIC
353 0x0070..0x0079 ALPHA
354 0x3007 KANJI
355 0x0030 KANJI",
356 );
357 assert_eq!(cat.get_category_types('\u{0030}'), CT::NUMERIC | CT::KANJI);
358 assert_eq!(cat.get_category_types('\u{0039}'), CT::NUMERIC);
359 assert_eq!(cat.get_category_types('\u{3007}'), CT::KANJI);
360 assert_eq!(cat.get_category_types('\u{0069}'), CT::DEFAULT);
361 assert_eq!(cat.get_category_types('\u{0070}'), CT::ALPHA);
362 assert_eq!(cat.get_category_types('\u{0080}'), CT::DEFAULT);
363 }
364
365 #[test]
366 fn read_cdef_3() {
367 let cat = read_categories(
368 "
369 0x0030..0x0039 KATAKANA
370 0x3007 KANJI KANJINUMERIC
371 0x3008 KANJI KANJINUMERIC
372 0x3009 KANJI KANJINUMERIC
373 0x0039..0x0040 ALPHA
374 0x0030..0x0039 NUMERIC
375 0x0030 KANJI",
376 );
377 assert_eq!(cat.get_category_types('\u{0029}'), CT::DEFAULT);
378 assert_eq!(
379 cat.get_category_types('\u{0030}'),
380 CT::NUMERIC | CT::KATAKANA | CT::KANJI
381 );
382 assert_eq!(
383 cat.get_category_types('\u{0039}'),
384 CT::NUMERIC | CT::ALPHA | CT::KATAKANA
385 );
386 assert_eq!(cat.get_category_types('\u{0040}'), CT::ALPHA);
387 assert_eq!(cat.get_category_types('\u{0041}'), CT::DEFAULT);
388 assert_eq!(
389 cat.get_category_types('\u{3007}'),
390 CT::KANJI | CT::KANJINUMERIC
391 );
392 assert_eq!(cat.get_category_types('\u{4007}'), CT::DEFAULT);
393 }
394
395 #[test]
396 fn read_cdef_4() {
397 let cat = read_categories(
398 "
399 0x4E00..0x9FFF KANJI
400 0x4E8C KANJI KANJINUMERIC",
401 );
402 assert_eq!(cat.get_category_types('男'), CT::KANJI);
403 assert_eq!(cat.get_category_types('\u{4E8B}'), CT::KANJI);
404 assert_eq!(
405 cat.get_category_types('\u{4E8C}'),
406 CT::KANJI | CT::KANJINUMERIC
407 );
408 assert_eq!(cat.get_category_types('\u{4E8D}'), CT::KANJI);
409 }
410
411 #[test]
412 fn read_cdef_holes_1() {
413 let cat = read_categories(
414 "
415 0x0030 USER1
416 0x0032 USER2",
417 );
418 assert_eq!(cat.get_category_types('\u{0029}'), CT::DEFAULT);
419 assert_eq!(cat.get_category_types('\u{0030}'), CT::USER1);
420 assert_eq!(cat.get_category_types('\u{0031}'), CT::DEFAULT);
421 assert_eq!(cat.get_category_types('\u{0032}'), CT::USER2);
422 assert_eq!(cat.get_category_types('\u{0033}'), CT::DEFAULT);
423 }
424
425 #[test]
426 fn read_cdef_merge_1() {
427 let cat = read_categories(
428 "
429 0x0030 USER1
430 0x0031 USER1",
431 );
432 assert_eq!(cat.boundaries.len(), 2);
433 assert_eq!(cat.categories.len(), 3);
434 assert_eq!(cat.get_category_types('\u{0029}'), CT::DEFAULT);
435 assert_eq!(cat.get_category_types('\u{0030}'), CT::USER1);
436 assert_eq!(cat.get_category_types('\u{0031}'), CT::USER1);
437 assert_eq!(cat.get_category_types('\u{0032}'), CT::DEFAULT);
438 }
439
440 #[test]
441 fn read_cdef_merge_2() {
442 let cat = read_categories(
443 "
444 0x0030 USER1
445 0x0031..0x0032 USER1",
446 );
447 assert_eq!(cat.boundaries.len(), 2);
448 assert_eq!(cat.categories.len(), 3);
449 assert_eq!(cat.get_category_types('\u{0029}'), CT::DEFAULT);
450 assert_eq!(cat.get_category_types('\u{0030}'), CT::USER1);
451 assert_eq!(cat.get_category_types('\u{0031}'), CT::USER1);
452 assert_eq!(cat.get_category_types('\u{0032}'), CT::USER1);
453 assert_eq!(cat.get_category_types('\u{0033}'), CT::DEFAULT);
454 }
455
456 #[test]
457 fn read_cdef_merge_3() {
458 let cat = read_categories(
459 "
460 0x0030..0x0031 USER1
461 0x0032..0x0033 USER1",
462 );
463 assert_eq!(cat.boundaries.len(), 2);
464 assert_eq!(cat.categories.len(), 3);
465 assert_eq!(cat.get_category_types('\u{0029}'), CT::DEFAULT);
466 assert_eq!(cat.get_category_types('\u{0030}'), CT::USER1);
467 assert_eq!(cat.get_category_types('\u{0031}'), CT::USER1);
468 assert_eq!(cat.get_category_types('\u{0032}'), CT::USER1);
469 assert_eq!(cat.get_category_types('\u{0033}'), CT::USER1);
470 assert_eq!(cat.get_category_types('\u{0034}'), CT::DEFAULT);
471 }
472
473 #[test]
474 fn read_character_definition_with_invalid_format() {
475 let data = "0x0030..0x0039";
476 let result = CharacterCategory::read_character_definition(data.as_bytes());
477 assert_matches!(
478 result,
479 Err(SudachiError::InvalidCharacterCategory(
480 Error::InvalidFormat(0)
481 ))
482 );
483 }
484
485 #[test]
486 fn read_character_definition_with_invalid_range() {
487 let data = "0x0030..0x0029 NUMERIC";
488 let result = CharacterCategory::read_character_definition(data.as_bytes());
489 assert_matches!(
490 result,
491 Err(SudachiError::InvalidCharacterCategory(
492 Error::InvalidFormat(0)
493 ))
494 );
495 }
496
497 #[test]
498 fn read_character_definition_with_invalid_type() {
499 let data = "0x0030..0x0039 FOO";
500 let result = CharacterCategory::read_character_definition(data.as_bytes());
501 assert_matches!(result, Err(SudachiError::InvalidCharacterCategory(Error::InvalidCategoryType(0, s))) if s == "FOO");
502 }
503
504 #[test]
505 fn check_test_cdef() {
506 let data: &[u8] = include_bytes!("../../tests/resources/char.def");
507 let c = CharacterCategory::from_reader(data).expect("failed to read chars");
508 assert_eq!(c.get_category_types('â'), CT::ALPHA);
509 assert_eq!(c.get_category_types('b'), CT::ALPHA);
510 assert_eq!(c.get_category_types('C'), CT::ALPHA);
511 assert_eq!(c.get_category_types('漢'), CT::KANJI);
512 assert_eq!(c.get_category_types('𡈽'), CT::DEFAULT);
513 assert_eq!(c.get_category_types('ア'), CT::KATAKANA);
514 assert_eq!(c.get_category_types('コ'), CT::KATAKANA);
515 assert_eq!(c.get_category_types('゙'), CT::KATAKANA);
516 }
517
518 #[test]
519 fn iter_cdef_holes_1() {
520 let cat = read_categories(
521 "
522 0x0030 USER1
523 0x0032 USER2",
524 );
525 let mut iter = cat.iter();
526 assert_matches!(
527 iter.next(),
528 Some((
529 Range {
530 start: '\x00',
531 end: '\x30'
532 },
533 CT::DEFAULT
534 ))
535 );
536 assert_matches!(
537 iter.next(),
538 Some((
539 Range {
540 start: '\x30',
541 end: '\x31'
542 },
543 CT::USER1
544 ))
545 );
546 assert_matches!(
547 iter.next(),
548 Some((
549 Range {
550 start: '\x31',
551 end: '\x32'
552 },
553 CT::DEFAULT
554 ))
555 );
556 assert_matches!(
557 iter.next(),
558 Some((
559 Range {
560 start: '\x32',
561 end: '\x33'
562 },
563 CT::USER2
564 ))
565 );
566 assert_matches!(
567 iter.next(),
568 Some((
569 Range {
570 start: '\x33',
571 end: char::MAX
572 },
573 CT::DEFAULT
574 ))
575 );
576 assert_eq!(iter.next(), None);
577 }
578}