sudachi/dic/
character_category.rs

1/*
2 * Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use std::collections::BTreeSet;
18use std::fs;
19use std::io::{BufRead, BufReader};
20use std::iter::FusedIterator;
21use std::ops::Range;
22use std::path::Path;
23
24use thiserror::Error;
25
26use crate::dic::category_type::CategoryType;
27use crate::prelude::*;
28
29/// Sudachi error
30#[derive(Error, Debug, Eq, PartialEq)]
31#[non_exhaustive]
32pub enum Error {
33    #[error("Invalid format at line {0}")]
34    InvalidFormat(usize),
35
36    #[error("Invalid type {1} at line {0}")]
37    InvalidCategoryType(usize, String),
38
39    #[error("Multiple definition for type {1} at line {0}")]
40    MultipleTypeDefinition(usize, String),
41
42    #[error("Invalid character {0:X} at line {1}")]
43    InvalidChar(u32, usize),
44}
45
46#[derive(Debug, Clone, PartialEq, Eq)]
47struct CatRange {
48    begin: u32,
49    end: u32,
50    categories: CategoryType,
51}
52
53/// CharacterCategory holds mapping from character to character category type
54#[derive(Debug, Clone)]
55pub struct CharacterCategory {
56    /// Split the whole domain of codepoints into ranges,
57    /// limited by boundaries.
58    ///
59    /// Ranges are half-open: `[boundaries[i], boundaries[i + 1])`
60    /// meaning that the right bound is not included.
61    /// 0 and u32::MAX are not stored, they are included implicitly
62    /// as if they would have indices of `-1` and `boundaries.len()`.
63    boundaries: Vec<u32>,
64
65    /// Stores the category for each range.
66    /// `categories[i]` is for the range `[boundaries[i - 1], boundaries[i])`.
67    /// Plays well with [`std::slice::binary_search`], see [`get_category_types()`].
68    /// This should be always true: `boundaries.len() + 1 == categories.len()`.
69    categories: Vec<CategoryType>,
70}
71
72impl Default for CharacterCategory {
73    fn default() -> Self {
74        CharacterCategory {
75            boundaries: Vec::new(),
76            categories: vec![CategoryType::DEFAULT],
77        }
78    }
79}
80
81impl CharacterCategory {
82    /// Creates a character category from file
83    pub fn from_file(path: &Path) -> SudachiResult<CharacterCategory> {
84        let reader = BufReader::new(fs::File::open(path)?);
85        Self::from_reader(reader)
86    }
87
88    pub fn from_bytes(bytes: &[u8]) -> SudachiResult<CharacterCategory> {
89        let reader = BufReader::new(bytes);
90        Self::from_reader(reader)
91    }
92
93    pub fn from_reader<T: BufRead>(data: T) -> SudachiResult<CharacterCategory> {
94        let ranges = Self::read_character_definition(data)?;
95        Ok(Self::compile(&ranges))
96    }
97
98    /// Reads character type definition as a list of Ranges
99    ///
100    /// Definition file syntax:
101    ///     Each line contains [TARGET_CHARACTER_CODE_POINT] [TYPES], where
102    ///     TARGET_CHARACTER_CODE_POINT:
103    ///         a code_point in hexadecimal format or two separated by ".."
104    ///     TYPES:
105    ///         one or more Category_types separated by white space
106    ///     Loads only lines start with "0x" are loaded and ignore others
107    ///
108    /// Definition example:
109    ///     "0x0030..0x0039 NUMERIC"
110    ///     "0x3008         KANJI KANJINUMERIC"
111    fn read_character_definition<T: BufRead>(reader: T) -> SudachiResult<Vec<CatRange>> {
112        let mut ranges: Vec<CatRange> = Vec::new();
113        for (i, line) in reader.lines().enumerate() {
114            let line = line?;
115            let line = line.trim();
116            if line.is_empty() || line.starts_with('#') || !line.starts_with("0x") {
117                continue;
118            }
119
120            let cols: Vec<_> = line.split_whitespace().collect();
121            if cols.len() < 2 {
122                return Err(SudachiError::InvalidCharacterCategory(
123                    Error::InvalidFormat(i),
124                ));
125            }
126
127            let r: Vec<_> = cols[0].split("..").collect();
128            let begin = u32::from_str_radix(String::from(r[0]).trim_start_matches("0x"), 16)?;
129            let end = if r.len() > 1 {
130                u32::from_str_radix(String::from(r[1]).trim_start_matches("0x"), 16)? + 1
131            } else {
132                begin + 1
133            };
134            if begin >= end {
135                return Err(SudachiError::InvalidCharacterCategory(
136                    Error::InvalidFormat(i),
137                ));
138            }
139            if char::from_u32(begin).is_none() {
140                return Err(SudachiError::InvalidCharacterCategory(Error::InvalidChar(
141                    begin, i,
142                )));
143            }
144
145            if char::from_u32(end).is_none() {
146                return Err(SudachiError::InvalidCharacterCategory(Error::InvalidChar(
147                    end, i,
148                )));
149            }
150
151            let mut categories = CategoryType::empty();
152            for elem in cols[1..].iter().take_while(|elem| !elem.starts_with('#')) {
153                categories.insert(match elem.parse() {
154                    Ok(t) => t,
155                    Err(_) => {
156                        return Err(SudachiError::InvalidCharacterCategory(
157                            Error::InvalidCategoryType(i, elem.to_string()),
158                        ))
159                    }
160                });
161            }
162
163            ranges.push(CatRange {
164                begin,
165                end,
166                categories,
167            });
168        }
169
170        Ok(ranges)
171    }
172
173    /// Creates a character category from given range_list
174    ///
175    /// Transforms given range_list to non overlapped range list
176    /// to apply binary search in get_category_types
177    fn compile(ranges: &Vec<CatRange>) -> CharacterCategory {
178        if ranges.is_empty() {
179            return CharacterCategory::default();
180        }
181
182        let boundaries = Self::collect_boundaries(ranges);
183        let mut categories = vec![CategoryType::empty(); boundaries.len()];
184
185        for range in ranges {
186            let start_idx = match boundaries.binary_search(&range.begin) {
187                Ok(i) => i + 1,
188                Err(_) => panic!("there can not be not found boundaries"),
189            };
190            // apply category to all splits which are included in the current range
191            for i in start_idx..boundaries.len() {
192                if boundaries[i] > range.end {
193                    break;
194                }
195                categories[i] |= range.categories;
196            }
197        }
198
199        // first category is always default (it is impossible to get it assigned above)
200        debug_assert_eq!(categories[0], CategoryType::empty());
201        categories[0] = CategoryType::DEFAULT;
202        // merge successive ranges of the same category
203        let mut final_boundaries = Vec::with_capacity(boundaries.len());
204        let mut final_categories = Vec::with_capacity(categories.len());
205
206        let mut last_category = categories[0];
207        let mut last_boundary = boundaries[0];
208        for i in 1..categories.len() {
209            if categories[i] == last_category {
210                last_boundary = boundaries[i];
211                continue;
212            }
213            final_boundaries.push(last_boundary);
214            final_categories.push(last_category);
215            last_category = categories[i];
216            last_boundary = boundaries[i];
217        }
218
219        final_categories.push(last_category);
220        final_boundaries.push(last_boundary);
221
222        // replace empty categories with default
223        for cat in final_categories.iter_mut() {
224            if cat.is_empty() {
225                *cat = CategoryType::DEFAULT;
226            }
227        }
228
229        // and add the category after the last boundary
230        final_categories.push(CategoryType::DEFAULT);
231
232        final_boundaries.shrink_to_fit();
233        final_categories.shrink_to_fit();
234
235        CharacterCategory {
236            boundaries: final_boundaries,
237            categories: final_categories,
238        }
239    }
240
241    /// Find sorted list of all boundaries
242    fn collect_boundaries(data: &Vec<CatRange>) -> Vec<u32> {
243        let mut boundaries = BTreeSet::new();
244        for i in data {
245            boundaries.insert(i.begin);
246            boundaries.insert(i.end);
247        }
248        boundaries.into_iter().collect()
249    }
250
251    /// Returns a set of category types which given char has
252    pub fn get_category_types(&self, c: char) -> CategoryType {
253        if self.boundaries.is_empty() {
254            return CategoryType::DEFAULT;
255        }
256        let cint = c as u32;
257        match self.boundaries.binary_search(&cint) {
258            //Ok means the index in boundaries, so the next category
259            Ok(idx) => self.categories[idx + 1],
260            //Err means the insertion index, so the current category
261            Err(idx) => self.categories[idx],
262        }
263    }
264
265    pub fn iter(&self) -> CharCategoryIter {
266        CharCategoryIter {
267            categories: self,
268            current: 0,
269        }
270    }
271}
272
273pub struct CharCategoryIter<'a> {
274    categories: &'a CharacterCategory,
275    current: usize,
276}
277
278impl Iterator for CharCategoryIter<'_> {
279    type Item = (Range<char>, CategoryType);
280
281    fn next(&mut self) -> Option<Self::Item> {
282        if self.current == self.categories.boundaries.len() + 1 {
283            return None;
284        }
285
286        // char casts are safe, we are checking for correctness during the data load
287        let range = if self.current == self.categories.boundaries.len() {
288            let left = char::from_u32(*self.categories.boundaries.last().unwrap()).unwrap();
289            (left..char::MAX, *self.categories.categories.last().unwrap())
290        } else if self.current == 0 {
291            let right = char::from_u32(*self.categories.boundaries.first().unwrap()).unwrap();
292            let r = (0 as char)..right;
293            (r, self.categories.categories[0])
294        } else {
295            let left = char::from_u32(self.categories.boundaries[self.current - 1]).unwrap();
296            let right = char::from_u32(self.categories.boundaries[self.current]).unwrap();
297            let cat = self.categories.categories[self.current];
298            (left..right, cat)
299        };
300
301        self.current += 1;
302        Some(range)
303    }
304}
305
306impl FusedIterator for CharCategoryIter<'_> {}
307
308#[cfg(test)]
309mod tests {
310    use super::*;
311    use claim::assert_matches;
312    use std::path::PathBuf;
313
314    const TEST_RESOURCE_DIR: &str = "./tests/resources/";
315    const TEST_CHAR_DEF_FILE: &str = "char.def";
316
317    #[test]
318    fn get_category_types() {
319        let path = PathBuf::from(TEST_RESOURCE_DIR).join(TEST_CHAR_DEF_FILE);
320        let cat = CharacterCategory::from_file(&path).expect("failed to load char.def for test");
321        let cats = cat.get_category_types('熙');
322        assert_eq!(1, cats.count());
323        assert!(cats.contains(CategoryType::KANJI));
324    }
325
326    fn read_categories(data: &str) -> CharacterCategory {
327        let ranges = CharacterCategory::read_character_definition(data.as_bytes())
328            .expect("error when parsing character categories");
329        CharacterCategory::compile(&ranges)
330    }
331
332    type CT = CategoryType;
333
334    #[test]
335    fn read_cdef_1() {
336        let cat = read_categories(
337            "
338            0x0030..0x0039 NUMERIC
339            0x0032         KANJI",
340        );
341        assert_eq!(cat.get_category_types('\u{0030}'), CT::NUMERIC);
342        assert_eq!(cat.get_category_types('\u{0031}'), CT::NUMERIC);
343        assert_eq!(cat.get_category_types('\u{0032}'), CT::NUMERIC | CT::KANJI);
344        assert_eq!(cat.get_category_types('\u{0033}'), CT::NUMERIC);
345        assert_eq!(cat.get_category_types('\u{0039}'), CT::NUMERIC);
346    }
347
348    #[test]
349    fn read_cdef_2() {
350        let cat = read_categories(
351            "
352            0x0030..0x0039 NUMERIC
353            0x0070..0x0079 ALPHA
354            0x3007         KANJI
355            0x0030         KANJI",
356        );
357        assert_eq!(cat.get_category_types('\u{0030}'), CT::NUMERIC | CT::KANJI);
358        assert_eq!(cat.get_category_types('\u{0039}'), CT::NUMERIC);
359        assert_eq!(cat.get_category_types('\u{3007}'), CT::KANJI);
360        assert_eq!(cat.get_category_types('\u{0069}'), CT::DEFAULT);
361        assert_eq!(cat.get_category_types('\u{0070}'), CT::ALPHA);
362        assert_eq!(cat.get_category_types('\u{0080}'), CT::DEFAULT);
363    }
364
365    #[test]
366    fn read_cdef_3() {
367        let cat = read_categories(
368            "
369            0x0030..0x0039 KATAKANA
370            0x3007         KANJI KANJINUMERIC
371            0x3008         KANJI KANJINUMERIC
372            0x3009         KANJI KANJINUMERIC
373            0x0039..0x0040 ALPHA
374            0x0030..0x0039 NUMERIC
375            0x0030         KANJI",
376        );
377        assert_eq!(cat.get_category_types('\u{0029}'), CT::DEFAULT);
378        assert_eq!(
379            cat.get_category_types('\u{0030}'),
380            CT::NUMERIC | CT::KATAKANA | CT::KANJI
381        );
382        assert_eq!(
383            cat.get_category_types('\u{0039}'),
384            CT::NUMERIC | CT::ALPHA | CT::KATAKANA
385        );
386        assert_eq!(cat.get_category_types('\u{0040}'), CT::ALPHA);
387        assert_eq!(cat.get_category_types('\u{0041}'), CT::DEFAULT);
388        assert_eq!(
389            cat.get_category_types('\u{3007}'),
390            CT::KANJI | CT::KANJINUMERIC
391        );
392        assert_eq!(cat.get_category_types('\u{4007}'), CT::DEFAULT);
393    }
394
395    #[test]
396    fn read_cdef_4() {
397        let cat = read_categories(
398            "
399            0x4E00..0x9FFF KANJI
400            0x4E8C         KANJI KANJINUMERIC",
401        );
402        assert_eq!(cat.get_category_types('男'), CT::KANJI);
403        assert_eq!(cat.get_category_types('\u{4E8B}'), CT::KANJI);
404        assert_eq!(
405            cat.get_category_types('\u{4E8C}'),
406            CT::KANJI | CT::KANJINUMERIC
407        );
408        assert_eq!(cat.get_category_types('\u{4E8D}'), CT::KANJI);
409    }
410
411    #[test]
412    fn read_cdef_holes_1() {
413        let cat = read_categories(
414            "
415            0x0030 USER1
416            0x0032 USER2",
417        );
418        assert_eq!(cat.get_category_types('\u{0029}'), CT::DEFAULT);
419        assert_eq!(cat.get_category_types('\u{0030}'), CT::USER1);
420        assert_eq!(cat.get_category_types('\u{0031}'), CT::DEFAULT);
421        assert_eq!(cat.get_category_types('\u{0032}'), CT::USER2);
422        assert_eq!(cat.get_category_types('\u{0033}'), CT::DEFAULT);
423    }
424
425    #[test]
426    fn read_cdef_merge_1() {
427        let cat = read_categories(
428            "
429            0x0030 USER1
430            0x0031 USER1",
431        );
432        assert_eq!(cat.boundaries.len(), 2);
433        assert_eq!(cat.categories.len(), 3);
434        assert_eq!(cat.get_category_types('\u{0029}'), CT::DEFAULT);
435        assert_eq!(cat.get_category_types('\u{0030}'), CT::USER1);
436        assert_eq!(cat.get_category_types('\u{0031}'), CT::USER1);
437        assert_eq!(cat.get_category_types('\u{0032}'), CT::DEFAULT);
438    }
439
440    #[test]
441    fn read_cdef_merge_2() {
442        let cat = read_categories(
443            "
444            0x0030          USER1
445            0x0031..0x0032  USER1",
446        );
447        assert_eq!(cat.boundaries.len(), 2);
448        assert_eq!(cat.categories.len(), 3);
449        assert_eq!(cat.get_category_types('\u{0029}'), CT::DEFAULT);
450        assert_eq!(cat.get_category_types('\u{0030}'), CT::USER1);
451        assert_eq!(cat.get_category_types('\u{0031}'), CT::USER1);
452        assert_eq!(cat.get_category_types('\u{0032}'), CT::USER1);
453        assert_eq!(cat.get_category_types('\u{0033}'), CT::DEFAULT);
454    }
455
456    #[test]
457    fn read_cdef_merge_3() {
458        let cat = read_categories(
459            "
460            0x0030..0x0031  USER1
461            0x0032..0x0033  USER1",
462        );
463        assert_eq!(cat.boundaries.len(), 2);
464        assert_eq!(cat.categories.len(), 3);
465        assert_eq!(cat.get_category_types('\u{0029}'), CT::DEFAULT);
466        assert_eq!(cat.get_category_types('\u{0030}'), CT::USER1);
467        assert_eq!(cat.get_category_types('\u{0031}'), CT::USER1);
468        assert_eq!(cat.get_category_types('\u{0032}'), CT::USER1);
469        assert_eq!(cat.get_category_types('\u{0033}'), CT::USER1);
470        assert_eq!(cat.get_category_types('\u{0034}'), CT::DEFAULT);
471    }
472
473    #[test]
474    fn read_character_definition_with_invalid_format() {
475        let data = "0x0030..0x0039";
476        let result = CharacterCategory::read_character_definition(data.as_bytes());
477        assert_matches!(
478            result,
479            Err(SudachiError::InvalidCharacterCategory(
480                Error::InvalidFormat(0)
481            ))
482        );
483    }
484
485    #[test]
486    fn read_character_definition_with_invalid_range() {
487        let data = "0x0030..0x0029 NUMERIC";
488        let result = CharacterCategory::read_character_definition(data.as_bytes());
489        assert_matches!(
490            result,
491            Err(SudachiError::InvalidCharacterCategory(
492                Error::InvalidFormat(0)
493            ))
494        );
495    }
496
497    #[test]
498    fn read_character_definition_with_invalid_type() {
499        let data = "0x0030..0x0039 FOO";
500        let result = CharacterCategory::read_character_definition(data.as_bytes());
501        assert_matches!(result, Err(SudachiError::InvalidCharacterCategory(Error::InvalidCategoryType(0, s))) if s == "FOO");
502    }
503
504    #[test]
505    fn check_test_cdef() {
506        let data: &[u8] = include_bytes!("../../tests/resources/char.def");
507        let c = CharacterCategory::from_reader(data).expect("failed to read chars");
508        assert_eq!(c.get_category_types('â'), CT::ALPHA);
509        assert_eq!(c.get_category_types('b'), CT::ALPHA);
510        assert_eq!(c.get_category_types('C'), CT::ALPHA);
511        assert_eq!(c.get_category_types('漢'), CT::KANJI);
512        assert_eq!(c.get_category_types('𡈽'), CT::DEFAULT);
513        assert_eq!(c.get_category_types('ア'), CT::KATAKANA);
514        assert_eq!(c.get_category_types('コ'), CT::KATAKANA);
515        assert_eq!(c.get_category_types('゙'), CT::KATAKANA);
516    }
517
518    #[test]
519    fn iter_cdef_holes_1() {
520        let cat = read_categories(
521            "
522            0x0030 USER1
523            0x0032 USER2",
524        );
525        let mut iter = cat.iter();
526        assert_matches!(
527            iter.next(),
528            Some((
529                Range {
530                    start: '\x00',
531                    end: '\x30'
532                },
533                CT::DEFAULT
534            ))
535        );
536        assert_matches!(
537            iter.next(),
538            Some((
539                Range {
540                    start: '\x30',
541                    end: '\x31'
542                },
543                CT::USER1
544            ))
545        );
546        assert_matches!(
547            iter.next(),
548            Some((
549                Range {
550                    start: '\x31',
551                    end: '\x32'
552                },
553                CT::DEFAULT
554            ))
555        );
556        assert_matches!(
557            iter.next(),
558            Some((
559                Range {
560                    start: '\x32',
561                    end: '\x33'
562                },
563                CT::USER2
564            ))
565        );
566        assert_matches!(
567            iter.next(),
568            Some((
569                Range {
570                    start: '\x33',
571                    end: char::MAX
572                },
573                CT::DEFAULT
574            ))
575        );
576        assert_eq!(iter.next(), None);
577    }
578}