sudachi/dic/lexicon_set.rs

/*
 * Copyright (c) 2021-2024 Works Applications Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

use thiserror::Error;

use crate::dic::lexicon::word_infos::{WordInfo, WordInfoData};
use crate::dic::lexicon::{Lexicon, LexiconEntry, MAX_DICTIONARIES};
use crate::dic::subset::InfoSubset;
use crate::dic::word_id::WordId;
use crate::prelude::*;

/// Errors that can occur when building or querying a LexiconSet
#[derive(Error, Debug, Eq, PartialEq)]
pub enum LexiconSetError {
    #[error("too large word_id {0} in dict {1}")]
    TooLargeWordId(u32, usize),

    #[error("too large dictionary_id {0}")]
    TooLargeDictionaryId(usize),

    #[error("too many user dictionaries")]
    TooManyDictionaries,
}

/// Set of Lexicons
///
/// Handles multiple lexicons as a single lexicon.
/// The first lexicon in the list must be from the system dictionary.
pub struct LexiconSet<'a> {
    /// Lexicons of the set; index 0 is the system lexicon
    lexicons: Vec<Lexicon<'a>>,
    /// Starting POS id of each dictionary's user-defined POS in the merged grammar
    pos_offsets: Vec<usize>,
    /// Number of POS defined by the system grammar
    num_system_pos: usize,
}

impl<'a> LexiconSet<'a> {
    /// Creates a LexiconSet from a single lexicon
    ///
    /// It is assumed that the passed lexicon is the system dictionary
    pub fn new(mut system_lexicon: Lexicon, num_system_pos: usize) -> LexiconSet {
        system_lexicon.set_dic_id(0);
        LexiconSet {
            lexicons: vec![system_lexicon],
            pos_offsets: vec![0],
            num_system_pos,
        }
    }

    /// Adds a lexicon to the lexicon list
    ///
    /// `pos_offset`: starting POS id for this lexicon's user-defined POS
    /// in the merged grammar
    pub fn append(
        &mut self,
        mut lexicon: Lexicon<'a>,
        pos_offset: usize,
    ) -> Result<(), LexiconSetError> {
        if self.is_full() {
            return Err(LexiconSetError::TooManyDictionaries);
        }
        lexicon.set_dic_id(self.lexicons.len() as u8);
        self.lexicons.push(lexicon);
        self.pos_offsets.push(pos_offset);
        Ok(())
    }

    /// Returns whether the dictionary capacity is full
    pub fn is_full(&self) -> bool {
        self.lexicons.len() >= MAX_DICTIONARIES
    }
}
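
// Illustrative usage sketch (not part of this file): building a set from a
// loaded system lexicon and appending one user lexicon. The variables
// `system_lexicon`, `user_lexicon`, `system_pos_count`, and `grammar_pos_count`
// are assumptions standing in for already-loaded dictionary and grammar data.
//
//     let mut set = LexiconSet::new(system_lexicon, system_pos_count);
//     if !set.is_full() {
//         set.append(user_lexicon, grammar_pos_count)?;
//     }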

impl LexiconSet<'_> {
    /// Returns an iterator yielding all dictionary words found in `input` starting at byte `offset`
    ///
    /// Searches the dictionaries in reverse order: user dictionaries first, then the system dictionary
    #[inline]
    pub fn lookup<'b>(
        &'b self,
        input: &'b [u8],
        offset: usize,
    ) -> impl Iterator<Item = LexiconEntry> + 'b {
        // word_id fixup was moved to lexicon itself
        self.lexicons
            .iter()
            .rev()
            .flat_map(move |l| l.lookup(input, offset))
    }
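
    // Illustrative usage sketch (assumes a `set: &LexiconSet` and that
    // `LexiconEntry` carries the matched word id and the end byte offset of
    // the match in `input`):
    //
    //     let input = "東京都".as_bytes();
    //     for entry in set.lookup(input, 0) {
    //         let info = set.get_word_info(entry.word_id)?;
    //         // entry.end: assumed end byte offset of the matched surface
    //     }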

    /// Returns the WordInfo for the given WordId, with all fields filled
    pub fn get_word_info(&self, id: WordId) -> SudachiResult<WordInfo> {
        self.get_word_info_subset(id, InfoSubset::all())
    }

    /// Returns the WordInfo for the given WordId.
    /// Only the requested subset of fields is filled;
    /// the rest are left at their default values (0 or empty).
    pub fn get_word_info_subset(&self, id: WordId, subset: InfoSubset) -> SudachiResult<WordInfo> {
        let dict_id = id.dic();
        let mut word_info: WordInfoData = self.lexicons[dict_id as usize]
            .get_word_info(id.word(), subset)?
            .into();

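        // POS ids below `num_system_pos` refer to POS defined by the system
        // grammar; ids at or above it are user-defined and local to their
        // dictionary, so they are shifted to their position in the merged
        // grammar. Illustrative numbers: with num_system_pos = 8 and
        // pos_offsets[2] = 11, a local pos_id of 9 in dictionary 2 becomes
        // 9 - 8 + 11 = 12.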
        if subset.contains(InfoSubset::POS_ID) {
            let pos_id = word_info.pos_id as usize;
            if dict_id > 0 && pos_id >= self.num_system_pos {
                // user-defined part-of-speech
                word_info.pos_id =
                    (pos_id - self.num_system_pos + self.pos_offsets[dict_id as usize]) as u16;
            }
        }

        if subset.contains(InfoSubset::SPLIT_A) {
            Self::update_dict_id(&mut word_info.a_unit_split, dict_id)?;
        }

        if subset.contains(InfoSubset::SPLIT_B) {
            Self::update_dict_id(&mut word_info.b_unit_split, dict_id)?;
        }

        if subset.contains(InfoSubset::WORD_STRUCTURE) {
            Self::update_dict_id(&mut word_info.word_structure, dict_id)?;
        }

        Ok(word_info.into())
    }
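
    // Illustrative sketch: fetch only the POS id and the A-mode split for a
    // word id obtained from `lookup`, leaving the other WordInfo fields at
    // their defaults (the flag names below are the ones used in this file;
    // combining them with `|` assumes the usual bitflags-style API):
    //
    //     let info = set.get_word_info_subset(
    //         entry.word_id,
    //         InfoSubset::POS_ID | InfoSubset::SPLIT_A,
    //     )?;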

    /// Returns the word parameters (left id, right id, cost) for the given WordId
    pub fn get_word_param(&self, id: WordId) -> (i16, i16, i16) {
        let dic_id = id.dic() as usize;
        self.lexicons[dic_id].get_word_param(id.word())
    }

    /// Rewrites non-system (dic id > 0) word references in `split`
    /// so that they point into the dictionary `dict_id`
    fn update_dict_id(split: &mut Vec<WordId>, dict_id: u8) -> SudachiResult<()> {
        for id in split.iter_mut() {
            let cur_dict_id = id.dic();
            if cur_dict_id > 0 {
                // update the dictionary id if the referenced word is not in the system dictionary
                *id = WordId::checked(dict_id, id.word())?;
            }
        }
        Ok(())
    }

    /// Returns the total number of words in all lexicons of the set
    pub fn size(&self) -> u32 {
        self.lexicons.iter().fold(0, |acc, lex| acc + lex.size())
    }
}