sudachi/dic/
mod.rs

1/*
2 * Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use std::path::Path;
18
19use crate::analysis::stateless_tokenizer::DictionaryAccess;
20use character_category::CharacterCategory;
21use grammar::Grammar;
22use header::Header;
23use lexicon::Lexicon;
24use lexicon_set::LexiconSet;
25
26use crate::plugin::input_text::InputTextPlugin;
27use crate::plugin::oov::OovProviderPlugin;
28use crate::plugin::path_rewrite::PathRewritePlugin;
29use crate::prelude::*;
30
31pub mod build;
32pub mod category_type;
33pub mod character_category;
34pub mod connect;
35pub mod dictionary;
36pub mod grammar;
37pub mod header;
38pub mod lexicon;
39pub mod lexicon_set;
40pub mod read;
41pub mod storage;
42pub mod subset;
43pub mod word_id;
44
/// Contents of the bundled `resources/char.def` file, embedded at compile time.
///
/// Used by `LoadedDictionary::from_system_dictionary_embedded` as the default
/// character category definition.
const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../resources/char.def");
// NOTE(review): presumably the number of components in a part-of-speech entry;
// it is not referenced in the visible part of this file, so confirm against its users.
const POS_DEPTH: usize = 6;
47
/// A dictionary consisting of one system dictionary and zero or more user dictionaries.
///
/// Both fields borrow from data with the lifetime `'a`.
pub struct LoadedDictionary<'a> {
    /// Grammar of the dictionary. The constructors take it from the system
    /// dictionary and attach a character category to it.
    pub grammar: Grammar<'a>,
    /// Set of lexicons, created from the system dictionary's lexicon.
    pub lexicon_set: LexiconSet<'a>,
}
53
54impl<'a> LoadedDictionary<'a> {
55    /// Creates a system dictionary from bytes, and preloaded character category
56    pub fn from_system_dictionary_and_chardef(
57        dictionary_bytes: &'a [u8],
58        character_category: CharacterCategory,
59    ) -> SudachiResult<LoadedDictionary<'a>> {
60        let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?;
61
62        let mut grammar = system_dict
63            .grammar
64            .ok_or(SudachiError::InvalidDictionaryGrammar)?;
65        grammar.set_character_category(character_category);
66
67        let num_system_pos = grammar.pos_list.len();
68        Ok(LoadedDictionary {
69            grammar,
70            lexicon_set: LexiconSet::new(system_dict.lexicon, num_system_pos),
71        })
72    }
73
74    /// Creates a system dictionary from bytes, and load a character category from file
75    pub fn from_system_dictionary(
76        dictionary_bytes: &'a [u8],
77        character_category_file: &Path,
78    ) -> SudachiResult<LoadedDictionary<'a>> {
79        let character_category = CharacterCategory::from_file(character_category_file)?;
80        Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
81    }
82
83    /// Creates a system dictionary from bytes, and load embedded default character category
84    pub fn from_system_dictionary_embedded(
85        dictionary_bytes: &'a [u8],
86    ) -> SudachiResult<LoadedDictionary<'a>> {
87        let character_category = CharacterCategory::from_bytes(DEFAULT_CHAR_DEF_BYTES)?;
88        Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
89    }
90
91    #[cfg(test)]
92    pub(crate) fn merge_dictionary(
93        mut self,
94        other: DictionaryLoader<'a>,
95    ) -> SudachiResult<LoadedDictionary> {
96        let npos = self.grammar.pos_list.len();
97        let lexicon = other.lexicon;
98        let grammar = other.grammar;
99        self.lexicon_set.append(lexicon, npos)?;
100        if let Some(g) = grammar {
101            self.grammar.merge(g)
102        }
103        Ok(self)
104    }
105}
106
107impl<'a> DictionaryAccess for LoadedDictionary<'a> {
108    fn grammar(&self) -> &Grammar<'a> {
109        &self.grammar
110    }
111
112    fn lexicon(&self) -> &LexiconSet<'a> {
113        &self.lexicon_set
114    }
115
116    fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>] {
117        &[]
118    }
119
120    fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>] {
121        &[]
122    }
123
124    fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>] {
125        &[]
126    }
127}
128
/// A single system or user dictionary
pub struct DictionaryLoader<'a> {
    /// Parsed dictionary header; its `version` tells system and user dictionaries apart.
    pub header: Header,
    /// Grammar section; `None` when the header reports that the dictionary has no grammar.
    pub grammar: Option<Grammar<'a>>,
    /// Lexicon section, parsed after the header and the optional grammar.
    pub lexicon: Lexicon<'a>,
}
135
136impl<'a> DictionaryLoader<'a> {
137    /// Creates a binary dictionary from bytes
138    ///
139    /// # Safety
140    /// This function is marked unsafe because it does not perform header validation
141    pub unsafe fn read_any_dictionary(dictionary_bytes: &[u8]) -> SudachiResult<DictionaryLoader> {
142        let header = Header::parse(&dictionary_bytes[..Header::STORAGE_SIZE])?;
143        let mut offset = Header::STORAGE_SIZE;
144
145        let grammar = if header.has_grammar() {
146            let tmp = Grammar::parse(dictionary_bytes, offset)?;
147            offset += tmp.storage_size;
148            Some(tmp)
149        } else {
150            None
151        };
152
153        let lexicon = Lexicon::parse(dictionary_bytes, offset, header.has_synonym_group_ids())?;
154
155        Ok(DictionaryLoader {
156            header,
157            grammar,
158            lexicon,
159        })
160    }
161
162    /// Creates a system binary dictionary from bytes
163    ///
164    /// Returns Err if header version is not match
165    pub fn read_system_dictionary(dictionary_bytes: &[u8]) -> SudachiResult<DictionaryLoader> {
166        let dict = unsafe { Self::read_any_dictionary(dictionary_bytes) }?;
167        match dict.header.version {
168            header::HeaderVersion::SystemDict(_) => Ok(dict),
169            _ => Err(SudachiError::InvalidHeader(
170                header::HeaderError::InvalidSystemDictVersion,
171            )),
172        }
173    }
174
175    /// Creates a user binary dictionary from bytes
176    ///
177    /// Returns Err if header version is not match
178    pub fn read_user_dictionary(dictionary_bytes: &[u8]) -> SudachiResult<DictionaryLoader> {
179        let dict = unsafe { Self::read_any_dictionary(dictionary_bytes) }?;
180        match dict.header.version {
181            header::HeaderVersion::UserDict(_) => Ok(dict),
182            _ => Err(SudachiError::InvalidHeader(
183                header::HeaderError::InvalidSystemDictVersion,
184            )),
185        }
186    }
187
188    pub fn to_loaded(self) -> Option<LoadedDictionary<'a>> {
189        let mut lexicon = self.lexicon;
190        lexicon.set_dic_id(0);
191        match self.grammar {
192            None => None,
193            Some(grammar) => {
194                let num_system_pos = grammar.pos_list.len();
195                Some(LoadedDictionary {
196                    grammar,
197                    lexicon_set: LexiconSet::new(lexicon, num_system_pos),
198                })
199            }
200        }
201    }
202}