sudachi/dic/
dictionary.rs

1/*
2 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use std::fs::File;
18use std::path::Path;
19
20use memmap2::Mmap;
21
22use crate::analysis::stateless_tokenizer::DictionaryAccess;
23use crate::config::Config;
24use crate::dic::grammar::Grammar;
25use crate::dic::lexicon_set::LexiconSet;
26use crate::dic::storage::{Storage, SudachiDicData};
27use crate::dic::{DictionaryLoader, LoadedDictionary};
28use crate::error::{SudachiError, SudachiResult};
29use crate::plugin::input_text::InputTextPlugin;
30use crate::plugin::oov::OovProviderPlugin;
31use crate::plugin::path_rewrite::PathRewritePlugin;
32use crate::plugin::Plugins;
33
34// It is self-referential struct with 'static lifetime as a workaround
35// for the impossibility to specify the correct lifetime for
36// those fields. Accessor functions always provide the correct lifetime,
37// tied to the lifetime of the struct itself.
38// It is safe to move this structure around because the
39// pointers from memory mappings themselves are stable and
40// will not change if the structure will be moved around.
41// This structure is always read only after creation and is safe to share
42// between threads.
43pub struct JapaneseDictionary {
44    storage: SudachiDicData,
45    plugins: Plugins,
46    //'static is a a lie, lifetime is the same with StorageBackend
47    _grammar: Grammar<'static>,
48    //'static is a a lie, lifetime is the same with StorageBackend
49    _lexicon: LexiconSet<'static>,
50}
51
52fn map_file(path: &Path) -> SudachiResult<Storage> {
53    let file = File::open(path)?;
54    let mapping = unsafe { Mmap::map(&file) }?;
55    Ok(Storage::File(mapping))
56}
57
58fn load_system_dic(cfg: &Config) -> SudachiResult<Storage> {
59    let p = cfg.resolved_system_dict()?;
60    map_file(&p).map_err(|e| e.with_context(p.as_os_str().to_string_lossy()))
61}
62
63impl JapaneseDictionary {
64    /// Creates a dictionary from the specified configuration
65    /// Dictionaries will be read from disk
66    pub fn from_cfg(cfg: &Config) -> SudachiResult<JapaneseDictionary> {
67        let mut sb = SudachiDicData::new(load_system_dic(cfg)?);
68
69        for udic in cfg.resolved_user_dicts()? {
70            sb.add_user(
71                map_file(&udic).map_err(|e| e.with_context(udic.as_os_str().to_string_lossy()))?,
72            )
73        }
74
75        Self::from_cfg_storage(cfg, sb)
76    }
77
78    /// Creates a dictionary from the specified configuration and storage
79    pub fn from_cfg_storage(
80        cfg: &Config,
81        storage: SudachiDicData,
82    ) -> SudachiResult<JapaneseDictionary> {
83        let mut basic_dict = LoadedDictionary::from_system_dictionary(
84            unsafe { storage.system_static_slice() },
85            cfg.complete_path(&cfg.character_definition_file)?.as_path(),
86        )?;
87
88        let plugins = {
89            let grammar = &mut basic_dict.grammar;
90            Plugins::load(cfg, grammar)?
91        };
92
93        if plugins.oov.is_empty() {
94            return Err(SudachiError::NoOOVPluginProvided);
95        }
96
97        for p in plugins.connect_cost.plugins() {
98            p.edit(&mut basic_dict.grammar);
99        }
100
101        let mut dic = JapaneseDictionary {
102            storage,
103            plugins,
104            _grammar: basic_dict.grammar,
105            _lexicon: basic_dict.lexicon_set,
106        };
107
108        // this Vec is needed to prevent double borrowing of dic
109        let user_dicts: Vec<_> = dic.storage.user_static_slice();
110        for udic in user_dicts {
111            dic = dic.merge_user_dictionary(udic)?;
112        }
113
114        Ok(dic)
115    }
116
117    /// Creates a dictionary from the specified configuration and storage, with embedded character definition
118    pub fn from_cfg_storage_with_embedded_chardef(
119        cfg: &Config,
120        storage: SudachiDicData,
121    ) -> SudachiResult<JapaneseDictionary> {
122        let mut basic_dict = LoadedDictionary::from_system_dictionary_embedded(unsafe {
123            storage.system_static_slice()
124        })?;
125
126        let plugins = {
127            let grammar = &mut basic_dict.grammar;
128            Plugins::load(cfg, grammar)?
129        };
130
131        if plugins.oov.is_empty() {
132            return Err(SudachiError::NoOOVPluginProvided);
133        }
134
135        for p in plugins.connect_cost.plugins() {
136            p.edit(&mut basic_dict.grammar);
137        }
138
139        let mut dic = JapaneseDictionary {
140            storage,
141            plugins,
142            _grammar: basic_dict.grammar,
143            _lexicon: basic_dict.lexicon_set,
144        };
145
146        // this Vec is needed to prevent double borrowing of dic
147        let user_dicts: Vec<_> = dic.storage.user_static_slice();
148        for udic in user_dicts {
149            dic = dic.merge_user_dictionary(udic)?;
150        }
151
152        Ok(dic)
153    }
154
155    /// Returns grammar with the correct lifetime
156    pub fn grammar(&self) -> &Grammar<'_> {
157        &self._grammar
158    }
159
160    /// Returns lexicon with the correct lifetime
161    pub fn lexicon(&self) -> &LexiconSet<'_> {
162        &self._lexicon
163    }
164
165    fn merge_user_dictionary(mut self, dictionary_bytes: &'static [u8]) -> SudachiResult<Self> {
166        let user_dict = DictionaryLoader::read_user_dictionary(dictionary_bytes)?;
167
168        // we need to update lexicon first, since it needs the current number of pos
169        let mut user_lexicon = user_dict.lexicon;
170        user_lexicon.update_cost(&self)?;
171
172        self._lexicon
173            .append(user_lexicon, self._grammar.pos_list.len())?;
174
175        if let Some(g) = user_dict.grammar {
176            self._grammar.merge(g);
177        }
178
179        Ok(self)
180    }
181}
182
183impl DictionaryAccess for JapaneseDictionary {
184    fn grammar(&self) -> &Grammar<'_> {
185        self.grammar()
186    }
187
188    fn lexicon(&self) -> &LexiconSet<'_> {
189        self.lexicon()
190    }
191
192    fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>] {
193        self.plugins.input_text.plugins()
194    }
195
196    fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>] {
197        self.plugins.oov.plugins()
198    }
199
200    fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>] {
201        self.plugins.path_rewrite.plugins()
202    }
203}