sudachi/dic/
dictionary.rs1use std::fs::File;
18use std::path::Path;
19
20use memmap2::Mmap;
21
22use crate::analysis::stateless_tokenizer::DictionaryAccess;
23use crate::config::Config;
24use crate::dic::grammar::Grammar;
25use crate::dic::lexicon_set::LexiconSet;
26use crate::dic::storage::{Storage, SudachiDicData};
27use crate::dic::{DictionaryLoader, LoadedDictionary};
28use crate::error::{SudachiError, SudachiResult};
29use crate::plugin::input_text::InputTextPlugin;
30use crate::plugin::oov::OovProviderPlugin;
31use crate::plugin::path_rewrite::PathRewritePlugin;
32use crate::plugin::Plugins;
33
34pub struct JapaneseDictionary {
44    storage: SudachiDicData,
45    plugins: Plugins,
46    _grammar: Grammar<'static>,
48    _lexicon: LexiconSet<'static>,
50}
51
52fn map_file(path: &Path) -> SudachiResult<Storage> {
53    let file = File::open(path)?;
54    let mapping = unsafe { Mmap::map(&file) }?;
55    Ok(Storage::File(mapping))
56}
57
58fn load_system_dic(cfg: &Config) -> SudachiResult<Storage> {
59    let p = cfg.resolved_system_dict()?;
60    map_file(&p).map_err(|e| e.with_context(p.as_os_str().to_string_lossy()))
61}
62
63impl JapaneseDictionary {
64    pub fn from_cfg(cfg: &Config) -> SudachiResult<JapaneseDictionary> {
67        let mut sb = SudachiDicData::new(load_system_dic(cfg)?);
68
69        for udic in cfg.resolved_user_dicts()? {
70            sb.add_user(
71                map_file(&udic).map_err(|e| e.with_context(udic.as_os_str().to_string_lossy()))?,
72            )
73        }
74
75        Self::from_cfg_storage(cfg, sb)
76    }
77
78    pub fn from_cfg_storage(
80        cfg: &Config,
81        storage: SudachiDicData,
82    ) -> SudachiResult<JapaneseDictionary> {
83        let mut basic_dict = LoadedDictionary::from_system_dictionary(
84            unsafe { storage.system_static_slice() },
85            cfg.complete_path(&cfg.character_definition_file)?.as_path(),
86        )?;
87
88        let plugins = {
89            let grammar = &mut basic_dict.grammar;
90            Plugins::load(cfg, grammar)?
91        };
92
93        if plugins.oov.is_empty() {
94            return Err(SudachiError::NoOOVPluginProvided);
95        }
96
97        for p in plugins.connect_cost.plugins() {
98            p.edit(&mut basic_dict.grammar);
99        }
100
101        let mut dic = JapaneseDictionary {
102            storage,
103            plugins,
104            _grammar: basic_dict.grammar,
105            _lexicon: basic_dict.lexicon_set,
106        };
107
108        let user_dicts: Vec<_> = dic.storage.user_static_slice();
110        for udic in user_dicts {
111            dic = dic.merge_user_dictionary(udic)?;
112        }
113
114        Ok(dic)
115    }
116
117    pub fn from_cfg_storage_with_embedded_chardef(
119        cfg: &Config,
120        storage: SudachiDicData,
121    ) -> SudachiResult<JapaneseDictionary> {
122        let mut basic_dict = LoadedDictionary::from_system_dictionary_embedded(unsafe {
123            storage.system_static_slice()
124        })?;
125
126        let plugins = {
127            let grammar = &mut basic_dict.grammar;
128            Plugins::load(cfg, grammar)?
129        };
130
131        if plugins.oov.is_empty() {
132            return Err(SudachiError::NoOOVPluginProvided);
133        }
134
135        for p in plugins.connect_cost.plugins() {
136            p.edit(&mut basic_dict.grammar);
137        }
138
139        let mut dic = JapaneseDictionary {
140            storage,
141            plugins,
142            _grammar: basic_dict.grammar,
143            _lexicon: basic_dict.lexicon_set,
144        };
145
146        let user_dicts: Vec<_> = dic.storage.user_static_slice();
148        for udic in user_dicts {
149            dic = dic.merge_user_dictionary(udic)?;
150        }
151
152        Ok(dic)
153    }
154
155    pub fn grammar(&self) -> &Grammar<'_> {
157        &self._grammar
158    }
159
160    pub fn lexicon(&self) -> &LexiconSet<'_> {
162        &self._lexicon
163    }
164
165    fn merge_user_dictionary(mut self, dictionary_bytes: &'static [u8]) -> SudachiResult<Self> {
166        let user_dict = DictionaryLoader::read_user_dictionary(dictionary_bytes)?;
167
168        let mut user_lexicon = user_dict.lexicon;
170        user_lexicon.update_cost(&self)?;
171
172        self._lexicon
173            .append(user_lexicon, self._grammar.pos_list.len())?;
174
175        if let Some(g) = user_dict.grammar {
176            self._grammar.merge(g);
177        }
178
179        Ok(self)
180    }
181}
182
183impl DictionaryAccess for JapaneseDictionary {
184    fn grammar(&self) -> &Grammar<'_> {
185        self.grammar()
186    }
187
188    fn lexicon(&self) -> &LexiconSet<'_> {
189        self.lexicon()
190    }
191
192    fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>] {
193        self.plugins.input_text.plugins()
194    }
195
196    fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>] {
197        self.plugins.oov.plugins()
198    }
199
200    fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>] {
201        self.plugins.path_rewrite.plugins()
202    }
203}