// sudachi/dic/dictionary.rs
use std::fs::File;
use std::path::Path;
use memmap2::Mmap;
use crate::analysis::stateless_tokenizer::DictionaryAccess;
use crate::config::Config;
use crate::dic::grammar::Grammar;
use crate::dic::lexicon_set::LexiconSet;
use crate::dic::storage::{Storage, SudachiDicData};
use crate::dic::{DictionaryLoader, LoadedDictionary};
use crate::error::{SudachiError, SudachiResult};
use crate::plugin::input_text::InputTextPlugin;
use crate::plugin::oov::OovProviderPlugin;
use crate::plugin::path_rewrite::PathRewritePlugin;
use crate::plugin::Plugins;
/// A dictionary assembled from a system dictionary, optional user
/// dictionaries and the plugins loaded from a `Config`.
///
/// `_grammar` and `_lexicon` are typed with a `'static` lifetime, yet the
/// constructors in this file build them from byte slices obtained from
/// `storage` through `unsafe` calls to `system_static_slice`.
/// NOTE(review): `'static` therefore looks like a stand-in for "as long as
/// `storage` is alive" — confirm against `SudachiDicData`. The `grammar()` and
/// `lexicon()` accessors return references whose lifetime is tied to `&self`.
///
/// NOTE(review): struct fields are dropped in declaration order, so `storage`
/// is dropped before `_grammar` and `_lexicon`; verify before reordering fields.
pub struct JapaneseDictionary {
// Owns the system dictionary data plus any user dictionaries added to it.
storage: SudachiDicData,
// Input text, OOV, path rewrite and connection cost plugins.
plugins: Plugins,
// Grammar of the system dictionary, extended by user dictionary grammars.
_grammar: Grammar<'static>,
// Lexicon of the system dictionary with user lexicons appended.
_lexicon: LexiconSet<'static>,
}
/// Opens the file at `path` and memory maps it, wrapping the mapping in
/// [`Storage::File`].
///
/// # Errors
/// Returns an error if the file cannot be opened or mapped.
fn map_file(path: &Path) -> SudachiResult<Storage> {
    // SAFETY: NOTE(review) — `Mmap::map` is unsafe because the underlying file
    // may be changed while it is mapped; this presumably relies on dictionary
    // files not being modified while in use. Confirm with the callers.
    let mapped = File::open(path).and_then(|handle| unsafe { Mmap::map(&handle) })?;
    Ok(Storage::File(mapped))
}
/// Resolves the system dictionary path from `cfg` and memory maps that file.
///
/// # Errors
/// Returns an error if the path cannot be resolved. A mapping failure is
/// returned with the (lossily converted) dictionary path attached as context.
fn load_system_dic(cfg: &Config) -> SudachiResult<Storage> {
    let dict_path = cfg.resolved_system_dict()?;
    match map_file(&dict_path) {
        Ok(storage) => Ok(storage),
        Err(err) => Err(err.with_context(dict_path.as_os_str().to_string_lossy())),
    }
}
impl JapaneseDictionary {
    /// Creates a dictionary from the files referenced by `cfg`.
    ///
    /// The system dictionary and every user dictionary resolved from the
    /// config are memory mapped and then passed to [`Self::from_cfg_storage`].
    ///
    /// # Errors
    /// Returns an error if a dictionary path cannot be resolved, if a file
    /// cannot be mapped (the error carries the file path as context), or if
    /// [`Self::from_cfg_storage`] fails.
    pub fn from_cfg(cfg: &Config) -> SudachiResult<JapaneseDictionary> {
        let mut data = SudachiDicData::new(load_system_dic(cfg)?);
        for udic in cfg.resolved_user_dicts()? {
            let mapped = map_file(&udic)
                .map_err(|e| e.with_context(udic.as_os_str().to_string_lossy()))?;
            data.add_user(mapped);
        }
        Self::from_cfg_storage(cfg, data)
    }

    /// Creates a dictionary from already loaded `storage`, reading the
    /// character definition from the file configured in `cfg`.
    ///
    /// # Errors
    /// Returns an error if the character definition path cannot be completed,
    /// if the system dictionary cannot be loaded, if loading plugins fails,
    /// if no OOV plugin is configured
    /// ([`SudachiError::NoOOVPluginProvided`]), or if a user dictionary cannot
    /// be merged.
    pub fn from_cfg_storage(
        cfg: &Config,
        storage: SudachiDicData,
    ) -> SudachiResult<JapaneseDictionary> {
        let basic_dict = LoadedDictionary::from_system_dictionary(
            // SAFETY: NOTE(review) — the returned slice is treated as `'static`
            // although it points into `storage`; `storage` is moved into the
            // resulting dictionary next to the data that borrows from it.
            // Confirm the exact contract of `system_static_slice`.
            unsafe { storage.system_static_slice() },
            cfg.complete_path(&cfg.character_definition_file)?.as_path(),
        )?;
        Self::from_loaded_parts(cfg, storage, basic_dict.grammar, basic_dict.lexicon_set)
    }

    /// Creates a dictionary from already loaded `storage` using
    /// `LoadedDictionary::from_system_dictionary_embedded`, so no character
    /// definition path is read from `cfg`.
    ///
    /// # Errors
    /// Returns an error if the system dictionary cannot be loaded, if loading
    /// plugins fails, if no OOV plugin is configured
    /// ([`SudachiError::NoOOVPluginProvided`]), or if a user dictionary cannot
    /// be merged.
    pub fn from_cfg_storage_with_embedded_chardef(
        cfg: &Config,
        storage: SudachiDicData,
    ) -> SudachiResult<JapaneseDictionary> {
        let basic_dict = LoadedDictionary::from_system_dictionary_embedded(
            // SAFETY: NOTE(review) — same assumption as in `from_cfg_storage`:
            // the slice points into `storage`, which is kept alive inside the
            // resulting dictionary. Confirm the contract of
            // `system_static_slice`.
            unsafe { storage.system_static_slice() },
        )?;
        Self::from_loaded_parts(cfg, storage, basic_dict.grammar, basic_dict.lexicon_set)
    }

    /// Shared tail of the `from_cfg_storage*` constructors: loads plugins,
    /// applies connection cost edits and merges all user dictionaries.
    fn from_loaded_parts(
        cfg: &Config,
        storage: SudachiDicData,
        mut grammar: Grammar<'static>,
        lexicon: LexiconSet<'static>,
    ) -> SudachiResult<JapaneseDictionary> {
        let plugins = Plugins::load(cfg, &mut grammar)?;

        if plugins.oov.is_empty() {
            return Err(SudachiError::NoOOVPluginProvided);
        }

        for p in plugins.connect_cost.plugins() {
            p.edit(&mut grammar);
        }

        let mut dic = JapaneseDictionary {
            storage,
            plugins,
            _grammar: grammar,
            _lexicon: lexicon,
        };

        // Collect the slices up front: `dic` is moved into and out of
        // `merge_user_dictionary` on every iteration.
        let user_dicts: Vec<_> = dic.storage.user_static_slice();
        for udic in user_dicts {
            dic = dic.merge_user_dictionary(udic)?;
        }

        Ok(dic)
    }

    /// Returns the grammar with a lifetime bound to `self`.
    pub fn grammar(&self) -> &Grammar<'_> {
        &self._grammar
    }

    /// Returns the lexicon set with a lifetime bound to `self`.
    pub fn lexicon(&self) -> &LexiconSet<'_> {
        &self._lexicon
    }

    /// Merges one user dictionary, given as raw bytes, into this dictionary.
    ///
    /// The user lexicon has its costs updated against the current dictionary
    /// and is then appended to the lexicon set; a grammar contained in the
    /// user dictionary is merged into the current grammar.
    ///
    /// # Errors
    /// Returns an error if the bytes cannot be read as a user dictionary, if
    /// updating costs fails, or if the lexicon cannot be appended.
    fn merge_user_dictionary(mut self, dictionary_bytes: &'static [u8]) -> SudachiResult<Self> {
        let user_dict = DictionaryLoader::read_user_dictionary(dictionary_bytes)?;

        let mut user_lexicon = user_dict.lexicon;
        user_lexicon.update_cost(&self)?;

        // The POS list length is taken before the user grammar is merged below.
        // NOTE(review): presumably used as the offset for user-defined POS ids —
        // confirm against `LexiconSet::append`.
        self._lexicon
            .append(user_lexicon, self._grammar.pos_list.len())?;

        if let Some(g) = user_dict.grammar {
            self._grammar.merge(g);
        }

        Ok(self)
    }
}
/// Exposes the grammar, lexicon and loaded plugins of the dictionary.
impl DictionaryAccess for JapaneseDictionary {
    /// Returns the grammar with a lifetime bound to `self`.
    fn grammar(&self) -> &Grammar<'_> {
        &self._grammar
    }

    /// Returns the lexicon set with a lifetime bound to `self`.
    fn lexicon(&self) -> &LexiconSet<'_> {
        &self._lexicon
    }

    /// Returns the loaded input text plugins.
    fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>] {
        let loaded = &self.plugins.input_text;
        loaded.plugins()
    }

    /// Returns the loaded OOV provider plugins.
    fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>] {
        let loaded = &self.plugins.oov;
        loaded.plugins()
    }

    /// Returns the loaded path rewrite plugins.
    fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>] {
        let loaded = &self.plugins.path_rewrite;
        loaded.plugins()
    }
}