use std::path::Path;
use crate::analysis::stateless_tokenizer::DictionaryAccess;
use character_category::CharacterCategory;
use grammar::Grammar;
use header::Header;
use lexicon::Lexicon;
use lexicon_set::LexiconSet;
use crate::plugin::input_text::InputTextPlugin;
use crate::plugin::oov::OovProviderPlugin;
use crate::plugin::path_rewrite::PathRewritePlugin;
use crate::prelude::*;
pub mod build;
pub mod category_type;
pub mod character_category;
pub mod connect;
pub mod dictionary;
pub mod grammar;
pub mod header;
pub mod lexicon;
pub mod lexicon_set;
pub mod read;
pub mod storage;
pub mod subset;
pub mod word_id;
/// Contents of the bundled `char.def` resource file, embedded into the binary at compile time.
///
/// Parsed by `LoadedDictionary::from_system_dictionary_embedded` to obtain the character
/// category definition when the caller does not supply a file of their own.
const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../resources/char.def");
// Not referenced anywhere in the visible part of this file.
// NOTE(review): presumably the number of components in a part-of-speech entry — confirm
// against the code that actually uses it.
const POS_DEPTH: usize = 6;
/// A dictionary that is ready to be used for analysis: a grammar together with a lexicon set.
///
/// Instances are produced by the `from_system_dictionary*` constructors or by
/// `DictionaryLoader::to_loaded`. The lifetime `'a` is the lifetime of the dictionary byte
/// slice the instance was parsed from.
pub struct LoadedDictionary<'a> {
/// Grammar taken from the dictionary this instance was created from.
pub grammar: Grammar<'a>,
/// Lexicon set created from that dictionary's lexicon and the number of entries in
/// `grammar.pos_list` at construction time.
pub lexicon_set: LexiconSet<'a>,
}
impl<'a> LoadedDictionary<'a> {
pub fn from_system_dictionary_and_chardef(
dictionary_bytes: &'a [u8],
character_category: CharacterCategory,
) -> SudachiResult<LoadedDictionary<'a>> {
let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?;
let mut grammar = system_dict
.grammar
.ok_or(SudachiError::InvalidDictionaryGrammar)?;
grammar.set_character_category(character_category);
let num_system_pos = grammar.pos_list.len();
Ok(LoadedDictionary {
grammar,
lexicon_set: LexiconSet::new(system_dict.lexicon, num_system_pos),
})
}
pub fn from_system_dictionary(
dictionary_bytes: &'a [u8],
character_category_file: &Path,
) -> SudachiResult<LoadedDictionary<'a>> {
let character_category = CharacterCategory::from_file(character_category_file)?;
Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
}
pub fn from_system_dictionary_embedded(
dictionary_bytes: &'a [u8],
) -> SudachiResult<LoadedDictionary<'a>> {
let character_category = CharacterCategory::from_bytes(DEFAULT_CHAR_DEF_BYTES)?;
Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
}
#[cfg(test)]
pub(crate) fn merge_dictionary(
mut self,
other: DictionaryLoader<'a>,
) -> SudachiResult<LoadedDictionary> {
let npos = self.grammar.pos_list.len();
let lexicon = other.lexicon;
let grammar = other.grammar;
self.lexicon_set.append(lexicon, npos)?;
if let Some(g) = grammar {
self.grammar.merge(g)
}
Ok(self)
}
}
/// A bare [`LoadedDictionary`] exposes its grammar and lexicon set and carries no plugins:
/// every plugin accessor returns an empty slice.
impl<'a> DictionaryAccess for LoadedDictionary<'a> {
    /// Returns the grammar stored in this dictionary.
    fn grammar(&self) -> &Grammar<'a> {
        &self.grammar
    }

    /// Returns the lexicon set stored in this dictionary.
    fn lexicon(&self) -> &LexiconSet<'a> {
        &self.lexicon_set
    }

    /// Always empty: this type holds no input text plugins.
    fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>] {
        &[]
    }

    /// Always empty: this type holds no OOV provider plugins.
    fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>] {
        &[]
    }

    /// Always empty: this type holds no path rewrite plugins.
    fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>] {
        &[]
    }
}
/// The parts of a binary dictionary as parsed from a byte slice, before they are assembled
/// into a `LoadedDictionary`.
///
/// The lifetime `'a` is the lifetime of the byte slice the parts were parsed from.
pub struct DictionaryLoader<'a> {
/// Parsed dictionary header; its `version` tells system and user dictionaries apart.
pub header: Header,
/// Grammar section; `None` when the header reports that the dictionary has no grammar.
pub grammar: Option<Grammar<'a>>,
/// Lexicon section, parsed from the bytes following the header (and the grammar, if any).
pub lexicon: Lexicon<'a>,
}
impl<'a> DictionaryLoader<'a> {
    /// Parses a binary dictionary (header, optional grammar, lexicon) from `dictionary_bytes`
    /// without checking which kind of dictionary the header declares.
    ///
    /// The grammar is parsed only when the header reports one. The lexicon is parsed from the
    /// position right after the header and, when present, the grammar.
    ///
    /// # Safety
    /// This function does not validate `header.version`. Prefer
    /// [`Self::read_system_dictionary`] or [`Self::read_user_dictionary`], which perform
    /// that check.
    /// NOTE(review): any further requirements of the underlying parsers are not visible from
    /// this file — confirm before calling this directly on untrusted data.
    ///
    /// # Errors
    /// Propagates the errors of `Header::parse`, `Grammar::parse` and `Lexicon::parse`.
    pub unsafe fn read_any_dictionary(dictionary_bytes: &[u8]) -> SudachiResult<DictionaryLoader> {
        // Indexing with `[..Header::STORAGE_SIZE]` panics when the input is shorter than a
        // header (e.g. an empty or truncated file). Hand the parser whatever is available
        // instead, so truncated data is reported through the parser's own error path.
        // NOTE(review): assumes `Header::parse` rejects slices shorter than `STORAGE_SIZE`.
        let header_bytes = dictionary_bytes
            .get(..Header::STORAGE_SIZE)
            .unwrap_or(dictionary_bytes);
        let header = Header::parse(header_bytes)?;

        // Sections are laid out back to back; `offset` tracks where the next one starts.
        let mut offset = Header::STORAGE_SIZE;
        let grammar = if header.has_grammar() {
            let parsed = Grammar::parse(dictionary_bytes, offset)?;
            offset += parsed.storage_size;
            Some(parsed)
        } else {
            None
        };

        let lexicon = Lexicon::parse(dictionary_bytes, offset, header.has_synonym_group_ids())?;
        Ok(DictionaryLoader {
            header,
            grammar,
            lexicon,
        })
    }

    /// Parses `dictionary_bytes` and accepts the result only if its header declares a
    /// system dictionary.
    ///
    /// # Errors
    /// Propagates parse errors and returns
    /// `SudachiError::InvalidHeader(HeaderError::InvalidSystemDictVersion)` when the header
    /// declares any other kind of dictionary.
    pub fn read_system_dictionary(dictionary_bytes: &[u8]) -> SudachiResult<DictionaryLoader> {
        // SAFETY: the header kind that `read_any_dictionary` leaves unchecked is validated
        // immediately below.
        let dict = unsafe { Self::read_any_dictionary(dictionary_bytes) }?;
        if matches!(dict.header.version, header::HeaderVersion::SystemDict(_)) {
            Ok(dict)
        } else {
            Err(SudachiError::InvalidHeader(
                header::HeaderError::InvalidSystemDictVersion,
            ))
        }
    }

    /// Parses `dictionary_bytes` and accepts the result only if its header declares a
    /// user dictionary.
    ///
    /// # Errors
    /// Propagates parse errors and returns
    /// `SudachiError::InvalidHeader(HeaderError::InvalidSystemDictVersion)` when the header
    /// declares any other kind of dictionary.
    pub fn read_user_dictionary(dictionary_bytes: &[u8]) -> SudachiResult<DictionaryLoader> {
        // SAFETY: the header kind that `read_any_dictionary` leaves unchecked is validated
        // immediately below.
        let dict = unsafe { Self::read_any_dictionary(dictionary_bytes) }?;
        if matches!(dict.header.version, header::HeaderVersion::UserDict(_)) {
            Ok(dict)
        } else {
            // NOTE(review): this reports a *system* dictionary version error for a rejected
            // user dictionary, which looks like a copy-paste slip. Kept as is because the
            // available `HeaderError` variants are not visible from this file — switch to a
            // user-dictionary specific variant if one exists.
            Err(SudachiError::InvalidHeader(
                header::HeaderError::InvalidSystemDictVersion,
            ))
        }
    }

    /// Converts the parsed parts into a [`LoadedDictionary`].
    ///
    /// The lexicon's dictionary id is set to `0` before it is wrapped into a lexicon set.
    /// Returns `None` when the dictionary has no grammar.
    pub fn to_loaded(self) -> Option<LoadedDictionary<'a>> {
        let mut lexicon = self.lexicon;
        lexicon.set_dic_id(0);
        self.grammar.map(|grammar| {
            // Read the POS count before `grammar` is moved into the result.
            let num_system_pos = grammar.pos_list.len();
            LoadedDictionary {
                grammar,
                lexicon_set: LexiconSet::new(lexicon, num_system_pos),
            }
        })
    }
}