1use std::path::Path;
18
19use crate::analysis::stateless_tokenizer::DictionaryAccess;
20use character_category::CharacterCategory;
21use grammar::Grammar;
22use header::Header;
23use lexicon::Lexicon;
24use lexicon_set::LexiconSet;
25
26use crate::plugin::input_text::InputTextPlugin;
27use crate::plugin::oov::OovProviderPlugin;
28use crate::plugin::path_rewrite::PathRewritePlugin;
29use crate::prelude::*;
30
31pub mod build;
32pub mod category_type;
33pub mod character_category;
34pub mod connect;
35pub mod dictionary;
36pub mod grammar;
37pub mod header;
38pub mod lexicon;
39pub mod lexicon_set;
40pub mod read;
41pub mod storage;
42pub mod subset;
43pub mod word_id;
44
45const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../resources/char.def");
46const POS_DEPTH: usize = 6;
47
48pub struct LoadedDictionary<'a> {
50 pub grammar: Grammar<'a>,
51 pub lexicon_set: LexiconSet<'a>,
52}
53
54impl<'a> LoadedDictionary<'a> {
55 pub fn from_system_dictionary_and_chardef(
57 dictionary_bytes: &'a [u8],
58 character_category: CharacterCategory,
59 ) -> SudachiResult<LoadedDictionary<'a>> {
60 let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?;
61
62 let mut grammar = system_dict
63 .grammar
64 .ok_or(SudachiError::InvalidDictionaryGrammar)?;
65 grammar.set_character_category(character_category);
66
67 let num_system_pos = grammar.pos_list.len();
68 Ok(LoadedDictionary {
69 grammar,
70 lexicon_set: LexiconSet::new(system_dict.lexicon, num_system_pos),
71 })
72 }
73
74 pub fn from_system_dictionary(
76 dictionary_bytes: &'a [u8],
77 character_category_file: &Path,
78 ) -> SudachiResult<LoadedDictionary<'a>> {
79 let character_category = CharacterCategory::from_file(character_category_file)?;
80 Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
81 }
82
83 pub fn from_system_dictionary_embedded(
85 dictionary_bytes: &'a [u8],
86 ) -> SudachiResult<LoadedDictionary<'a>> {
87 let character_category = CharacterCategory::from_bytes(DEFAULT_CHAR_DEF_BYTES)?;
88 Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
89 }
90
91 #[cfg(test)]
92 pub(crate) fn merge_dictionary(
93 mut self,
94 other: DictionaryLoader<'a>,
95 ) -> SudachiResult<LoadedDictionary> {
96 let npos = self.grammar.pos_list.len();
97 let lexicon = other.lexicon;
98 let grammar = other.grammar;
99 self.lexicon_set.append(lexicon, npos)?;
100 if let Some(g) = grammar {
101 self.grammar.merge(g)
102 }
103 Ok(self)
104 }
105}
106
107impl<'a> DictionaryAccess for LoadedDictionary<'a> {
108 fn grammar(&self) -> &Grammar<'a> {
109 &self.grammar
110 }
111
112 fn lexicon(&self) -> &LexiconSet<'a> {
113 &self.lexicon_set
114 }
115
116 fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>] {
117 &[]
118 }
119
120 fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>] {
121 &[]
122 }
123
124 fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>] {
125 &[]
126 }
127}
128
129pub struct DictionaryLoader<'a> {
131 pub header: Header,
132 pub grammar: Option<Grammar<'a>>,
133 pub lexicon: Lexicon<'a>,
134}
135
136impl<'a> DictionaryLoader<'a> {
137 pub unsafe fn read_any_dictionary(dictionary_bytes: &[u8]) -> SudachiResult<DictionaryLoader> {
142 let header = Header::parse(&dictionary_bytes[..Header::STORAGE_SIZE])?;
143 let mut offset = Header::STORAGE_SIZE;
144
145 let grammar = if header.has_grammar() {
146 let tmp = Grammar::parse(dictionary_bytes, offset)?;
147 offset += tmp.storage_size;
148 Some(tmp)
149 } else {
150 None
151 };
152
153 let lexicon = Lexicon::parse(dictionary_bytes, offset, header.has_synonym_group_ids())?;
154
155 Ok(DictionaryLoader {
156 header,
157 grammar,
158 lexicon,
159 })
160 }
161
162 pub fn read_system_dictionary(dictionary_bytes: &[u8]) -> SudachiResult<DictionaryLoader> {
166 let dict = unsafe { Self::read_any_dictionary(dictionary_bytes) }?;
167 match dict.header.version {
168 header::HeaderVersion::SystemDict(_) => Ok(dict),
169 _ => Err(SudachiError::InvalidHeader(
170 header::HeaderError::InvalidSystemDictVersion,
171 )),
172 }
173 }
174
175 pub fn read_user_dictionary(dictionary_bytes: &[u8]) -> SudachiResult<DictionaryLoader> {
179 let dict = unsafe { Self::read_any_dictionary(dictionary_bytes) }?;
180 match dict.header.version {
181 header::HeaderVersion::UserDict(_) => Ok(dict),
182 _ => Err(SudachiError::InvalidHeader(
183 header::HeaderError::InvalidSystemDictVersion,
184 )),
185 }
186 }
187
188 pub fn to_loaded(self) -> Option<LoadedDictionary<'a>> {
189 let mut lexicon = self.lexicon;
190 lexicon.set_dic_id(0);
191 match self.grammar {
192 None => None,
193 Some(grammar) => {
194 let num_system_pos = grammar.pos_list.len();
195 Some(LoadedDictionary {
196 grammar,
197 lexicon_set: LexiconSet::new(lexicon, num_system_pos),
198 })
199 }
200 }
201 }
202}