1use std::io::Write;
18use std::path::Path;
19
20use crate::analysis::stateless_tokenizer::DictionaryAccess;
21use crate::dic::build::error::{BuildFailure, DicBuildError, DicCompilationCtx};
22use crate::dic::build::index::IndexBuilder;
23use crate::dic::build::lexicon::LexiconWriter;
24use crate::dic::build::report::{DictPartReport, ReportBuilder, Reporter};
25use crate::dic::build::resolve::{BinDictResolver, ChainedResolver, RawDictResolver};
26use crate::dic::grammar::Grammar;
27use crate::dic::header::{Header, HeaderVersion, SystemDictVersion, UserDictVersion};
28use crate::dic::lexicon_set::LexiconSet;
29use crate::dic::word_id::WordId;
30use crate::error::SudachiResult;
31use crate::plugin::input_text::InputTextPlugin;
32use crate::plugin::oov::OovProviderPlugin;
33use crate::plugin::path_rewrite::PathRewritePlugin;
34
35pub(crate) mod conn;
36pub mod error;
37pub(crate) mod index;
38pub(crate) mod lexicon;
39pub(crate) mod parse;
40pub(crate) mod primitives;
41pub mod report;
42mod resolve;
43#[cfg(test)]
44mod test;
45
/// Limit named for part-of-speech ids: the largest `i16` value (32767).
/// NOTE(review): the place where this limit is enforced is outside this file view.
const MAX_POS_IDS: usize = i16::MAX as usize;
/// Limit named for dictionary string lengths; defined to be equal to [`MAX_POS_IDS`].
const MAX_DIC_STRING_LEN: usize = MAX_POS_IDS;
/// Limit named for array lengths: the largest `i8` value (127).
const MAX_ARRAY_LEN: usize = i8::MAX as usize;
49
/// Input for the dictionary builder: either a file on disk or bytes already in memory.
pub enum DataSource<'a> {
    /// Data is read from the file at the given path.
    File(&'a Path),
    /// Data is the given in-memory byte slice.
    Data(&'a [u8]),
}

/// Values that can be turned into a [`DataSource`] and can describe themselves.
pub trait AsDataSource<'a> {
    /// Consumes the value and produces the matching [`DataSource`] variant.
    fn convert(self) -> DataSource<'a>;
    /// Human-readable name of the source: the path for files,
    /// `memory (N bytes)` for in-memory data.
    fn name(&self) -> String;
}

/// Name of a file-backed source.
///
/// Uses a lossy conversion so that paths which are not valid UTF-8 still get a
/// recognizable name (invalid sequences become U+FFFD) instead of an empty string.
fn file_source_name(path: &Path) -> String {
    path.to_string_lossy().into_owned()
}

/// Name of an in-memory source holding `len` bytes.
fn memory_source_name(len: usize) -> String {
    format!("memory ({} bytes)", len)
}

impl<'a> AsDataSource<'a> for DataSource<'a> {
    fn convert(self) -> DataSource<'a> {
        self
    }

    fn name(&self) -> String {
        match self {
            DataSource::File(path) => file_source_name(path),
            DataSource::Data(bytes) => memory_source_name(bytes.len()),
        }
    }
}

impl<'a> AsDataSource<'a> for &'a Path {
    fn convert(self) -> DataSource<'a> {
        DataSource::File(self)
    }

    fn name(&self) -> String {
        file_source_name(self)
    }
}

impl<'a> AsDataSource<'a> for &'a [u8] {
    fn convert(self) -> DataSource<'a> {
        DataSource::Data(self)
    }

    fn name(&self) -> String {
        memory_source_name(self.len())
    }
}

impl<'a, const N: usize> AsDataSource<'a> for &'a [u8; N] {
    fn convert(self) -> DataSource<'a> {
        DataSource::Data(&self[..])
    }

    fn name(&self) -> String {
        memory_source_name(N)
    }
}
99
/// Marker type with no variants, so no value of it can ever exist.
///
/// It fills the dictionary type parameter of `DictBuilder` when there is no
/// prebuilt dictionary (see `DictBuilder::<NoDic>::new_system`).
pub enum NoDic {}
101
102impl DictionaryAccess for NoDic {
103 fn grammar(&self) -> &Grammar<'_> {
104 panic!("there is no grammar here")
105 }
106
107 fn lexicon(&self) -> &LexiconSet<'_> {
108 panic!("there is no lexicon here")
109 }
110
111 fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>] {
112 &[]
113 }
114
115 fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>] {
116 &[]
117 }
118
119 fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>] {
120 &[]
121 }
122}
123
124pub struct DictBuilder<D> {
126 user: bool,
127 lexicon: lexicon::LexiconReader,
128 conn: conn::ConnBuffer,
129 ctx: DicCompilationCtx,
130 header: Header,
131 resolved: bool,
132 prebuilt: Option<D>,
133 reporter: Reporter,
134}
135
136impl DictBuilder<NoDic> {
137 pub fn new_system() -> Self {
139 Self::new_empty()
140 }
141}
142
143impl<D: DictionaryAccess> DictBuilder<D> {
144 fn new_empty() -> Self {
145 Self {
146 user: false,
147 lexicon: lexicon::LexiconReader::new(),
148 conn: conn::ConnBuffer::new(),
149 ctx: DicCompilationCtx::default(),
150 header: Header::new(),
151 resolved: false,
152 prebuilt: None,
153 reporter: Reporter::new(),
154 }
155 }
156
157 pub fn new_user(system: D) -> Self {
159 let mut bldr = Self::new_empty();
160 bldr.set_user(true);
161 bldr.lexicon.preload_pos(system.grammar());
162 let cm = system.grammar().conn_matrix();
163 bldr.lexicon
164 .set_max_conn_sizes(cm.num_left() as _, cm.num_right() as _);
165 bldr.lexicon
166 .set_num_system_words(system.lexicon().size() as usize);
167 bldr.prebuilt = Some(system);
168 bldr
169 }
170
171 pub fn set_compile_time<T: Into<std::time::SystemTime>>(
174 &mut self,
175 time: T,
176 ) -> std::time::SystemTime {
177 self.header.set_time(time.into())
178 }
179
180 pub fn set_description<T: Into<String>>(&mut self, description: T) {
182 self.header.description = description.into()
183 }
184
185 pub fn read_lexicon<'a, T: AsDataSource<'a> + 'a>(&mut self, data: T) -> SudachiResult<usize> {
187 let report = ReportBuilder::new(data.name()).read();
188 let result = match data.convert() {
189 DataSource::File(p) => self.lexicon.read_file(p),
190 DataSource::Data(d) => self.lexicon.read_bytes(d),
191 };
192 self.reporter.collect_r(result, report)
193 }
194
195 pub fn read_conn<'a, T: AsDataSource<'a> + 'a>(&mut self, data: T) -> SudachiResult<()> {
197 let report = ReportBuilder::new(data.name()).read();
198 match data.convert() {
199 DataSource::File(p) => self.conn.read_file(p),
200 DataSource::Data(d) => self.conn.read(d),
201 }?;
202 self.lexicon
203 .set_max_conn_sizes(self.conn.left(), self.conn.right());
204 self.reporter.collect(
205 self.conn.left() as usize * self.conn.right() as usize,
206 report,
207 );
208 Ok(())
209 }
210
211 pub fn compile<W: Write>(&mut self, w: &mut W) -> SudachiResult<()> {
213 self.check_if_resolved()?;
214 let report = ReportBuilder::new("validate").read();
215 self.lexicon.validate_entries()?;
216 self.reporter.collect(self.lexicon.entries().len(), report);
217 let mut written = self.header.write_to(w)?;
218 written += self.write_grammar(w)?;
219 self.write_lexicon(w, written)?;
220 Ok(())
221 }
222
223 pub fn resolve(&mut self) -> SudachiResult<usize> {
227 self.resolve_impl()
228 }
229
230 pub fn report(&self) -> &[DictPartReport] {
232 self.reporter.reports()
233 }
234}
235
236impl<D: DictionaryAccess> DictBuilder<D> {
238 fn set_user(&mut self, user: bool) {
239 if user {
240 self.header.version = HeaderVersion::UserDict(UserDictVersion::Version3)
241 } else {
242 self.header.version = HeaderVersion::SystemDict(SystemDictVersion::Version2)
243 }
244 self.user = user;
245 }
246
247 fn write_grammar<W: Write>(&mut self, w: &mut W) -> SudachiResult<usize> {
248 let mut size = 0;
249 let r1 = ReportBuilder::new("pos_table");
250 size += self.lexicon.write_pos_table(w)?;
251 self.reporter.collect(size, r1);
252 let r2 = ReportBuilder::new("conn_matrix");
253 size += self.conn.write_to(w)?;
254 self.reporter.collect(size, r2);
255 Ok(size)
256 }
257
258 fn write_index<W: Write>(&mut self, w: &mut W) -> SudachiResult<usize> {
259 let mut size = 0;
260 let mut index = IndexBuilder::new();
261 for (i, e) in self.lexicon.entries().iter().enumerate() {
262 if e.should_index() {
263 let wid = WordId::checked(0, i as u32)?;
264 index.add(e.surface(), wid);
265 }
266 }
267
268 let report = ReportBuilder::new("trie");
269 let word_id_table = index.build_word_id_table()?;
270 let trie = index.build_trie()?;
271
272 let trie_size = trie.len() / 4;
273 w.write_all(&(trie_size as u32).to_le_bytes())?;
274 size += 4;
275 w.write_all(&trie)?;
276 size += trie.len();
277 std::mem::drop(trie); self.reporter.collect(size, report);
279 let cur_size = size;
280
281 let report = ReportBuilder::new("word_id table");
282 w.write_all(&(word_id_table.len() as u32).to_le_bytes())?;
283 size += 4;
284 w.write_all(&word_id_table)?;
285 size += word_id_table.len();
286 self.reporter.collect(size - cur_size, report);
287
288 Ok(size)
289 }
290
291 fn write_lexicon<W: Write>(&mut self, w: &mut W, offset: usize) -> SudachiResult<usize> {
292 let mut size = self.write_index(w)?;
293 let mut writer =
294 LexiconWriter::new(self.lexicon.entries(), offset + size, &mut self.reporter);
295 size += writer.write(w)?;
296 Ok(size)
297 }
298
299 fn check_if_resolved(&self) -> SudachiResult<()> {
300 if self.lexicon.needs_split_resolution() && !self.resolved {
301 return self.ctx.err(BuildFailure::UnresolvedSplits);
302 }
303
304 Ok(())
305 }
306
307 fn unsafe_make_resolver<'a>(&self) -> RawDictResolver<'a> {
309 let resolver = RawDictResolver::new(self.lexicon.entries(), self.user);
310 unsafe { std::mem::transmute(resolver) }
313 }
314
315 fn resolve_impl(&mut self) -> SudachiResult<usize> {
316 if !self.lexicon.needs_split_resolution() {
317 self.resolved = true;
318 return Ok(0);
319 }
320
321 let this_resolver = self.unsafe_make_resolver();
322 let report = ReportBuilder::new("resolve");
323
324 let cnt = match self.prebuilt.as_ref() {
325 Some(d) => {
326 let built_resolver = BinDictResolver::new(d)?;
327 let chained = ChainedResolver::new(this_resolver, built_resolver);
328 self.lexicon.resolve_splits(&chained)
329 }
330 None => self.lexicon.resolve_splits(&this_resolver),
331 };
332 let cnt = self.reporter.collect_r(cnt, report);
333 match cnt {
334 Ok(cnt) => {
335 self.resolved = true;
336 Ok(cnt)
337 }
338 Err((split_info, line)) => Err(DicBuildError {
339 file: "<entries>".to_owned(),
340 line,
341 cause: BuildFailure::InvalidSplitWordReference(split_info),
342 }
343 .into()),
344 }
345 }
346}