sudachi/dic/build/
mod.rs

1/*
2 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use std::io::Write;
18use std::path::Path;
19
20use crate::analysis::stateless_tokenizer::DictionaryAccess;
21use crate::dic::build::error::{BuildFailure, DicBuildError, DicCompilationCtx};
22use crate::dic::build::index::IndexBuilder;
23use crate::dic::build::lexicon::LexiconWriter;
24use crate::dic::build::report::{DictPartReport, ReportBuilder, Reporter};
25use crate::dic::build::resolve::{BinDictResolver, ChainedResolver, RawDictResolver};
26use crate::dic::grammar::Grammar;
27use crate::dic::header::{Header, HeaderVersion, SystemDictVersion, UserDictVersion};
28use crate::dic::lexicon_set::LexiconSet;
29use crate::dic::word_id::WordId;
30use crate::error::SudachiResult;
31use crate::plugin::input_text::InputTextPlugin;
32use crate::plugin::oov::OovProviderPlugin;
33use crate::plugin::path_rewrite::PathRewritePlugin;
34
35pub(crate) mod conn;
36pub mod error;
37pub(crate) mod index;
38pub(crate) mod lexicon;
39pub(crate) mod parse;
40pub(crate) mod primitives;
41pub mod report;
42mod resolve;
43#[cfg(test)]
44mod test;
45
/// Size limit equal to `i16::MAX` (32767); presumably the maximum number of
/// part-of-speech ids a dictionary may define (not referenced in this file).
const MAX_POS_IDS: usize = i16::MAX as usize;
/// Size limit equal to `i16::MAX` (32767), the same value as [`MAX_POS_IDS`];
/// presumably the maximum length of a string stored in the dictionary.
const MAX_DIC_STRING_LEN: usize = i16::MAX as usize;
/// Size limit equal to `i8::MAX` (127); presumably the maximum number of
/// elements of an array stored in the dictionary.
const MAX_ARRAY_LEN: usize = i8::MAX as usize;
49
/// Input of the dictionary builder: either a file on disk or a byte buffer
/// that is already in memory.
pub enum DataSource<'a> {
    /// The input is read from the file at this path
    File(&'a Path),
    /// The input is read from this in-memory buffer
    Data(&'a [u8]),
}

/// Types which can be used as an input of the dictionary builder.
pub trait AsDataSource<'a> {
    /// Converts the value into the matching [`DataSource`] variant
    fn convert(self) -> DataSource<'a>;
    /// Human-readable name of the source, used to label build reports
    fn name(&self) -> String;
}

/// Name of a file source.
///
/// The conversion is lossy: parts of the path which are not valid Unicode are
/// replaced with U+FFFD instead of discarding the whole name, so the source
/// stays identifiable. Valid Unicode paths are returned unchanged.
fn file_source_name(path: &Path) -> String {
    path.to_string_lossy().into_owned()
}

/// Name of an in-memory source holding `len` bytes.
fn memory_source_name(len: usize) -> String {
    format!("memory ({} bytes)", len)
}

impl<'a> AsDataSource<'a> for DataSource<'a> {
    fn convert(self) -> DataSource<'a> {
        self
    }

    fn name(&self) -> String {
        match self {
            DataSource::File(p) => file_source_name(p),
            DataSource::Data(d) => memory_source_name(d.len()),
        }
    }
}

impl<'a> AsDataSource<'a> for &'a Path {
    fn convert(self) -> DataSource<'a> {
        DataSource::File(self)
    }
    fn name(&self) -> String {
        file_source_name(self)
    }
}

impl<'a> AsDataSource<'a> for &'a [u8] {
    fn convert(self) -> DataSource<'a> {
        DataSource::Data(self)
    }
    fn name(&self) -> String {
        memory_source_name(self.len())
    }
}

impl<'a, const N: usize> AsDataSource<'a> for &'a [u8; N] {
    fn convert(self) -> DataSource<'a> {
        DataSource::Data(&self[..])
    }
    fn name(&self) -> String {
        memory_source_name(self.len())
    }
}
99
/// Dictionary type for builders which have no prebuilt dictionary
/// (it is the type parameter of the builder returned by `DictBuilder::new_system`).
///
/// The enum has no variants, so a value of this type can never be created.
pub enum NoDic {}
101
102impl DictionaryAccess for NoDic {
103    fn grammar(&self) -> &Grammar<'_> {
104        panic!("there is no grammar here")
105    }
106
107    fn lexicon(&self) -> &LexiconSet<'_> {
108        panic!("there is no lexicon here")
109    }
110
111    fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>] {
112        &[]
113    }
114
115    fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>] {
116        &[]
117    }
118
119    fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>] {
120        &[]
121    }
122}
123
/// Builds a binary dictionary from csv lexicon and connection matrix (optional)
///
/// `D` is the type of the prebuilt dictionary a user dictionary is built
/// against (see `new_user`); builders of system dictionaries use `NoDic`.
pub struct DictBuilder<D> {
    /// `true` when a user dictionary is being built; changed only by `set_user`
    user: bool,
    /// Reader holding the lexicon entries read from csv
    lexicon: lexicon::LexiconReader,
    /// Buffer holding the connection matrix
    conn: conn::ConnBuffer,
    /// Context used to create build errors (see `check_if_resolved`)
    ctx: DicCompilationCtx,
    /// Header of the dictionary, the first thing written by `compile`
    header: Header,
    /// Becomes `true` after `resolve` completes successfully;
    /// checked by `compile` when the lexicon needs split resolution
    resolved: bool,
    /// Dictionary passed to `new_user`, `None` for builders of system dictionaries
    prebuilt: Option<D>,
    /// Collector of the per-part build reports returned by `report`
    reporter: Reporter,
}
135
136impl DictBuilder<NoDic> {
137    /// Creates a new builder for system dictionary
138    pub fn new_system() -> Self {
139        Self::new_empty()
140    }
141}
142
143impl<D: DictionaryAccess> DictBuilder<D> {
144    fn new_empty() -> Self {
145        Self {
146            user: false,
147            lexicon: lexicon::LexiconReader::new(),
148            conn: conn::ConnBuffer::new(),
149            ctx: DicCompilationCtx::default(),
150            header: Header::new(),
151            resolved: false,
152            prebuilt: None,
153            reporter: Reporter::new(),
154        }
155    }
156
157    /// Creates a new builder for user dictionary
158    pub fn new_user(system: D) -> Self {
159        let mut bldr = Self::new_empty();
160        bldr.set_user(true);
161        bldr.lexicon.preload_pos(system.grammar());
162        let cm = system.grammar().conn_matrix();
163        bldr.lexicon
164            .set_max_conn_sizes(cm.num_left() as _, cm.num_right() as _);
165        bldr.lexicon
166            .set_num_system_words(system.lexicon().size() as usize);
167        bldr.prebuilt = Some(system);
168        bldr
169    }
170
171    /// Set the dictionary compile time to the specified time
172    /// instead of current time
173    pub fn set_compile_time<T: Into<std::time::SystemTime>>(
174        &mut self,
175        time: T,
176    ) -> std::time::SystemTime {
177        self.header.set_time(time.into())
178    }
179
180    /// Set the dictionary description
181    pub fn set_description<T: Into<String>>(&mut self, description: T) {
182        self.header.description = description.into()
183    }
184
185    /// Read the csv lexicon from either a file or an in-memory buffer
186    pub fn read_lexicon<'a, T: AsDataSource<'a> + 'a>(&mut self, data: T) -> SudachiResult<usize> {
187        let report = ReportBuilder::new(data.name()).read();
188        let result = match data.convert() {
189            DataSource::File(p) => self.lexicon.read_file(p),
190            DataSource::Data(d) => self.lexicon.read_bytes(d),
191        };
192        self.reporter.collect_r(result, report)
193    }
194
195    /// Read the connection matrix from either a file or an in-memory buffer
196    pub fn read_conn<'a, T: AsDataSource<'a> + 'a>(&mut self, data: T) -> SudachiResult<()> {
197        let report = ReportBuilder::new(data.name()).read();
198        match data.convert() {
199            DataSource::File(p) => self.conn.read_file(p),
200            DataSource::Data(d) => self.conn.read(d),
201        }?;
202        self.lexicon
203            .set_max_conn_sizes(self.conn.left(), self.conn.right());
204        self.reporter.collect(
205            self.conn.left() as usize * self.conn.right() as usize,
206            report,
207        );
208        Ok(())
209    }
210
211    /// Compile the binary dictionary and write it to the specified sink
212    pub fn compile<W: Write>(&mut self, w: &mut W) -> SudachiResult<()> {
213        self.check_if_resolved()?;
214        let report = ReportBuilder::new("validate").read();
215        self.lexicon.validate_entries()?;
216        self.reporter.collect(self.lexicon.entries().len(), report);
217        let mut written = self.header.write_to(w)?;
218        written += self.write_grammar(w)?;
219        self.write_lexicon(w, written)?;
220        Ok(())
221    }
222
223    /// Resolve the dictionary references.
224    ///
225    /// Returns the number of resolved entries
226    pub fn resolve(&mut self) -> SudachiResult<usize> {
227        self.resolve_impl()
228    }
229
230    /// Return dictionary build report
231    pub fn report(&self) -> &[DictPartReport] {
232        self.reporter.reports()
233    }
234}
235
236// private functions
237impl<D: DictionaryAccess> DictBuilder<D> {
238    fn set_user(&mut self, user: bool) {
239        if user {
240            self.header.version = HeaderVersion::UserDict(UserDictVersion::Version3)
241        } else {
242            self.header.version = HeaderVersion::SystemDict(SystemDictVersion::Version2)
243        }
244        self.user = user;
245    }
246
247    fn write_grammar<W: Write>(&mut self, w: &mut W) -> SudachiResult<usize> {
248        let mut size = 0;
249        let r1 = ReportBuilder::new("pos_table");
250        size += self.lexicon.write_pos_table(w)?;
251        self.reporter.collect(size, r1);
252        let r2 = ReportBuilder::new("conn_matrix");
253        size += self.conn.write_to(w)?;
254        self.reporter.collect(size, r2);
255        Ok(size)
256    }
257
258    fn write_index<W: Write>(&mut self, w: &mut W) -> SudachiResult<usize> {
259        let mut size = 0;
260        let mut index = IndexBuilder::new();
261        for (i, e) in self.lexicon.entries().iter().enumerate() {
262            if e.should_index() {
263                let wid = WordId::checked(0, i as u32)?;
264                index.add(e.surface(), wid);
265            }
266        }
267
268        let report = ReportBuilder::new("trie");
269        let word_id_table = index.build_word_id_table()?;
270        let trie = index.build_trie()?;
271
272        let trie_size = trie.len() / 4;
273        w.write_all(&(trie_size as u32).to_le_bytes())?;
274        size += 4;
275        w.write_all(&trie)?;
276        size += trie.len();
277        std::mem::drop(trie); //can be big, so drop explicitly
278        self.reporter.collect(size, report);
279        let cur_size = size;
280
281        let report = ReportBuilder::new("word_id table");
282        w.write_all(&(word_id_table.len() as u32).to_le_bytes())?;
283        size += 4;
284        w.write_all(&word_id_table)?;
285        size += word_id_table.len();
286        self.reporter.collect(size - cur_size, report);
287
288        Ok(size)
289    }
290
291    fn write_lexicon<W: Write>(&mut self, w: &mut W, offset: usize) -> SudachiResult<usize> {
292        let mut size = self.write_index(w)?;
293        let mut writer =
294            LexiconWriter::new(self.lexicon.entries(), offset + size, &mut self.reporter);
295        size += writer.write(w)?;
296        Ok(size)
297    }
298
299    fn check_if_resolved(&self) -> SudachiResult<()> {
300        if self.lexicon.needs_split_resolution() && !self.resolved {
301            return self.ctx.err(BuildFailure::UnresolvedSplits);
302        }
303
304        Ok(())
305    }
306
    /// this function must only be used in resolve_impl
    ///
    /// Creates a resolver over the current lexicon entries and detaches its
    /// lifetime from the borrow of `self`, so that `resolve_impl` can hand the
    /// resolver to `resolve_splits`, which needs mutable access to the lexicon.
    ///
    /// The function is not marked `unsafe`, but the returned value has an
    /// arbitrary caller-chosen lifetime `'a`: it must not outlive the entries
    /// of `self.lexicon` and must not be used after they were changed in a way
    /// which invalidates the borrowed data.
    fn unsafe_make_resolver<'a>(&self) -> RawDictResolver<'a> {
        let resolver = RawDictResolver::new(self.lexicon.entries(), self.user);
        // resolver borrows parts of entries, but it does not touch splits
        // resolve function only modifies splits
        // SAFETY: the transmute only changes the lifetime parameter of the resolver.
        // Soundness relies on the invariant stated above and on the resolver being
        // dropped before `resolve_impl` returns.
        // NOTE(review): this still creates a shared borrow which overlaps with the
        // mutable borrow taken by `resolve_splits` - confirm it is acceptable
        // (e.g. with `cargo miri test`).
        unsafe { std::mem::transmute(resolver) }
    }
314
315    fn resolve_impl(&mut self) -> SudachiResult<usize> {
316        if !self.lexicon.needs_split_resolution() {
317            self.resolved = true;
318            return Ok(0);
319        }
320
321        let this_resolver = self.unsafe_make_resolver();
322        let report = ReportBuilder::new("resolve");
323
324        let cnt = match self.prebuilt.as_ref() {
325            Some(d) => {
326                let built_resolver = BinDictResolver::new(d)?;
327                let chained = ChainedResolver::new(this_resolver, built_resolver);
328                self.lexicon.resolve_splits(&chained)
329            }
330            None => self.lexicon.resolve_splits(&this_resolver),
331        };
332        let cnt = self.reporter.collect_r(cnt, report);
333        match cnt {
334            Ok(cnt) => {
335                self.resolved = true;
336                Ok(cnt)
337            }
338            Err((split_info, line)) => Err(DicBuildError {
339                file: "<entries>".to_owned(),
340                line,
341                cause: BuildFailure::InvalidSplitWordReference(split_info),
342            }
343            .into()),
344        }
345    }
346}