sudachi/dic/build/
index.rs

1/*
2 *  Copyright (c) 2021 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use crate::dic::build::error::{BuildFailure, DicBuildError};
18use crate::dic::build::primitives::write_u32_array;
19use crate::dic::word_id::WordId;
20use crate::error::{SudachiError, SudachiResult};
21use crate::util::fxhash::FxBuildHasher;
22use indexmap::map::IndexMap;
23
24pub struct IndexEntry {
25    ids: Vec<WordId>,
26    offset: usize,
27}
28
29impl Default for IndexEntry {
30    fn default() -> Self {
31        Self {
32            ids: Vec::new(),
33            offset: usize::MAX,
34        }
35    }
36}
37
38pub struct IndexBuilder<'a> {
39    // Insertion order matters for the built dictionary,
40    // so using IndexMap here instead of a simple HashMap
41    data: IndexMap<&'a str, IndexEntry, FxBuildHasher>,
42}
43
44impl<'a> IndexBuilder<'a> {
45    pub fn new() -> Self {
46        Self {
47            data: IndexMap::default(),
48        }
49    }
50
51    pub fn add(&mut self, key: &'a str, id: WordId) {
52        self.data.entry(key).or_default().ids.push(id)
53    }
54
55    pub fn build_word_id_table(&mut self) -> SudachiResult<Vec<u8>> {
56        // by default assume that there will be 3 entries on average
57        let mut result = Vec::with_capacity(self.data.len() * 13);
58        for (k, entry) in self.data.iter_mut() {
59            entry.offset = result.len();
60            // clear stored ids memory after use
61            let ids = std::mem::take(&mut entry.ids);
62            write_u32_array(&mut result, &ids).map_err(|e| {
63                SudachiError::DictionaryCompilationError(DicBuildError {
64                    cause: e,
65                    line: 0,
66                    file: format!("<word id table for `{}` has too much entries>", k),
67                })
68            })?;
69        }
70        Ok(result)
71    }
72
73    pub fn build_trie(&mut self) -> SudachiResult<Vec<u8>> {
74        let mut trie_entries: Vec<(&str, u32)> = Vec::new();
75        for (k, v) in self.data.drain(..) {
76            if v.offset > u32::MAX as _ {
77                return Err(DicBuildError {
78                    file: format!("entry {}", k),
79                    line: 0,
80                    cause: BuildFailure::WordIdTableNotBuilt,
81                }
82                .into());
83            }
84            trie_entries.push((k, v.offset as u32));
85        }
86        self.data.shrink_to_fit();
87        trie_entries.sort_by(|(a, _), (b, _)| a.cmp(b));
88
89        let trie = yada::builder::DoubleArrayBuilder::build(&trie_entries);
90        match trie {
91            Some(t) => Ok(t),
92            None => Err(DicBuildError {
93                file: "<trie>".to_owned(),
94                line: 0,
95                cause: BuildFailure::TrieBuildFailure,
96            }
97            .into()),
98        }
99    }
100}
101
102#[cfg(test)]
103mod test {
104    use super::*;
105    use crate::dic::lexicon::trie::{Trie, TrieEntry};
106    use std::convert::TryInto;
107
108    fn make_trie(data: Vec<u8>) -> Trie<'static> {
109        let mut elems: Vec<u32> = Vec::with_capacity(data.len() / 4);
110        for i in (0..data.len()).step_by(4) {
111            let arr: [u8; 4] = data[i..i + 4].try_into().unwrap();
112            elems.push(u32::from_le_bytes(arr))
113        }
114        Trie::new_owned(elems)
115    }
116
117    #[test]
118    fn build_index_1() {
119        let mut bldr = IndexBuilder::new();
120        bldr.add("test", WordId::new(0, 0));
121        let _ = bldr.build_word_id_table().unwrap();
122
123        let trie = make_trie(bldr.build_trie().unwrap());
124        let mut iter = trie.common_prefix_iterator(b"test", 0);
125        assert_eq!(iter.next(), Some(TrieEntry { value: 0, end: 4 }));
126        assert_eq!(iter.next(), None);
127    }
128
129    #[test]
130    fn build_index_2() {
131        let mut bldr = IndexBuilder::new();
132        bldr.add("test", WordId::new(0, 0));
133        bldr.add("tes", WordId::new(0, 1));
134        let _ = bldr.build_word_id_table().unwrap();
135
136        let trie = make_trie(bldr.build_trie().unwrap());
137        let mut iter = trie.common_prefix_iterator(b"test", 0);
138        assert_eq!(iter.next(), Some(TrieEntry { value: 5, end: 3 }));
139        assert_eq!(iter.next(), Some(TrieEntry { value: 0, end: 4 }));
140        assert_eq!(iter.next(), None);
141    }
142}