sudachi/dic/build/
index.rs1use crate::dic::build::error::{BuildFailure, DicBuildError};
18use crate::dic::build::primitives::write_u32_array;
19use crate::dic::word_id::WordId;
20use crate::error::{SudachiError, SudachiResult};
21use crate::util::fxhash::FxBuildHasher;
22use indexmap::map::IndexMap;
23
24pub struct IndexEntry {
25 ids: Vec<WordId>,
26 offset: usize,
27}
28
29impl Default for IndexEntry {
30 fn default() -> Self {
31 Self {
32 ids: Vec::new(),
33 offset: usize::MAX,
34 }
35 }
36}
37
38pub struct IndexBuilder<'a> {
39 data: IndexMap<&'a str, IndexEntry, FxBuildHasher>,
42}
43
44impl<'a> IndexBuilder<'a> {
45 pub fn new() -> Self {
46 Self {
47 data: IndexMap::default(),
48 }
49 }
50
51 pub fn add(&mut self, key: &'a str, id: WordId) {
52 self.data.entry(key).or_default().ids.push(id)
53 }
54
55 pub fn build_word_id_table(&mut self) -> SudachiResult<Vec<u8>> {
56 let mut result = Vec::with_capacity(self.data.len() * 13);
58 for (k, entry) in self.data.iter_mut() {
59 entry.offset = result.len();
60 let ids = std::mem::take(&mut entry.ids);
62 write_u32_array(&mut result, &ids).map_err(|e| {
63 SudachiError::DictionaryCompilationError(DicBuildError {
64 cause: e,
65 line: 0,
66 file: format!("<word id table for `{}` has too much entries>", k),
67 })
68 })?;
69 }
70 Ok(result)
71 }
72
73 pub fn build_trie(&mut self) -> SudachiResult<Vec<u8>> {
74 let mut trie_entries: Vec<(&str, u32)> = Vec::new();
75 for (k, v) in self.data.drain(..) {
76 if v.offset > u32::MAX as _ {
77 return Err(DicBuildError {
78 file: format!("entry {}", k),
79 line: 0,
80 cause: BuildFailure::WordIdTableNotBuilt,
81 }
82 .into());
83 }
84 trie_entries.push((k, v.offset as u32));
85 }
86 self.data.shrink_to_fit();
87 trie_entries.sort_by(|(a, _), (b, _)| a.cmp(b));
88
89 let trie = yada::builder::DoubleArrayBuilder::build(&trie_entries);
90 match trie {
91 Some(t) => Ok(t),
92 None => Err(DicBuildError {
93 file: "<trie>".to_owned(),
94 line: 0,
95 cause: BuildFailure::TrieBuildFailure,
96 }
97 .into()),
98 }
99 }
100}
101
102#[cfg(test)]
103mod test {
104 use super::*;
105 use crate::dic::lexicon::trie::{Trie, TrieEntry};
106 use std::convert::TryInto;
107
108 fn make_trie(data: Vec<u8>) -> Trie<'static> {
109 let mut elems: Vec<u32> = Vec::with_capacity(data.len() / 4);
110 for i in (0..data.len()).step_by(4) {
111 let arr: [u8; 4] = data[i..i + 4].try_into().unwrap();
112 elems.push(u32::from_le_bytes(arr))
113 }
114 Trie::new_owned(elems)
115 }
116
117 #[test]
118 fn build_index_1() {
119 let mut bldr = IndexBuilder::new();
120 bldr.add("test", WordId::new(0, 0));
121 let _ = bldr.build_word_id_table().unwrap();
122
123 let trie = make_trie(bldr.build_trie().unwrap());
124 let mut iter = trie.common_prefix_iterator(b"test", 0);
125 assert_eq!(iter.next(), Some(TrieEntry { value: 0, end: 4 }));
126 assert_eq!(iter.next(), None);
127 }
128
129 #[test]
130 fn build_index_2() {
131 let mut bldr = IndexBuilder::new();
132 bldr.add("test", WordId::new(0, 0));
133 bldr.add("tes", WordId::new(0, 1));
134 let _ = bldr.build_word_id_table().unwrap();
135
136 let trie = make_trie(bldr.build_trie().unwrap());
137 let mut iter = trie.common_prefix_iterator(b"test", 0);
138 assert_eq!(iter.next(), Some(TrieEntry { value: 5, end: 3 }));
139 assert_eq!(iter.next(), Some(TrieEntry { value: 0, end: 4 }));
140 assert_eq!(iter.next(), None);
141 }
142}