// sudachi/dic/lexicon/mod.rs

use std::cmp;
use std::mem::size_of;

use crate::analysis::stateful_tokenizer::StatefulTokenizer;
use crate::analysis::stateless_tokenizer::DictionaryAccess;
use crate::dic::subset::InfoSubset;
use crate::dic::word_id::WordId;
use nom::{bytes::complete::take, number::complete::le_u32};

use crate::error::SudachiNomResult;
use crate::prelude::*;

use self::trie::Trie;
use self::word_id_table::WordIdTable;
use self::word_infos::{WordInfo, WordInfos};
use self::word_params::WordParams;

pub mod trie;
pub mod word_id_table;
pub mod word_infos;
pub mod word_params;

39pub const MAX_DICTIONARIES: usize = 15;
43
44pub struct Lexicon<'a> {
48 trie: Trie<'a>,
49 word_id_table: WordIdTable<'a>,
50 word_params: WordParams<'a>,
51 word_infos: WordInfos<'a>,
52 lex_id: u8,
53}
54
55#[derive(Eq, PartialEq, Debug)]
57pub struct LexiconEntry {
58 pub word_id: WordId,
60 pub end: usize,
62}
63
64impl LexiconEntry {
65 pub fn new(word_id: WordId, end: usize) -> LexiconEntry {
66 LexiconEntry { word_id, end }
67 }
68}
69
70impl<'a> Lexicon<'a> {
71 const USER_DICT_COST_PER_MORPH: i32 = -20;
72
73 pub fn parse(
74 buf: &[u8],
75 original_offset: usize,
76 has_synonym_group_ids: bool,
77 ) -> SudachiResult<Lexicon> {
78 let mut offset = original_offset;
79
80 let (_rest, trie_size) = u32_parser_offset(buf, offset)?;
81 offset += 4;
82 let trie_array = trie_array_parser(buf, offset, trie_size)?;
83 let trie = Trie::new(trie_array, trie_size as usize);
84 offset += trie.total_size();
85
86 let (_rest, word_id_table_size) = u32_parser_offset(buf, offset)?;
87 let word_id_table = WordIdTable::new(buf, word_id_table_size, offset + 4);
88 offset += word_id_table.storage_size();
89
90 let (_rest, word_params_size) = u32_parser_offset(buf, offset)?;
91 let word_params = WordParams::new(buf, word_params_size, offset + 4);
92 offset += word_params.storage_size();
93
94 let word_infos = WordInfos::new(buf, offset, word_params.size(), has_synonym_group_ids);
95
96 Ok(Lexicon {
97 trie,
98 word_id_table,
99 word_params,
100 word_infos,
101 lex_id: u8::MAX,
102 })
103 }
104
105 pub fn set_dic_id(&mut self, id: u8) {
107 assert!(id < MAX_DICTIONARIES as u8);
108 self.lex_id = id
109 }
110
111 #[inline]
112 fn word_id(&self, raw_id: u32) -> WordId {
113 WordId::new(self.lex_id, raw_id)
114 }
115
116 #[inline]
118 pub fn lookup(
119 &'a self,
120 input: &'a [u8],
121 offset: usize,
122 ) -> impl Iterator<Item = LexiconEntry> + 'a {
123 debug_assert!(self.lex_id < MAX_DICTIONARIES as u8);
124 self.trie
125 .common_prefix_iterator(input, offset)
126 .flat_map(move |e| {
127 self.word_id_table
128 .entries(e.value as usize)
129 .map(move |wid| LexiconEntry::new(self.word_id(wid), e.end))
130 })
131 }
132
133 pub fn get_word_info(&self, word_id: u32, subset: InfoSubset) -> SudachiResult<WordInfo> {
137 self.word_infos.get_word_info(word_id, subset)
138 }
139
140 #[inline]
143 pub fn get_word_param(&self, word_id: u32) -> (i16, i16, i16) {
144 self.word_params.get_params(word_id)
145 }
146
147 pub fn update_cost<D: DictionaryAccess>(&mut self, dict: &D) -> SudachiResult<()> {
149 let mut tok = StatefulTokenizer::create(dict, false, Mode::C);
150 let mut ms = MorphemeList::empty(dict);
151 for wid in 0..self.word_params.size() {
152 if self.word_params.get_cost(wid) != i16::MIN {
153 continue;
154 }
155 let wi = self.get_word_info(wid, InfoSubset::SURFACE)?;
156 tok.reset().push_str(wi.surface());
157 tok.do_tokenize()?;
158 ms.collect_results(&mut tok)?;
159 let internal_cost = ms.get_internal_cost();
160 let cost = internal_cost + Lexicon::USER_DICT_COST_PER_MORPH * ms.len() as i32;
161 let cost = cmp::min(cost, i16::MAX as i32);
162 let cost = cmp::max(cost, i16::MIN as i32);
163 self.word_params.set_cost(wid, cost as i16);
164 }
165
166 Ok(())
167 }
168
169 pub fn size(&self) -> u32 {
170 self.word_params.size()
171 }
172}
173
174fn u32_parser_offset(input: &[u8], offset: usize) -> SudachiNomResult<&[u8], u32> {
175 nom::sequence::preceded(take(offset), le_u32)(input)
176}
177
178fn trie_array_parser(input: &[u8], offset: usize, trie_size: u32) -> SudachiResult<&[u8]> {
179 let trie_start = offset;
180 let trie_end = offset + (trie_size as usize) * size_of::<u32>();
181 if input.len() < trie_start {
182 return Err(SudachiError::InvalidRange(trie_start, trie_end));
183 }
184 if input.len() < trie_end {
185 return Err(SudachiError::InvalidRange(trie_start, trie_end));
186 }
187 let trie_data = &input[trie_start..trie_end];
188 Ok(trie_data)
189}