sudachi/analysis/
mlist.rs1use crate::analysis::morpheme::Morpheme;
18use crate::analysis::node::{PathCost, ResultNode};
19use crate::analysis::stateful_tokenizer::StatefulTokenizer;
20use crate::analysis::stateless_tokenizer::DictionaryAccess;
21use crate::analysis::{Mode, Node};
22use crate::dic::subset::InfoSubset;
23use crate::error::{SudachiError, SudachiResult};
24use crate::input_text::InputBuffer;
25use std::cell::{Ref, RefCell};
26use std::iter::FusedIterator;
27use std::ops::{Deref, DerefMut, Index};
28use std::rc::Rc;
29
30struct InputPart {
31 input: InputBuffer,
32 subset: InfoSubset,
33}
34
35impl Default for InputPart {
36 fn default() -> Self {
37 let mut input = InputBuffer::new();
38 input.start_build().unwrap();
39 Self {
40 input,
41 subset: Default::default(),
42 }
43 }
44}
45
46#[derive(Default)]
47struct Nodes {
48 data: Vec<ResultNode>,
49}
50
51impl Nodes {
52 fn mut_data(&mut self) -> &mut Vec<ResultNode> {
53 &mut self.data
54 }
55}
56
57pub struct MorphemeList<T> {
58 dict: T,
59 input: Rc<RefCell<InputPart>>,
60 nodes: Nodes,
61}
62
63impl<T: DictionaryAccess> MorphemeList<T> {
64 pub fn empty(dict: T) -> Self {
66 let input = Default::default();
67 Self {
68 dict,
69 input: Rc::new(RefCell::new(input)),
70 nodes: Default::default(),
71 }
72 }
73
74 pub fn from_components(
76 dict: T,
77 input: InputBuffer,
78 path: Vec<ResultNode>,
79 subset: InfoSubset,
80 ) -> Self {
81 let input = InputPart { input, subset };
82 Self {
83 dict,
84 input: Rc::new(RefCell::new(input)),
85 nodes: Nodes { data: path },
86 }
87 }
88
89 pub fn collect_results<U: DictionaryAccess>(
90 &mut self,
91 analyzer: &mut StatefulTokenizer<U>,
92 ) -> SudachiResult<()> {
93 match self.input.try_borrow_mut() {
94 Ok(mut i) => {
95 let mref = i.deref_mut();
96 analyzer.swap_result(&mut mref.input, self.nodes.mut_data(), &mut mref.subset);
97 Ok(())
98 }
99 Err(_) => Err(SudachiError::MorphemeListBorrowed),
100 }
101 }
102
103 pub fn split_into(&self, mode: Mode, index: usize, out: &mut Self) -> SudachiResult<bool> {
107 let node = self.node(index);
108 let num_splits = node.num_splits(mode);
109
110 if num_splits == 0 {
111 Ok(false)
112 } else {
113 out.assign_input(self);
114 let data = out.nodes.mut_data();
115 let input = self.input();
116 let subset = self.subset();
117 data.reserve(num_splits);
118 for n in node.split(mode, self.dict().lexicon(), subset, input.deref()) {
119 data.push(n);
120 }
121 Ok(true)
122 }
123 }
124
125 pub fn clear(&mut self) {
127 self.nodes.mut_data().clear();
128 }
129
130 pub fn len(&self) -> usize {
131 self.nodes.data.len()
132 }
133
134 pub fn is_empty(&self) -> bool {
135 self.nodes.data.is_empty()
136 }
137
138 pub fn get(&self, idx: usize) -> Morpheme<T> {
139 return Morpheme::for_list(self, idx);
140 }
141
142 pub fn surface(&self) -> Ref<str> {
143 let inp = self.input();
144 Ref::map(inp, |i| i.original())
145 }
146
147 pub fn iter(&self) -> MorphemeIter<T> {
148 MorphemeIter {
149 index: 0,
150 list: self,
151 }
152 }
153
154 pub fn get_internal_cost(&self) -> i32 {
156 let len = self.len();
157 if len == 0 {
158 return 0;
159 }
160
161 let first_node = self.node(0);
162 let last_node = self.node(len - 1);
163 last_node.total_cost() - first_node.total_cost()
164 }
165
166 pub(crate) fn node(&self, idx: usize) -> &ResultNode {
167 self.nodes.data.index(idx)
168 }
169
170 pub fn dict(&self) -> &T {
171 &self.dict
172 }
173
174 pub(crate) fn input(&self) -> Ref<InputBuffer> {
175 Ref::map(self.input.deref().borrow(), |x| &x.input)
176 }
177
178 pub(crate) fn assign_input(&mut self, other: &Self) {
180 if self.input.as_ptr() != other.input.as_ptr() {
181 self.input = other.input.clone();
182 }
183 }
184
185 pub fn subset(&self) -> InfoSubset {
186 self.input.deref().borrow().subset
187 }
188
189 pub fn copy_slice(&self, start: usize, end: usize, out: &mut Self) {
190 let out_data = out.nodes.mut_data();
191 out_data.extend_from_slice(&self.nodes.data[start..end]);
192 }
193
194 pub fn lookup(&mut self, query: &str, subset: InfoSubset) -> SudachiResult<usize> {
195 let end_chars = {
196 let input = &mut self.input.borrow_mut().input;
197 input.reset().push_str(query);
198 input.start_build()?;
199 input.build(self.dict.grammar())?;
200 input.ch_idx(query.len())
201 };
202
203 let mut result = 0;
204 let lex = self.dict.lexicon();
205 for entry in lex.lookup(query.as_bytes(), 0) {
206 if entry.end != query.len() {
207 continue;
208 }
209 let info = lex.get_word_info_subset(entry.word_id, subset)?;
210 let node = Node::new(0, end_chars as _, 0, 0, 0, entry.word_id);
211 self.nodes
212 .data
213 .push(ResultNode::new(node, 0, 0, query.len() as _, info));
214 result += 1;
215 }
216 Ok(result)
217 }
218}
219
220impl<T: DictionaryAccess + Clone> MorphemeList<T> {
221 pub fn empty_clone(&self) -> Self {
222 Self {
223 dict: self.dict.clone(),
224 input: self.input.clone(),
225 nodes: Default::default(),
226 }
227 }
228
229 #[deprecated(note = "use split_into", since = "0.6.1")]
232 pub fn split(&self, mode: Mode, index: usize) -> SudachiResult<MorphemeList<T>> {
233 let mut list = self.empty_clone();
234 if !self.split_into(mode, index, &mut list)? {
235 list.nodes.mut_data().push(self.node(index).clone())
236 }
237 Ok(list)
238 }
239}
240
241pub struct MorphemeIter<'a, T> {
243 list: &'a MorphemeList<T>,
244 index: usize,
245}
246
247impl<'a, T: DictionaryAccess> Iterator for MorphemeIter<'a, T> {
248 type Item = Morpheme<'a, T>;
249
250 fn next(&mut self) -> Option<Self::Item> {
251 if self.index >= self.list.len() {
252 return None;
253 }
254
255 let morpheme = Morpheme::for_list(self.list, self.index);
256
257 self.index += 1;
258 Some(morpheme)
259 }
260
261 fn size_hint(&self) -> (usize, Option<usize>) {
262 let rem = self.list.len() - self.index;
263 (rem, Some(rem))
264 }
265}
266
267impl<'a, T: DictionaryAccess> FusedIterator for MorphemeIter<'a, T> {}
268
269impl<'a, T: DictionaryAccess> ExactSizeIterator for MorphemeIter<'a, T> {
270 fn len(&self) -> usize {
271 self.size_hint().0
272 }
273}