sudachi/analysis/
mlist.rs

1/*
2 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use crate::analysis::morpheme::Morpheme;
18use crate::analysis::node::{PathCost, ResultNode};
19use crate::analysis::stateful_tokenizer::StatefulTokenizer;
20use crate::analysis::stateless_tokenizer::DictionaryAccess;
21use crate::analysis::{Mode, Node};
22use crate::dic::subset::InfoSubset;
23use crate::error::{SudachiError, SudachiResult};
24use crate::input_text::InputBuffer;
25use std::cell::{Ref, RefCell};
26use std::iter::FusedIterator;
27use std::ops::{Deref, DerefMut, Index};
28use std::rc::Rc;
29
30struct InputPart {
31    input: InputBuffer,
32    subset: InfoSubset,
33}
34
35impl Default for InputPart {
36    fn default() -> Self {
37        let mut input = InputBuffer::new();
38        input.start_build().unwrap();
39        Self {
40            input,
41            subset: Default::default(),
42        }
43    }
44}
45
46#[derive(Default)]
47struct Nodes {
48    data: Vec<ResultNode>,
49}
50
51impl Nodes {
52    fn mut_data(&mut self) -> &mut Vec<ResultNode> {
53        &mut self.data
54    }
55}
56
57pub struct MorphemeList<T> {
58    dict: T,
59    input: Rc<RefCell<InputPart>>,
60    nodes: Nodes,
61}
62
63impl<T: DictionaryAccess> MorphemeList<T> {
64    /// Returns an empty morpheme list
65    pub fn empty(dict: T) -> Self {
66        let input = Default::default();
67        Self {
68            dict,
69            input: Rc::new(RefCell::new(input)),
70            nodes: Default::default(),
71        }
72    }
73
74    /// Creates MorphemeList from components
75    pub fn from_components(
76        dict: T,
77        input: InputBuffer,
78        path: Vec<ResultNode>,
79        subset: InfoSubset,
80    ) -> Self {
81        let input = InputPart { input, subset };
82        Self {
83            dict,
84            input: Rc::new(RefCell::new(input)),
85            nodes: Nodes { data: path },
86        }
87    }
88
89    pub fn collect_results<U: DictionaryAccess>(
90        &mut self,
91        analyzer: &mut StatefulTokenizer<U>,
92    ) -> SudachiResult<()> {
93        match self.input.try_borrow_mut() {
94            Ok(mut i) => {
95                let mref = i.deref_mut();
96                analyzer.swap_result(&mut mref.input, self.nodes.mut_data(), &mut mref.subset);
97                Ok(())
98            }
99            Err(_) => Err(SudachiError::MorphemeListBorrowed),
100        }
101    }
102
103    /// Splits morphemes and writes them into the resulting list
104    /// The resulting list is _not_ cleared before that
105    /// Returns true if split produced more than two elements
106    pub fn split_into(&self, mode: Mode, index: usize, out: &mut Self) -> SudachiResult<bool> {
107        let node = self.node(index);
108        let num_splits = node.num_splits(mode);
109
110        if num_splits == 0 {
111            Ok(false)
112        } else {
113            out.assign_input(self);
114            let data = out.nodes.mut_data();
115            let input = self.input();
116            let subset = self.subset();
117            data.reserve(num_splits);
118            for n in node.split(mode, self.dict().lexicon(), subset, input.deref()) {
119                data.push(n);
120            }
121            Ok(true)
122        }
123    }
124
125    /// Clears morphemes from analysis result
126    pub fn clear(&mut self) {
127        self.nodes.mut_data().clear();
128    }
129
130    pub fn len(&self) -> usize {
131        self.nodes.data.len()
132    }
133
134    pub fn is_empty(&self) -> bool {
135        self.nodes.data.is_empty()
136    }
137
138    pub fn get(&self, idx: usize) -> Morpheme<T> {
139        return Morpheme::for_list(self, idx);
140    }
141
142    pub fn surface(&self) -> Ref<str> {
143        let inp = self.input();
144        Ref::map(inp, |i| i.original())
145    }
146
147    pub fn iter(&self) -> MorphemeIter<T> {
148        MorphemeIter {
149            index: 0,
150            list: self,
151        }
152    }
153
154    /// Gets the whole cost of the path
155    pub fn get_internal_cost(&self) -> i32 {
156        let len = self.len();
157        if len == 0 {
158            return 0;
159        }
160
161        let first_node = self.node(0);
162        let last_node = self.node(len - 1);
163        last_node.total_cost() - first_node.total_cost()
164    }
165
166    pub(crate) fn node(&self, idx: usize) -> &ResultNode {
167        self.nodes.data.index(idx)
168    }
169
170    pub fn dict(&self) -> &T {
171        &self.dict
172    }
173
174    pub(crate) fn input(&self) -> Ref<InputBuffer> {
175        Ref::map(self.input.deref().borrow(), |x| &x.input)
176    }
177
178    /// Makes this point to the input of another MorphemeList
179    pub(crate) fn assign_input(&mut self, other: &Self) {
180        if self.input.as_ptr() != other.input.as_ptr() {
181            self.input = other.input.clone();
182        }
183    }
184
185    pub fn subset(&self) -> InfoSubset {
186        self.input.deref().borrow().subset
187    }
188
189    pub fn copy_slice(&self, start: usize, end: usize, out: &mut Self) {
190        let out_data = out.nodes.mut_data();
191        out_data.extend_from_slice(&self.nodes.data[start..end]);
192    }
193
194    pub fn lookup(&mut self, query: &str, subset: InfoSubset) -> SudachiResult<usize> {
195        let end_chars = {
196            let input = &mut self.input.borrow_mut().input;
197            input.reset().push_str(query);
198            input.start_build()?;
199            input.build(self.dict.grammar())?;
200            input.ch_idx(query.len())
201        };
202
203        let mut result = 0;
204        let lex = self.dict.lexicon();
205        for entry in lex.lookup(query.as_bytes(), 0) {
206            if entry.end != query.len() {
207                continue;
208            }
209            let info = lex.get_word_info_subset(entry.word_id, subset)?;
210            let node = Node::new(0, end_chars as _, 0, 0, 0, entry.word_id);
211            self.nodes
212                .data
213                .push(ResultNode::new(node, 0, 0, query.len() as _, info));
214            result += 1;
215        }
216        Ok(result)
217    }
218}
219
220impl<T: DictionaryAccess + Clone> MorphemeList<T> {
221    pub fn empty_clone(&self) -> Self {
222        Self {
223            dict: self.dict.clone(),
224            input: self.input.clone(),
225            nodes: Default::default(),
226        }
227    }
228
229    /// Returns a new morpheme list splitting the morpheme with a given mode.
230    /// Returns an empty list if there was no splits
231    #[deprecated(note = "use split_into", since = "0.6.1")]
232    pub fn split(&self, mode: Mode, index: usize) -> SudachiResult<MorphemeList<T>> {
233        let mut list = self.empty_clone();
234        if !self.split_into(mode, index, &mut list)? {
235            list.nodes.mut_data().push(self.node(index).clone())
236        }
237        Ok(list)
238    }
239}
240
241/// Iterates over morpheme list
242pub struct MorphemeIter<'a, T> {
243    list: &'a MorphemeList<T>,
244    index: usize,
245}
246
247impl<'a, T: DictionaryAccess> Iterator for MorphemeIter<'a, T> {
248    type Item = Morpheme<'a, T>;
249
250    fn next(&mut self) -> Option<Self::Item> {
251        if self.index >= self.list.len() {
252            return None;
253        }
254
255        let morpheme = Morpheme::for_list(self.list, self.index);
256
257        self.index += 1;
258        Some(morpheme)
259    }
260
261    fn size_hint(&self) -> (usize, Option<usize>) {
262        let rem = self.list.len() - self.index;
263        (rem, Some(rem))
264    }
265}
266
267impl<'a, T: DictionaryAccess> FusedIterator for MorphemeIter<'a, T> {}
268
269impl<'a, T: DictionaryAccess> ExactSizeIterator for MorphemeIter<'a, T> {
270    fn len(&self) -> usize {
271        self.size_hint().0
272    }
273}