sudachi/analysis/
morpheme.rs

1/*
2 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use crate::analysis::node::{LatticeNode, PathCost, ResultNode};
18use crate::analysis::stateless_tokenizer::DictionaryAccess;
19use crate::dic::lexicon::word_infos::WordInfo;
20use crate::dic::word_id::WordId;
21use crate::input_text::InputTextIndex;
22use crate::prelude::*;
23use std::cell::Ref;
24
25/// A morpheme (basic semantic unit of language)
26pub struct Morpheme<'a, T> {
27    list: &'a MorphemeList<T>,
28    index: usize,
29}
30
31impl<T: DictionaryAccess> Morpheme<'_, T> {
32    /// Returns the part of speech
33    pub fn part_of_speech(&self) -> &[String] {
34        self.list
35            .dict()
36            .grammar()
37            .pos_components(self.part_of_speech_id())
38    }
39}
40
41impl<T: DictionaryAccess + Clone> Morpheme<'_, T> {
42    /// Returns new morpheme list splitting the morpheme with given mode.
43    #[deprecated(note = "use split_into", since = "0.6.1")]
44    pub fn split(&self, mode: Mode) -> SudachiResult<MorphemeList<T>> {
45        #[allow(deprecated)]
46        self.list.split(mode, self.index)
47    }
48}
49
50impl<'a, T: DictionaryAccess> Morpheme<'a, T> {
51    pub(crate) fn for_list(list: &'a MorphemeList<T>, index: usize) -> Self {
52        Morpheme { list, index }
53    }
54
55    #[inline]
56    pub(crate) fn node(&self) -> &ResultNode {
57        self.list.node(self.index)
58    }
59
60    /// Returns the begin index in bytes of the morpheme in the original text
61    pub fn begin(&self) -> usize {
62        self.list.input().to_orig_byte_idx(self.node().begin())
63    }
64
65    /// Returns the end index in bytes of the morpheme in the original text
66    pub fn end(&self) -> usize {
67        self.list.input().to_orig_byte_idx(self.node().end())
68    }
69
70    /// Returns the codepoint offset of the morpheme begin in the original text
71    pub fn begin_c(&self) -> usize {
72        self.list.input().to_orig_char_idx(self.node().begin())
73    }
74
75    /// Returns the codepoint offset of the morpheme begin in the original text
76    pub fn end_c(&self) -> usize {
77        self.list.input().to_orig_char_idx(self.node().end())
78    }
79
80    /// Returns a substring of the original text which corresponds to the morpheme
81    pub fn surface(&self) -> Ref<str> {
82        let inp = self.list.input();
83        Ref::map(inp, |i| i.orig_slice(self.node().bytes_range()))
84    }
85
86    pub fn part_of_speech_id(&self) -> u16 {
87        self.node().word_info().pos_id()
88    }
89
90    /// Returns the dictionary form of morpheme
91    ///
92    /// "Dictionary form" means a word's lemma and "終止形" in Japanese.
93    pub fn dictionary_form(&self) -> &str {
94        self.get_word_info().dictionary_form()
95    }
96
97    /// Returns the normalized form of morpheme
98    ///
99    /// This method returns the form normalizing inconsistent spellings and inflected forms
100    pub fn normalized_form(&self) -> &str {
101        self.get_word_info().normalized_form()
102    }
103
104    /// Returns the reading form of morpheme.
105    ///
106    /// Returns Japanese syllabaries 'フリガナ' in katakana.
107    pub fn reading_form(&self) -> &str {
108        self.get_word_info().reading_form()
109    }
110
111    /// Returns if this morpheme is out of vocabulary
112    pub fn is_oov(&self) -> bool {
113        self.word_id().is_oov()
114    }
115
116    /// Returns the word id of morpheme
117    pub fn word_id(&self) -> WordId {
118        self.node().word_id()
119    }
120
121    /// Returns the dictionary id where the morpheme belongs
122    ///
123    /// Returns -1 if the morpheme is oov
124    pub fn dictionary_id(&self) -> i32 {
125        let wid = self.word_id();
126        if wid.is_oov() {
127            -1
128        } else {
129            wid.dic() as i32
130        }
131    }
132
133    pub fn synonym_group_ids(&self) -> &[u32] {
134        self.get_word_info().synonym_group_ids()
135    }
136
137    pub fn get_word_info(&self) -> &WordInfo {
138        self.node().word_info()
139    }
140
141    /// Returns the index of this morpheme
142    pub fn index(&self) -> usize {
143        self.index
144    }
145
146    /// Splits morpheme and writes sub-morphemes into the provided list.
147    /// The resulting list is _not_ cleared before that.
148    /// Returns true if split has produced any elements.
149    pub fn split_into(&self, mode: Mode, out: &mut MorphemeList<T>) -> SudachiResult<bool> {
150        self.list.split_into(mode, self.index, out)
151    }
152
153    /// Returns total cost from the beginning of the path
154    pub fn total_cost(&self) -> i32 {
155        return self.node().total_cost();
156    }
157}
158
159impl<T: DictionaryAccess> std::fmt::Debug for Morpheme<'_, T> {
160    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
161        f.debug_struct("Morpheme")
162            .field("surface", &self.surface())
163            .field("pos", &self.part_of_speech())
164            .field("normalized_form", &self.normalized_form())
165            .field("reading_form", &self.reading_form())
166            .field("dictionary_form", &self.dictionary_form())
167            .finish()
168    }
169}