sudachi/dic/lexicon/
word_infos.rs

1/*
2 * Copyright (c) 2021 Works Applications Co., Ltd.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use std::iter::FusedIterator;
18
19use crate::dic::lexicon_set::LexiconSet;
20use crate::dic::read::u32_parser;
21use crate::dic::read::word_info::WordInfoParser;
22use crate::dic::subset::InfoSubset;
23use crate::dic::word_id::WordId;
24use crate::prelude::*;
25
/// Borrowed view used to look up word information inside a byte buffer.
///
/// A lookup first reads a little table of 4-byte entries (one per word id)
/// that starts at `offset`, and then parses the word info found at the
/// position stored in that entry.
pub struct WordInfos<'a> {
    /// Buffer that both the per-word entry table and the word infos are read from.
    bytes: &'a [u8],
    /// Position in `bytes` where the table of 4-byte per-word entries starts.
    offset: usize,
    /// Presumably the number of words in the table; it is stored but never
    /// read by the code in this file.
    _word_size: u32,
    /// When `false`, synonym group ids are removed from every requested subset.
    has_synonym_group_ids: bool,
}
32
33impl<'a> WordInfos<'a> {
34    pub fn new(
35        bytes: &'a [u8],
36        offset: usize,
37        _word_size: u32,
38        has_synonym_group_ids: bool,
39    ) -> WordInfos {
40        WordInfos {
41            bytes,
42            offset,
43            _word_size,
44            has_synonym_group_ids,
45        }
46    }
47
48    fn word_id_to_offset(&self, word_id: u32) -> SudachiResult<usize> {
49        Ok(u32_parser(&self.bytes[self.offset + (4 * word_id as usize)..])?.1 as usize)
50    }
51
52    fn parse_word_info(&self, word_id: u32, subset: InfoSubset) -> SudachiResult<WordInfoData> {
53        let index = self.word_id_to_offset(word_id)?;
54        let parser = WordInfoParser::subset(subset);
55        parser.parse(&self.bytes[index..])
56    }
57
58    pub fn get_word_info(&self, word_id: u32, mut subset: InfoSubset) -> SudachiResult<WordInfo> {
59        if !self.has_synonym_group_ids {
60            subset -= InfoSubset::SYNONYM_GROUP_ID;
61        }
62
63        let mut word_info = self.parse_word_info(word_id, subset)?;
64
65        // consult dictionary form
66        let dfwi = word_info.dictionary_form_word_id;
67        if (dfwi >= 0) && (dfwi != word_id as i32) {
68            let inner = self.parse_word_info(dfwi as u32, InfoSubset::SURFACE)?;
69            word_info.dictionary_form = inner.surface;
70        };
71
72        Ok(word_info.into())
73    }
74}
75
76/// Internal storage of the WordInfo.
77/// It is not accessible by default, but a WordInfo can be created from it:
78/// `let wi: WordInfo = data.into();`
79///
80/// String fields CAN be empty, in this case the value of the surface field should be used instead
81#[derive(Clone, Debug, Default)]
82pub struct WordInfoData {
83    pub surface: String,
84    pub head_word_length: u16,
85    pub pos_id: u16,
86    pub normalized_form: String,
87    pub dictionary_form_word_id: i32,
88    pub dictionary_form: String,
89    pub reading_form: String,
90    pub a_unit_split: Vec<WordId>,
91    pub b_unit_split: Vec<WordId>,
92    pub word_structure: Vec<WordId>,
93    pub synonym_group_ids: Vec<u32>,
94}
95
96/// WordInfo API.
97///
98/// Internal data is not accessible by default, but can be extracted as
99/// `let data: WordInfoData = info.into()`.
100/// Note: this will consume WordInfo.
101#[derive(Clone, Default)]
102#[repr(transparent)]
103pub struct WordInfo {
104    data: WordInfoData,
105}
106
107impl WordInfo {
108    pub fn surface(&self) -> &str {
109        &self.data.surface
110    }
111
112    pub fn head_word_length(&self) -> usize {
113        self.data.head_word_length as usize
114    }
115
116    pub fn pos_id(&self) -> u16 {
117        self.data.pos_id
118    }
119
120    pub fn normalized_form(&self) -> &str {
121        if self.data.normalized_form.is_empty() {
122            self.surface()
123        } else {
124            &self.data.normalized_form
125        }
126    }
127
128    pub fn dictionary_form_word_id(&self) -> i32 {
129        self.data.dictionary_form_word_id
130    }
131
132    pub fn dictionary_form(&self) -> &str {
133        if self.data.dictionary_form.is_empty() {
134            self.surface()
135        } else {
136            &self.data.dictionary_form
137        }
138    }
139
140    pub fn reading_form(&self) -> &str {
141        if self.data.reading_form.is_empty() {
142            self.surface()
143        } else {
144            &self.data.reading_form
145        }
146    }
147
148    pub fn a_unit_split(&self) -> &[WordId] {
149        &self.data.a_unit_split
150    }
151
152    pub fn b_unit_split(&self) -> &[WordId] {
153        &self.data.b_unit_split
154    }
155
156    pub fn word_structure(&self) -> &[WordId] {
157        &self.data.word_structure
158    }
159
160    pub fn synonym_group_ids(&self) -> &[u32] {
161        &self.data.synonym_group_ids
162    }
163
164    pub fn borrow_data(&self) -> &WordInfoData {
165        &self.data
166    }
167}
168
169impl From<WordInfoData> for WordInfo {
170    fn from(data: WordInfoData) -> Self {
171        WordInfo { data }
172    }
173}
174
175impl From<WordInfo> for WordInfoData {
176    fn from(info: WordInfo) -> Self {
177        info.data
178    }
179}
180
181struct SplitIter<'a> {
182    index: usize,
183    split: &'a [WordId],
184    lexicon: &'a LexiconSet<'a>,
185}
186
187impl Iterator for SplitIter<'_> {
188    type Item = SudachiResult<WordInfo>;
189
190    fn next(&mut self) -> Option<Self::Item> {
191        let idx = self.index;
192        if idx >= self.split.len() {
193            None
194        } else {
195            self.index += 1;
196            Some(self.lexicon.get_word_info(self.split[idx]))
197        }
198    }
199
200    fn size_hint(&self) -> (usize, Option<usize>) {
201        let rem = self.split.len() - self.index;
202        (rem, Some(rem))
203    }
204}
205
206impl FusedIterator for SplitIter<'_> {}