sudachi/dic/lexicon/
word_infos.rs1use std::iter::FusedIterator;
18
19use crate::dic::lexicon_set::LexiconSet;
20use crate::dic::read::u32_parser;
21use crate::dic::read::word_info::WordInfoParser;
22use crate::dic::subset::InfoSubset;
23use crate::dic::word_id::WordId;
24use crate::prelude::*;
25
/// Borrowed view used to look up per-word information inside a byte buffer.
pub struct WordInfos<'a> {
    /// Buffer that every lookup indexes into.
    bytes: &'a [u8],
    /// Position in `bytes` of a table of 4-byte entries, one per word id;
    /// each entry is read as a `u32` and used as a position in `bytes`.
    offset: usize,
    /// Stored at construction; not read by any code in this file.
    _word_size: u32,
    /// When `false`, lookups drop `InfoSubset::SYNONYM_GROUP_ID` from the
    /// requested subset before parsing.
    has_synonym_group_ids: bool,
}
32
33impl<'a> WordInfos<'a> {
34 pub fn new(
35 bytes: &'a [u8],
36 offset: usize,
37 _word_size: u32,
38 has_synonym_group_ids: bool,
39 ) -> WordInfos {
40 WordInfos {
41 bytes,
42 offset,
43 _word_size,
44 has_synonym_group_ids,
45 }
46 }
47
48 fn word_id_to_offset(&self, word_id: u32) -> SudachiResult<usize> {
49 Ok(u32_parser(&self.bytes[self.offset + (4 * word_id as usize)..])?.1 as usize)
50 }
51
52 fn parse_word_info(&self, word_id: u32, subset: InfoSubset) -> SudachiResult<WordInfoData> {
53 let index = self.word_id_to_offset(word_id)?;
54 let parser = WordInfoParser::subset(subset);
55 parser.parse(&self.bytes[index..])
56 }
57
58 pub fn get_word_info(&self, word_id: u32, mut subset: InfoSubset) -> SudachiResult<WordInfo> {
59 if !self.has_synonym_group_ids {
60 subset -= InfoSubset::SYNONYM_GROUP_ID;
61 }
62
63 let mut word_info = self.parse_word_info(word_id, subset)?;
64
65 let dfwi = word_info.dictionary_form_word_id;
67 if (dfwi >= 0) && (dfwi != word_id as i32) {
68 let inner = self.parse_word_info(dfwi as u32, InfoSubset::SURFACE)?;
69 word_info.dictionary_form = inner.surface;
70 };
71
72 Ok(word_info.into())
73 }
74}
75
76#[derive(Clone, Debug, Default)]
82pub struct WordInfoData {
83 pub surface: String,
84 pub head_word_length: u16,
85 pub pos_id: u16,
86 pub normalized_form: String,
87 pub dictionary_form_word_id: i32,
88 pub dictionary_form: String,
89 pub reading_form: String,
90 pub a_unit_split: Vec<WordId>,
91 pub b_unit_split: Vec<WordId>,
92 pub word_structure: Vec<WordId>,
93 pub synonym_group_ids: Vec<u32>,
94}
95
96#[derive(Clone, Default)]
102#[repr(transparent)]
103pub struct WordInfo {
104 data: WordInfoData,
105}
106
107impl WordInfo {
108 pub fn surface(&self) -> &str {
109 &self.data.surface
110 }
111
112 pub fn head_word_length(&self) -> usize {
113 self.data.head_word_length as usize
114 }
115
116 pub fn pos_id(&self) -> u16 {
117 self.data.pos_id
118 }
119
120 pub fn normalized_form(&self) -> &str {
121 if self.data.normalized_form.is_empty() {
122 self.surface()
123 } else {
124 &self.data.normalized_form
125 }
126 }
127
128 pub fn dictionary_form_word_id(&self) -> i32 {
129 self.data.dictionary_form_word_id
130 }
131
132 pub fn dictionary_form(&self) -> &str {
133 if self.data.dictionary_form.is_empty() {
134 self.surface()
135 } else {
136 &self.data.dictionary_form
137 }
138 }
139
140 pub fn reading_form(&self) -> &str {
141 if self.data.reading_form.is_empty() {
142 self.surface()
143 } else {
144 &self.data.reading_form
145 }
146 }
147
148 pub fn a_unit_split(&self) -> &[WordId] {
149 &self.data.a_unit_split
150 }
151
152 pub fn b_unit_split(&self) -> &[WordId] {
153 &self.data.b_unit_split
154 }
155
156 pub fn word_structure(&self) -> &[WordId] {
157 &self.data.word_structure
158 }
159
160 pub fn synonym_group_ids(&self) -> &[u32] {
161 &self.data.synonym_group_ids
162 }
163
164 pub fn borrow_data(&self) -> &WordInfoData {
165 &self.data
166 }
167}
168
169impl From<WordInfoData> for WordInfo {
170 fn from(data: WordInfoData) -> Self {
171 WordInfo { data }
172 }
173}
174
175impl From<WordInfo> for WordInfoData {
176 fn from(info: WordInfo) -> Self {
177 info.data
178 }
179}
180
181struct SplitIter<'a> {
182 index: usize,
183 split: &'a [WordId],
184 lexicon: &'a LexiconSet<'a>,
185}
186
187impl Iterator for SplitIter<'_> {
188 type Item = SudachiResult<WordInfo>;
189
190 fn next(&mut self) -> Option<Self::Item> {
191 let idx = self.index;
192 if idx >= self.split.len() {
193 None
194 } else {
195 self.index += 1;
196 Some(self.lexicon.get_word_info(self.split[idx]))
197 }
198 }
199
200 fn size_hint(&self) -> (usize, Option<usize>) {
201 let rem = self.split.len() - self.index;
202 (rem, Some(rem))
203 }
204}
205
206impl FusedIterator for SplitIter<'_> {}