sudachi/dic/
lexicon_set.rs1use thiserror::Error;
18
19use crate::dic::lexicon::word_infos::{WordInfo, WordInfoData};
20use crate::dic::lexicon::{Lexicon, LexiconEntry, MAX_DICTIONARIES};
21use crate::dic::subset::InfoSubset;
22use crate::dic::word_id::WordId;
23use crate::prelude::*;
24
25#[derive(Error, Debug, Eq, PartialEq)]
27pub enum LexiconSetError {
28 #[error("too large word_id {0} in dict {1}")]
29 TooLargeWordId(u32, usize),
30
31 #[error("too large dictionary_id {0}")]
32 TooLargeDictionaryId(usize),
33
34 #[error("too many user dictionaries")]
35 TooManyDictionaries,
36}
37
38pub struct LexiconSet<'a> {
43 lexicons: Vec<Lexicon<'a>>,
44 pos_offsets: Vec<usize>,
45 num_system_pos: usize,
46}
47
48impl<'a> LexiconSet<'a> {
49 pub fn new(mut system_lexicon: Lexicon, num_system_pos: usize) -> LexiconSet {
53 system_lexicon.set_dic_id(0);
54 LexiconSet {
55 lexicons: vec![system_lexicon],
56 pos_offsets: vec![0],
57 num_system_pos,
58 }
59 }
60
61 pub fn append(
65 &mut self,
66 mut lexicon: Lexicon<'a>,
67 pos_offset: usize,
68 ) -> Result<(), LexiconSetError> {
69 if self.is_full() {
70 return Err(LexiconSetError::TooManyDictionaries);
71 }
72 lexicon.set_dic_id(self.lexicons.len() as u8);
73 self.lexicons.push(lexicon);
74 self.pos_offsets.push(pos_offset);
75 Ok(())
76 }
77
78 pub fn is_full(&self) -> bool {
80 self.lexicons.len() >= MAX_DICTIONARIES
81 }
82}
83
84impl LexiconSet<'_> {
85 #[inline]
89 pub fn lookup<'b>(
90 &'b self,
91 input: &'b [u8],
92 offset: usize,
93 ) -> impl Iterator<Item = LexiconEntry> + 'b {
94 self.lexicons
96 .iter()
97 .rev()
98 .flat_map(move |l| l.lookup(input, offset))
99 }
100
101 pub fn get_word_info(&self, id: WordId) -> SudachiResult<WordInfo> {
103 self.get_word_info_subset(id, InfoSubset::all())
104 }
105
106 pub fn get_word_info_subset(&self, id: WordId, subset: InfoSubset) -> SudachiResult<WordInfo> {
110 let dict_id = id.dic();
111 let mut word_info: WordInfoData = self.lexicons[dict_id as usize]
112 .get_word_info(id.word(), subset)?
113 .into();
114
115 if subset.contains(InfoSubset::POS_ID) {
116 let pos_id = word_info.pos_id as usize;
117 if dict_id > 0 && pos_id >= self.num_system_pos {
118 word_info.pos_id =
120 (pos_id - self.num_system_pos + self.pos_offsets[dict_id as usize]) as u16;
121 }
122 }
123
124 if subset.contains(InfoSubset::SPLIT_A) {
125 Self::update_dict_id(&mut word_info.a_unit_split, dict_id)?;
126 }
127
128 if subset.contains(InfoSubset::SPLIT_B) {
129 Self::update_dict_id(&mut word_info.b_unit_split, dict_id)?;
130 }
131
132 if subset.contains(InfoSubset::WORD_STRUCTURE) {
133 Self::update_dict_id(&mut word_info.word_structure, dict_id)?;
134 }
135
136 Ok(word_info.into())
137 }
138
139 pub fn get_word_param(&self, id: WordId) -> (i16, i16, i16) {
141 let dic_id = id.dic() as usize;
142 self.lexicons[dic_id].get_word_param(id.word())
143 }
144
145 fn update_dict_id(split: &mut Vec<WordId>, dict_id: u8) -> SudachiResult<()> {
146 for id in split.iter_mut() {
147 let cur_dict_id = id.dic();
148 if cur_dict_id > 0 {
149 *id = WordId::checked(dict_id, id.word())?;
151 }
152 }
153 Ok(())
154 }
155
156 pub fn size(&self) -> u32 {
157 self.lexicons.iter().fold(0, |acc, lex| acc + lex.size())
158 }
159}