sudachi/dic/lexicon/
mod.rs

/*
 * Copyright (c) 2021-2024 Works Applications Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17use std::cmp;
18use std::mem::size_of;
19
20use crate::analysis::stateful_tokenizer::StatefulTokenizer;
21use crate::analysis::stateless_tokenizer::DictionaryAccess;
22use crate::dic::subset::InfoSubset;
23use crate::dic::word_id::WordId;
24use nom::{bytes::complete::take, number::complete::le_u32};
25
26use crate::error::SudachiNomResult;
27use crate::prelude::*;
28
29use self::trie::Trie;
30use self::word_id_table::WordIdTable;
31use self::word_infos::{WordInfo, WordInfos};
32use self::word_params::WordParams;
33
34pub mod trie;
35pub mod word_id_table;
36pub mod word_infos;
37pub mod word_params;
38
/// The first 4 bits of word_id are used to indicate from which lexicon
/// the word comes, thus we can only hold 15 lexicons at the same time.
/// The 16th value is reserved for marking OOVs.
pub const MAX_DICTIONARIES: usize = 15;
43
/// Dictionary lexicon
///
/// Contains trie, word_id, word_param, word_info
pub struct Lexicon<'a> {
    /// Trie over surface bytes; matched values index into `word_id_table`
    /// (see `Lexicon::lookup`)
    trie: Trie<'a>,
    /// Maps a trie value to the group of word ids stored under it
    word_id_table: WordIdTable<'a>,
    /// Per-word (left_id, right_id, cost) parameters
    word_params: WordParams<'a>,
    /// Per-word detailed information, fields selected via `InfoSubset`
    word_infos: WordInfos<'a>,
    /// Index of this lexicon among loaded dictionaries;
    /// stays at the `u8::MAX` sentinel until `set_dic_id` is called
    lex_id: u8,
}
54
/// Result of the Lexicon lookup
#[derive(Eq, PartialEq, Debug)]
pub struct LexiconEntry {
    /// Id of the returned word
    pub word_id: WordId,
    /// Byte index of the word end, taken from the trie match
    /// (see `Lexicon::lookup`)
    pub end: usize,
}
63
64impl LexiconEntry {
65    pub fn new(word_id: WordId, end: usize) -> LexiconEntry {
66        LexiconEntry { word_id, end }
67    }
68}
69
impl<'a> Lexicon<'a> {
    /// Per-morpheme discount (negative bonus) applied when estimating a
    /// user-dictionary word cost in `update_cost`
    const USER_DICT_COST_PER_MORPH: i32 = -20;

    /// Parses a lexicon from raw dictionary bytes, starting at
    /// `original_offset` of `buf`.
    ///
    /// Sections are read in order: trie (u32 element count + array),
    /// word-id table (u32 size header + body), word params (u32 size
    /// header + body), then word infos. The returned lexicon has no id
    /// assigned yet (`lex_id == u8::MAX`); call `set_dic_id` before use.
    pub fn parse(
        buf: &[u8],
        original_offset: usize,
        has_synonym_group_ids: bool,
    ) -> SudachiResult<Lexicon> {
        let mut offset = original_offset;

        // Trie: a u32 element count, then the element array itself.
        let (_rest, trie_size) = u32_parser_offset(buf, offset)?;
        offset += 4;
        let trie_array = trie_array_parser(buf, offset, trie_size)?;
        let trie = Trie::new(trie_array, trie_size as usize);
        offset += trie.total_size();

        // Word-id table: u32 size header, body starts right after it.
        let (_rest, word_id_table_size) = u32_parser_offset(buf, offset)?;
        let word_id_table = WordIdTable::new(buf, word_id_table_size, offset + 4);
        offset += word_id_table.storage_size();

        // Word params: u32 size header, body starts right after it.
        let (_rest, word_params_size) = u32_parser_offset(buf, offset)?;
        let word_params = WordParams::new(buf, word_params_size, offset + 4);
        offset += word_params.storage_size();

        // Word infos follow immediately; the entry count equals the
        // number of word params.
        let word_infos = WordInfos::new(buf, offset, word_params.size(), has_synonym_group_ids);

        Ok(Lexicon {
            trie,
            word_id_table,
            word_params,
            word_infos,
            // Sentinel: not yet registered in a dictionary set.
            lex_id: u8::MAX,
        })
    }

    /// Assign lexicon id to the current Lexicon
    ///
    /// Panics if `id` does not fit in the 4-bit dictionary-id space.
    pub fn set_dic_id(&mut self, id: u8) {
        assert!(id < MAX_DICTIONARIES as u8);
        self.lex_id = id
    }

    // Combines this lexicon's id with a raw in-dictionary word id.
    #[inline]
    fn word_id(&self, raw_id: u32) -> WordId {
        WordId::new(self.lex_id, raw_id)
    }

    /// Returns an iterator of word_id and end of words that matches given input
    #[inline]
    pub fn lookup(
        &'a self,
        input: &'a [u8],
        offset: usize,
    ) -> impl Iterator<Item = LexiconEntry> + 'a {
        debug_assert!(self.lex_id < MAX_DICTIONARIES as u8);
        // Each trie hit carries a value that indexes a group of word ids
        // in the word-id table; flatten all groups into one entry stream,
        // pairing every word id with the match's end offset.
        self.trie
            .common_prefix_iterator(input, offset)
            .flat_map(move |e| {
                self.word_id_table
                    .entries(e.value as usize)
                    .map(move |wid| LexiconEntry::new(self.word_id(wid), e.end))
            })
    }

    /// Returns WordInfo for given word_id
    ///
    /// WordInfo will contain only fields included in InfoSubset
    pub fn get_word_info(&self, word_id: u32, subset: InfoSubset) -> SudachiResult<WordInfo> {
        self.word_infos.get_word_info(word_id, subset)
    }

    /// Returns word_param for given word_id.
    /// Params are (left_id, right_id, cost).
    #[inline]
    pub fn get_word_param(&self, word_id: u32) -> (i16, i16, i16) {
        self.word_params.get_params(word_id)
    }

    /// update word_param cost based on current tokenizer
    ///
    /// Only words whose stored cost equals the `i16::MIN` placeholder are
    /// recomputed: the word's surface is analyzed with `dict`, a per-morpheme
    /// discount is applied, and the result is clamped into the `i16` range.
    pub fn update_cost<D: DictionaryAccess>(&mut self, dict: &D) -> SudachiResult<()> {
        let mut tok = StatefulTokenizer::create(dict, false, Mode::C);
        let mut ms = MorphemeList::empty(dict);
        for wid in 0..self.word_params.size() {
            // Skip words that already carry a concrete cost.
            if self.word_params.get_cost(wid) != i16::MIN {
                continue;
            }
            // Only the surface is needed for re-analysis.
            let wi = self.get_word_info(wid, InfoSubset::SURFACE)?;
            tok.reset().push_str(wi.surface());
            tok.do_tokenize()?;
            ms.collect_results(&mut tok)?;
            let internal_cost = ms.get_internal_cost();
            // Discount per produced morpheme, then clamp into i16 range.
            let cost = internal_cost + Lexicon::USER_DICT_COST_PER_MORPH * ms.len() as i32;
            let cost = cmp::min(cost, i16::MAX as i32);
            let cost = cmp::max(cost, i16::MIN as i32);
            self.word_params.set_cost(wid, cost as i16);
        }

        Ok(())
    }

    /// Number of words in this lexicon.
    pub fn size(&self) -> u32 {
        self.word_params.size()
    }
}
173
174fn u32_parser_offset(input: &[u8], offset: usize) -> SudachiNomResult<&[u8], u32> {
175    nom::sequence::preceded(take(offset), le_u32)(input)
176}
177
178fn trie_array_parser(input: &[u8], offset: usize, trie_size: u32) -> SudachiResult<&[u8]> {
179    let trie_start = offset;
180    let trie_end = offset + (trie_size as usize) * size_of::<u32>();
181    if input.len() < trie_start {
182        return Err(SudachiError::InvalidRange(trie_start, trie_end));
183    }
184    if input.len() < trie_end {
185        return Err(SudachiError::InvalidRange(trie_start, trie_end));
186    }
187    let trie_data = &input[trie_start..trie_end];
188    Ok(trie_data)
189}