sudachi/dic/
word_id.rs

1/*
2 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use crate::dic::lexicon_set::LexiconSetError;
18use crate::error::{SudachiError, SudachiResult};
19use std::fmt::{Debug, Display, Formatter};
20
/// Dictionary word ID.
///
/// Packs two values into a single `u32`: the dictionary ID in the upper
/// 4 bits and the word ID internal to that dictionary in the lower 28 bits.
///
/// * DicId 0 - system dictionary
/// * DicId 1 to 14 - reported as user dictionaries by `is_user`
/// * DicId 15 - OOV and other special nodes
///
/// Equality, ordering and hashing are derived, so they operate on the packed
/// `u32` value.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
#[repr(transparent)]
pub struct WordId {
    /// Packed representation: `(dic << 28) | word`.
    raw: u32,
}
31
32impl Debug for WordId {
33    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
34        Display::fmt(self, f)
35    }
36}
37
38impl Display for WordId {
39    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
40        let fmtdic = if self.is_oov() { -1 } else { self.dic() as i32 };
41        write!(f, "({}, {})", fmtdic, self.word())
42    }
43}
44
/// Mask selecting the lower 28 bits of the packed value, i.e. the word part.
const WORD_MASK: u32 = 0x0fff_ffff;
46
47impl WordId {
48    /// Create WordId from the compressed representation
49    pub const fn from_raw(raw: u32) -> WordId {
50        WordId { raw }
51    }
52
53    /// Create WordId from parts
54    pub fn new(dic: u8, word: u32) -> WordId {
55        debug_assert_eq!(word & (!WORD_MASK), 0);
56        debug_assert_eq!(dic & (!0xf), 0);
57        let dic_part = ((dic & 0xf) as u32) << 28;
58        let word_part = word & WORD_MASK;
59        let raw = dic_part | word_part;
60        Self::from_raw(raw)
61    }
62
63    /// Creates the WordId with correctness checking
64    pub fn checked(dic: u8, word: u32) -> SudachiResult<WordId> {
65        if dic & !0xf != 0 {
66            return Err(SudachiError::LexiconSetError(
67                LexiconSetError::TooLargeDictionaryId(dic as usize),
68            ));
69        }
70
71        if word & !WORD_MASK != 0 {
72            return Err(SudachiError::LexiconSetError(
73                LexiconSetError::TooLargeWordId(word, WORD_MASK as usize),
74            ));
75        }
76
77        Ok(Self::new(dic, word))
78    }
79
80    /// Creates an OOV node for pos_id
81    pub fn oov(pos_id: u32) -> WordId {
82        Self::new(0xf, pos_id)
83    }
84
85    /// Extract Dictionary ID
86    pub fn dic(&self) -> u8 {
87        (self.raw >> 28) as u8
88    }
89
90    /// Extract Word ID
91    pub fn word(&self) -> u32 {
92        self.raw & WORD_MASK
93    }
94
95    /// Check if the word comes from the system dictionary
96    pub fn is_system(&self) -> bool {
97        self.dic() == 0
98    }
99
100    /// Check if the word comes from the user dictionary
101    pub fn is_user(&self) -> bool {
102        !matches!(self.dic(), 0 | 0xf)
103    }
104
105    pub fn as_raw(&self) -> u32 {
106        self.raw
107    }
108
109    /// Check if the word is OOV
110    /// An OOV node can come of OOV handlers or be a special system node like BOS or EOS
111    pub fn is_oov(&self) -> bool {
112        self.dic() == 0xf
113    }
114
115    /// Checks if the WordId corresponds to a special node
116    pub fn is_special(&self) -> bool {
117        self >= &Self::EOS && self < &Self::INVALID
118    }
119
120    pub const INVALID: WordId = WordId::from_raw(0xffff_ffff);
121    pub const BOS: WordId = WordId::from_raw(0xffff_fffe);
122    pub const EOS: WordId = WordId::from_raw(0xffff_fffd);
123    pub const MAX_WORD: u32 = 0x0fff_ffff;
124}
125
#[cfg(test)]
mod test {
    use super::*;

    /// Builds a `WordId` from parts and checks that both parts are read back unchanged.
    fn assert_create(dic: u8, word: u32) {
        let id = WordId::new(dic, word);
        assert_eq!(dic, id.dic());
        assert_eq!(word, id.word());
    }

    #[test]
    fn create() {
        assert_create(0, 0);
        assert_create(0, 1);
        assert_create(0, 0x0fffffff);
        assert_create(14, 0x0fffffff);
        assert_create(1, 0);
        assert_create(1, 0x0fffffff);
        assert_create(15, 3121);
        assert_create(15, 0);
        assert_create(15, 0x0fffffff);
    }

    #[test]
    fn raw_round_trip() {
        assert_eq!(0x1000_0002, WordId::new(1, 2).as_raw());
        assert_eq!(0xdead_beef, WordId::from_raw(0xdead_beef).as_raw());
        assert!(WordId::from_raw(0x1000_0002) == WordId::new(1, 2));
    }

    #[test]
    fn checked_accepts_in_range() {
        assert!(matches!(WordId::checked(0, 0), Ok(id) if id == WordId::new(0, 0)));
        assert!(matches!(
            WordId::checked(14, WordId::MAX_WORD),
            Ok(id) if id == WordId::new(14, WordId::MAX_WORD)
        ));
    }

    #[test]
    fn checked_rejects_out_of_range() {
        // dictionary ID needs more than 4 bits
        assert!(WordId::checked(16, 0).is_err());
        assert!(WordId::checked(0xff, 0).is_err());
        // word ID needs more than 28 bits
        assert!(WordId::checked(0, WordId::MAX_WORD + 1).is_err());
        assert!(WordId::checked(0, u32::MAX).is_err());
    }

    #[test]
    fn max_word_matches_mask() {
        assert_eq!(0x0fff_ffff, WordId::MAX_WORD);
        assert_eq!(WORD_MASK, WordId::MAX_WORD);
    }

    #[test]
    fn oov() {
        let id = WordId::oov(7);
        assert_eq!(15, id.dic());
        assert_eq!(7, id.word());
        assert!(id.is_oov());
    }

    #[test]
    fn display() {
        let id1 = WordId::new(0, 521321);
        assert_eq!("(0, 521321)", format!("{}", id1));
    }

    #[test]
    fn display_oov() {
        // OOV IDs print -1 instead of the dictionary ID
        assert_eq!("(-1, 7)", format!("{}", WordId::oov(7)));
    }

    #[test]
    fn debug() {
        let id1 = WordId::new(0, 521321);
        assert_eq!("(0, 521321)", format!("{:?}", id1));
    }

    #[test]
    fn is_system() {
        assert!(WordId::new(0, 0).is_system());
        assert!(!WordId::new(1, 0).is_system());
        assert!(!WordId::new(14, 0).is_system());
        assert!(!WordId::new(15, 0).is_system());
    }

    #[test]
    fn is_user() {
        assert!(!WordId::new(0, 0).is_user());
        assert!(WordId::new(1, 0).is_user());
        assert!(WordId::new(14, 0).is_user());
        assert!(!WordId::new(15, 0).is_user());
    }

    #[test]
    fn is_oov() {
        assert!(!WordId::new(0, 0).is_oov());
        assert!(!WordId::new(1, 0).is_oov());
        assert!(!WordId::new(14, 0).is_oov());
        assert!(WordId::new(15, 0).is_oov());
    }

    #[test]
    fn is_special() {
        assert!(WordId::EOS.is_special());
        assert!(WordId::BOS.is_special());
        assert!(!WordId::INVALID.is_special());
        assert!(!WordId::new(0, 0).is_special());
    }
}