sudachi/analysis/
created.rs

1/*
2 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use std::cmp::min;
18
/// Integer type that stores the bits of [`CreatedWords`].
type Carrier = u64;

/// Compact record of which word lengths have already been created.
///
/// Lattice construction fills this bitmap and hands it to the OOV providers,
/// which can then test for a word of a given length with a single bit operation.
///
/// Every length of `MAX_VALUE` or more is folded into the same (highest) bit,
/// so for such words handlers have to fall back to the usual linear-time check.
#[repr(transparent)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct CreatedWords(Carrier);
29
/// Answer returned by `CreatedWords::has_word`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HasWord {
    /// The bit for the length is set and the length is below `CreatedWords::MAX_VALUE`.
    Yes,
    /// The bit for the length is not set.
    No,
    /// The bit is set, but the length is `CreatedWords::MAX_VALUE` or more,
    /// so the bit is shared with other lengths.
    Maybe,
}
36
37impl CreatedWords {
38    /// Maximum supported length of the word
39    pub const MAX_VALUE: Carrier = 64;
40    const MAX_SHIFT: Carrier = CreatedWords::MAX_VALUE - 1;
41
42    pub fn empty() -> CreatedWords {
43        Default::default()
44    }
45
46    pub fn single<Pos: Into<i64>>(length: Pos) -> CreatedWords {
47        let raw = length.into();
48        debug_assert!(raw > 0);
49        let raw = raw as Carrier;
50        let shift = min(raw.saturating_sub(1), CreatedWords::MAX_SHIFT);
51        let bits = (1 as Carrier) << shift;
52        CreatedWords(bits)
53    }
54
55    #[must_use]
56    pub fn add_word<P: Into<i64>>(&self, length: P) -> CreatedWords {
57        let mask = CreatedWords::single(length);
58        self.add(mask)
59    }
60
61    #[must_use]
62    pub fn add(&self, other: CreatedWords) -> CreatedWords {
63        CreatedWords(self.0 | other.0)
64    }
65
66    pub fn has_word<P: Into<i64> + Copy>(&self, length: P) -> HasWord {
67        let mask = CreatedWords::single(length);
68        if (self.0 & mask.0) == 0 {
69            HasWord::No
70        } else if length.into() >= CreatedWords::MAX_VALUE as _ {
71            HasWord::Maybe
72        } else {
73            HasWord::Yes
74        }
75    }
76
77    pub fn is_empty(&self) -> bool {
78        self.0 == 0
79    }
80
81    pub fn not_empty(&self) -> bool {
82        !self.is_empty()
83    }
84}
85
#[cfg(test)]
mod test {
    use super::*;

    /// A single recorded length is reported back as present.
    #[test]
    fn simple() {
        let created = CreatedWords::single(1);
        assert_eq!(created.has_word(1), HasWord::Yes);
    }

    /// Adding a second length keeps the first and does not invent others.
    #[test]
    fn add() {
        let created = CreatedWords::single(5).add_word(10);
        let expectations = [(5, HasWord::Yes), (10, HasWord::Yes), (15, HasWord::No)];
        for (length, expected) in expectations {
            assert_eq!(created.has_word(length), expected);
        }
    }

    /// A length beyond the bitset width lands in the shared top bit,
    /// which only ever answers `Maybe`.
    #[test]
    fn long_value_present() {
        let created = CreatedWords::single(100);
        let expectations = [(62, HasWord::No), (63, HasWord::No), (64, HasWord::Maybe)];
        for (length, expected) in expectations {
            assert_eq!(expected, created.has_word(length));
        }
    }

    /// A length below the bitset width leaves the neighbouring bits
    /// and the shared top bit clear.
    #[test]
    fn long_value_absent() {
        let created = CreatedWords::single(62);
        let expectations = [(62, HasWord::Yes), (63, HasWord::No), (64, HasWord::No)];
        for (length, expected) in expectations {
            assert_eq!(expected, created.has_word(length));
        }
    }
}