sudachi/sentence_detector.rs

/*
 * Copyright (c) 2021-2024 Works Applications Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

use fancy_regex::Regex;
use lazy_static::lazy_static;
use std::cmp::Ordering;

use crate::dic::lexicon_set::LexiconSet;
use crate::prelude::*;

/// A checker for words that cross boundaries
pub struct NonBreakChecker<'a> {
    lexicon: &'a LexiconSet<'a>,
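    /// Byte offset in the full input at which the text passed to `get_eos` begins.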
    pub bos: usize,
}
impl<'a> NonBreakChecker<'a> {
    pub fn new(lexicon: &'a LexiconSet<'a>) -> Self {
        NonBreakChecker { lexicon, bos: 0 }
    }
}

impl NonBreakChecker<'_> {
    /// Returns whether there is a word that crosses the boundary
    fn has_non_break_word(&self, input: &str, length: usize) -> bool {
        // assume that SentenceDetector::get_eos is called with &input[self.bos..]
        let eos_byte = self.bos + length;
        let input_bytes = input.as_bytes();
        const LOOKUP_BYTE_LENGTH: usize = 10 * 3; // 10 Japanese characters in UTF-8
        let lookup_start = std::cmp::max(LOOKUP_BYTE_LENGTH, eos_byte) - LOOKUP_BYTE_LENGTH;
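        // check every dictionary lookup position in the window preceding the boundary candidate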
        for i in lookup_start..eos_byte {
            for entry in self.lexicon.lookup(input_bytes, i) {
                let end_byte = entry.end;
                // handling cases like モーニング娘。
                match end_byte.cmp(&eos_byte) {
                    // end is after the boundary candidate, this boundary is bad
                    Ordering::Greater => return true,
                    // end is on boundary candidate,
                    // check that there is more than one character in the matched word
                    Ordering::Equal => return input[i..].chars().take(2).count() > 1,
                    _ => {}
                }
            }
        }
        false
    }
}

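// Regex fragments (character sets and sub-patterns) used to build the boundary-detection
// patterns below; both ASCII and full-width forms are included.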
const PERIODS: &str = "。？！♪…\\?\\!";
const DOT: &str = "\\.．";
const CDOTS: &str = "・{3,}";
const COMMA: &str = ",，、";
const BR_TAG: &str = "(<br>|<BR>){2,}";
const ALPHABET_OR_NUMBER: &str = "a-zA-Z0-9ａ-ｚＡ-Ｚ０-９〇一二三四五六七八九十百千万億兆";
const OPEN_PARENTHESIS: &str = "\\(\\{｛\\[（「【『［≪〔“";
const CLOSE_PARENTHESIS: &str = "\\)\\}\\]）」｝】』］〕≫”";

const DEFAULT_LIMIT: usize = 4096;

/// A sentence boundary detector
pub struct SentenceDetector {
    // The maximum number of characters processed at once
    limit: usize,
}

impl Default for SentenceDetector {
    fn default() -> Self {
        Self::new()
    }
}

impl SentenceDetector {
    pub fn new() -> Self {
        SentenceDetector {
            limit: DEFAULT_LIMIT,
        }
    }
    pub fn with_limit(limit: usize) -> Self {
        SentenceDetector { limit }
    }

    /// Returns the byte index of the detected end of the sentence.
    ///
    /// If NonBreakChecker is given, it is used to determine if there is a
    /// word that crosses the detected boundary, and if so, the next boundary is
    /// returned.
    ///
    /// If there is no boundary, this returns a relatively harmless boundary as a
    /// negative value.
    ///
    /// # Examples
    ///
    /// ```
    /// let sd = sudachi::sentence_detector::SentenceDetector::new();
    /// assert_eq!(12, sd.get_eos("あいう。えお", None).unwrap());
    /// assert_eq!(-15, sd.get_eos("あいうえお", None).unwrap());
    /// ```
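    ///
    /// As a rough sketch (not part of the original documentation), a whole text can be
    /// split into sentences by calling `get_eos` repeatedly and advancing past each
    /// returned boundary, taking the absolute value of provisional (negative) results:
    ///
    /// ```
    /// let sd = sudachi::sentence_detector::SentenceDetector::new();
    /// let text = "あいう。えお。";
    /// let mut start = 0;
    /// let mut sentences = Vec::new();
    /// while start < text.len() {
    ///     let eos = sd.get_eos(&text[start..], None).unwrap();
    ///     // negative values mark provisional boundaries; use the absolute value
    ///     let end = start + eos.unsigned_abs();
    ///     sentences.push(&text[start..end]);
    ///     start = end;
    /// }
    /// assert_eq!(sentences, vec!["あいう。", "えお。"]);
    /// ```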
    pub fn get_eos(&self, input: &str, checker: Option<&NonBreakChecker>) -> SudachiResult<isize> {
        if input.is_empty() {
            return Ok(0);
        }

        // handle at most self.limit chars at once
        let s: String = input.chars().take(self.limit).collect();
        let input_exceeds_limit = s.len() < input.len();

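        // SENTENCE_BREAKER matches a boundary candidate: sentence-final punctuation, a run of
        // middle dots, or a period that is neither preceded by an alphanumeric nor followed by
        // an alphanumeric or comma, each optionally followed by further periods; alternatively,
        // two or more consecutive <br> tags.
        // ITEMIZE_HEADER matches a candidate that is just a list header such as "1.", which is
        // skipped rather than treated as a sentence of its own.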
        lazy_static! {
            static ref SENTENCE_BREAKER: Regex = Regex::new(&format!(
                "([{}]|{}+|(?<![{}])[{}](?![{}{}]))[{}{}]*|{}",
                PERIODS,
                CDOTS,
                ALPHABET_OR_NUMBER,
                DOT,
                ALPHABET_OR_NUMBER,
                COMMA,
                DOT,
                PERIODS,
                BR_TAG
            ))
            .unwrap();
            static ref ITEMIZE_HEADER: Regex =
                Regex::new(&format!("^([{}])([{}])$", ALPHABET_OR_NUMBER, DOT)).unwrap();
        }

        for mat in SENTENCE_BREAKER.find_iter(&s) {
            // check if we can split at the match
            let mut eos = mat?.end();
            if parenthesis_level(&s[..eos])? > 0 {
                continue;
            }
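            // do not start the next sentence with characters that cannot begin one
            // (closing brackets, commas, periods): extend the boundary past them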
            if eos < s.len() {
                eos += prohibited_bos(&s[eos..])?;
            }
            if ITEMIZE_HEADER.is_match(&s[..eos])? {
                continue;
            }
            if eos < s.len() && is_continuous_phrase(&s, eos)? {
                continue;
            }
            if let Some(ck) = checker {
                if ck.has_non_break_word(input, eos) {
                    continue;
                }
            }
            return Ok(eos as isize);
        }

        if input_exceeds_limit {
            // fall back to the last whitespace as a provisional split point.
            lazy_static! {
                static ref SPACES: Regex = Regex::new(".+\\s+").unwrap();
            }
            if let Some(mat) = SPACES.find(&s)? {
                return Ok(-(mat.end() as isize));
            }
        }

        Ok(-(s.len() as isize))
    }
}

/// Returns the count of non-closed open parentheses remaining at the end of input.
fn parenthesis_level(s: &str) -> SudachiResult<usize> {
    lazy_static! {
        static ref PARENTHESIS: Regex = Regex::new(&format!(
            "([{}])|([{}])",
            OPEN_PARENTHESIS, CLOSE_PARENTHESIS
        ))
        .unwrap();
    }
    let mut level: usize = 0;
    for caps in PARENTHESIS.captures_iter(s) {
        if caps?.get(1).is_some() {
            // open
            level += 1;
        } else {
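            // close parenthesis: step back out one nesting level, clamping at zero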
            level = level.saturating_sub(1);
        }
    }
    Ok(level)
}

/// Returns the byte length of the leading chars of `s` that cannot begin a sentence (cannot be a BOS)
fn prohibited_bos(s: &str) -> SudachiResult<usize> {
    lazy_static! {
        static ref PROHIBITED_BOS: Regex = Regex::new(&format!(
            "\\A([{}{}{}])+",
            CLOSE_PARENTHESIS, COMMA, PERIODS
        ))
        .unwrap();
    }

    if let Some(mat) = PROHIBITED_BOS.find(s)? {
        Ok(mat.end())
    } else {
        Ok(0)
    }
}

/// Returns whether the boundary candidate at `eos` falls in the middle of a phrase
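///
/// This covers two cases: an exclamation/question mark or closing bracket immediately
/// followed by と, っ, or です (the quotation continues), and an itemize header such as
/// "1." followed by と, や, or の (the enumeration continues).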
fn is_continuous_phrase(s: &str, eos: usize) -> SudachiResult<bool> {
    lazy_static! {
        static ref QUOTE_MARKER: Regex = Regex::new(&format!(
            "(!|?|\\!|\\?|[{}])(と|っ|です)",
            CLOSE_PARENTHESIS
        ))
        .unwrap();
        static ref EOS_ITEMIZE_HEADER: Regex =
            Regex::new(&format!("([{}])([{}])\\z", ALPHABET_OR_NUMBER, DOT)).unwrap();
    }

    // we can safely unwrap since eos > 0
    let last_char_len = s[..eos].chars().last().unwrap().to_string().len();
    if let Some(mat) = QUOTE_MARKER.find(&s[(eos - last_char_len)..])? {
        if mat.start() == 0 {
            return Ok(true);
        }
    }

    // we can safely unwrap since eos < s.len()
    let c = s[eos..].chars().next().unwrap();
    Ok((c == 'と' || c == 'や' || c == 'の') && EOS_ITEMIZE_HEADER.is_match(&s[..eos])?)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn get_eos() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいうえお。", None).unwrap(), 18);
        assert_eq!(sd.get_eos("あいう。えお。", None).unwrap(), 12);
        assert_eq!(sd.get_eos("あいう。。えお。", None).unwrap(), 15);
        assert_eq!(sd.get_eos("あいうえお", None).unwrap(), -15);
        assert_eq!(sd.get_eos("あいう えお。", None).unwrap(), 19);
        assert_eq!(sd.get_eos("あいう えお", None).unwrap(), -16);
        assert_eq!(sd.get_eos("", None).unwrap(), 0);
    }

    #[test]
    fn get_eos_with_limit() {
        let sd = SentenceDetector::with_limit(5);
        assert_eq!(sd.get_eos("あいうえおか。", None).unwrap(), -15);
        assert_eq!(sd.get_eos("あい。うえお。", None).unwrap(), 9);
        assert_eq!(sd.get_eos("あいうえ", None).unwrap(), -12);
        assert_eq!(sd.get_eos("あい うえお", None).unwrap(), -7);
        assert_eq!(sd.get_eos("あ い うえお", None).unwrap(), -8);
    }

    #[test]
    fn get_eos_with_period() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいう.えお", None).unwrap(), 10);
        assert_eq!(sd.get_eos("3.141", None).unwrap(), -5);
        assert_eq!(sd.get_eos("四百十.〇", None).unwrap(), -13);
    }

    #[test]
    fn get_eos_with_many_periods() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいうえお!??", None).unwrap(), 18);
    }

    #[test]
    fn get_eos_with_parentheses() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あ(いう。え)お", None).unwrap(), -24);
        assert_eq!(sd.get_eos("(あ(いう)。え)お", None).unwrap(), -30);
        assert_eq!(sd.get_eos("あ(いう)。えお", None).unwrap(), 18);
    }

    #[test]
    fn get_eos_with_itemize_header() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("1. あいう。えお", None).unwrap(), 15);
    }

    #[test]
    fn get_eos_with_prohibited_bos() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいう?えお", None).unwrap(), 10);
        assert_eq!(sd.get_eos("あいう?)えお", None).unwrap(), 11);
        assert_eq!(sd.get_eos("あいう?,えお", None).unwrap(), 11);
    }

    #[test]
    fn get_eos_with_continuous_phrase() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいう?です。", None).unwrap(), 19);
        assert_eq!(sd.get_eos("あいう?って。", None).unwrap(), 19);
        assert_eq!(sd.get_eos("あいう?という。", None).unwrap(), 22);
        assert_eq!(sd.get_eos("あいう?の?です。", None).unwrap(), 10);

        assert_eq!(sd.get_eos("1.と2.が。", None).unwrap(), 13);
        assert_eq!(sd.get_eos("1.やb.から。", None).unwrap(), 16);
        assert_eq!(sd.get_eos("1.の12.が。", None).unwrap(), 14);
    }
}