sudachi/
sentence_detector.rs

1/*
2 * Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use fancy_regex::Regex;
18use lazy_static::lazy_static;
19use std::cmp::Ordering;
20
21use crate::dic::lexicon_set::LexiconSet;
22use crate::prelude::*;
23
24/// A checker for words that cross boundaries
25pub struct NonBreakChecker<'a> {
26    lexicon: &'a LexiconSet<'a>,
27    pub bos: usize,
28}
29impl<'a> NonBreakChecker<'a> {
30    pub fn new(lexicon: &'a LexiconSet<'a>) -> Self {
31        NonBreakChecker { lexicon, bos: 0 }
32    }
33}
34
35impl NonBreakChecker<'_> {
36    /// Returns whether there is a word that crosses the boundary
37
38    fn has_non_break_word(&self, input: &str, length: usize) -> bool {
39        // assume that SentenceDetector::get_eos called with self.input[self.bos..]
40        let eos_byte = self.bos + length;
41        let input_bytes = input.as_bytes();
42        const LOOKUP_BYTE_LENGTH: usize = 10 * 3; // 10 Japanese characters in UTF-8
43        let lookup_start = std::cmp::max(LOOKUP_BYTE_LENGTH, eos_byte) - LOOKUP_BYTE_LENGTH;
44        for i in lookup_start..eos_byte {
45            for entry in self.lexicon.lookup(input_bytes, i) {
46                let end_byte = entry.end;
47                // handling cases like モーニング娘。
48                match end_byte.cmp(&eos_byte) {
49                    // end is after than boundary candidate, this boundary is bad
50                    Ordering::Greater => return true,
51                    // end is on boundary candidate,
52                    // check that there are more than one character in the matched word
53                    Ordering::Equal => return input[i..].chars().take(2).count() > 1,
54                    _ => {}
55                }
56            }
57        }
58        false
59    }
60}
61
// Building blocks for the sentence-boundary regular expressions.
//
// Constants described as "character-class content" are spliced into `[...]`,
// so ASCII regex metacharacters in them must stay backslash-escaped; their
// full-width counterparts are listed literally next to them.

/// Sentence-ending punctuation, ASCII and full-width (character-class content).
const PERIODS: &str = "。？！♪…\\?\\!";
/// ASCII and full-width full stop (character-class content).
const DOT: &str = "\\.．";
/// A run of three or more middle dots (standalone sub-pattern).
const CDOTS: &str = "・{3,}";
/// ASCII, full-width and ideographic commas (character-class content).
const COMMA: &str = ",，、";
/// Two or more consecutive `<br>` / `<BR>` tags (standalone sub-pattern).
const BR_TAG: &str = "(<br>|<BR>){2,}";
/// ASCII and full-width alphanumerics plus kanji numerals (character-class content).
const ALPHABET_OR_NUMBER: &str = "a-zA-Z0-9ａ-ｚＡ-Ｚ０-９〇一二三四五六七八九十百千万億兆";
/// Opening brackets and quotes, ASCII and full-width (character-class content).
const OPEN_PARENTHESIS: &str = "\\(\\{｛\\[（「【『［≪〔“";
/// Closing brackets and quotes, ASCII and full-width (character-class content).
const CLOSE_PARENTHESIS: &str = "\\)\\}\\]）」｝】』］〕≫”";

/// Default maximum number of characters a `SentenceDetector` processes at once.
const DEFAULT_LIMIT: usize = 4096;
72
/// Detects sentence boundaries in text.
pub struct SentenceDetector {
    /// Upper bound on the number of characters examined in a single call.
    limit: usize,
}
78
79impl Default for SentenceDetector {
80    fn default() -> Self {
81        Self::new()
82    }
83}
84
85impl SentenceDetector {
86    pub fn new() -> Self {
87        SentenceDetector {
88            limit: DEFAULT_LIMIT,
89        }
90    }
91    pub fn with_limit(limit: usize) -> Self {
92        SentenceDetector { limit }
93    }
94
95    /// Returns the byte index of the detected end of the sentence.
96    ///
97    /// If NonBreakChecker is given, it is used to determine if there is a
98    /// word that crosses the detected boundary, and if so, the next boundary is
99    /// returned.
100    ///
101    /// If there is no boundary, this returns a relatively harmles boundary as a
102    /// negative value.
103    ///
104    /// # Examples
105    ///
106    /// ```
107    /// let sd = sudachi::sentence_detector::SentenceDetector::new();
108    /// assert_eq!(12, sd.get_eos("あいう。えお", None).unwrap());
109    /// assert_eq!(-15, sd.get_eos("あいうえお", None).unwrap());
110    /// ```
111    pub fn get_eos(&self, input: &str, checker: Option<&NonBreakChecker>) -> SudachiResult<isize> {
112        if input.is_empty() {
113            return Ok(0);
114        }
115
116        // handle at most self.limit chars at once
117        let s: String = input.chars().take(self.limit).collect();
118        let input_exceeds_limit = s.len() < input.len();
119
120        lazy_static! {
121            static ref SENTENCE_BREAKER: Regex = Regex::new(&format!(
122                "([{}]|{}+|(?<![{}])[{}](?![{}{}]))[{}{}]*|{}",
123                PERIODS,
124                CDOTS,
125                ALPHABET_OR_NUMBER,
126                DOT,
127                ALPHABET_OR_NUMBER,
128                COMMA,
129                DOT,
130                PERIODS,
131                BR_TAG
132            ))
133            .unwrap();
134            static ref ITEMIZE_HEADER: Regex =
135                Regex::new(&format!("^([{}])([{}])$", ALPHABET_OR_NUMBER, DOT)).unwrap();
136        }
137
138        for mat in SENTENCE_BREAKER.find_iter(&s) {
139            // check if we can split at the match
140            let mut eos = mat?.end();
141            if parenthesis_level(&s[..eos])? > 0 {
142                continue;
143            }
144            if eos < s.len() {
145                eos += prohibited_bos(&s[eos..])?;
146            }
147            if ITEMIZE_HEADER.is_match(&s)? {
148                continue;
149            }
150            if eos < s.len() && is_continuous_phrase(&s, eos)? {
151                continue;
152            }
153            if let Some(ck) = checker {
154                if ck.has_non_break_word(input, eos) {
155                    continue;
156                }
157            }
158            return Ok(eos as isize);
159        }
160
161        if input_exceeds_limit {
162            // search the final whitespace as a provisional split.
163            lazy_static! {
164                static ref SPACES: Regex = Regex::new(".+\\s+").unwrap();
165            }
166            if let Some(mat) = SPACES.find(&s)? {
167                return Ok(-(mat.end() as isize));
168            }
169        }
170
171        Ok(-(s.len() as isize))
172    }
173}
174
175/// Returns the count of non-closed open parentheses remaining at the end of input.
176fn parenthesis_level(s: &str) -> SudachiResult<usize> {
177    lazy_static! {
178        static ref PARENTHESIS: Regex = Regex::new(&format!(
179            "([{}])|([{}])",
180            OPEN_PARENTHESIS, CLOSE_PARENTHESIS
181        ))
182        .unwrap();
183    }
184    let mut level: usize = 0;
185    for caps in PARENTHESIS.captures_iter(s) {
186        if caps?.get(1).is_some() {
187            // open
188            level += 1;
189        } else {
190            level = level.saturating_sub(1);
191        }
192    }
193    Ok(level)
194}
195
196/// Returns a byte length of chars at the beggining of str, which cannot be a bos
197fn prohibited_bos(s: &str) -> SudachiResult<usize> {
198    lazy_static! {
199        static ref PROHIBITED_BOS: Regex = Regex::new(&format!(
200            "\\A([{}{}{}])+",
201            CLOSE_PARENTHESIS, COMMA, PERIODS
202        ))
203        .unwrap();
204    }
205
206    if let Some(mat) = PROHIBITED_BOS.find(s)? {
207        Ok(mat.end())
208    } else {
209        Ok(0)
210    }
211}
212
213// Returns if eos is the middle of phrase
214fn is_continuous_phrase(s: &str, eos: usize) -> SudachiResult<bool> {
215    lazy_static! {
216        static ref QUOTE_MARKER: Regex = Regex::new(&format!(
217            "(!|?|\\!|\\?|[{}])(と|っ|です)",
218            CLOSE_PARENTHESIS
219        ))
220        .unwrap();
221        static ref EOS_ITEMIZE_HEADER: Regex =
222            Regex::new(&format!("([{}])([{}])\\z", ALPHABET_OR_NUMBER, DOT)).unwrap();
223    }
224
225    // we can safely unwrap since eos > 0
226    let last_char_len = s[..eos].chars().last().unwrap().to_string().len();
227    if let Some(mat) = QUOTE_MARKER.find(&s[(eos - last_char_len)..])? {
228        if mat.start() == 0 {
229            return Ok(true);
230        }
231    }
232
233    // we can safely unwrap since eos < s.len()
234    let c = s[eos..].chars().next().unwrap();
235    Ok((c == 'と' || c == 'や' || c == 'の') && EOS_ITEMIZE_HEADER.is_match(&s[..eos])?)
236}
237
// Expected values are byte offsets: each kana, kanji or full-width character
// is 3 bytes in UTF-8, each ASCII character is 1 byte. A negative value means
// that no sentence boundary was found.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn get_eos() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいうえお。", None).unwrap(), 18);
        assert_eq!(sd.get_eos("あいう。えお。", None).unwrap(), 12);
        assert_eq!(sd.get_eos("あいう。。えお。", None).unwrap(), 15);
        assert_eq!(sd.get_eos("あいうえお", None).unwrap(), -15);
        assert_eq!(sd.get_eos("あいう えお。", None).unwrap(), 19);
        assert_eq!(sd.get_eos("あいう えお", None).unwrap(), -16);
        assert_eq!(sd.get_eos("", None).unwrap(), 0);
    }

    #[test]
    fn get_eos_with_limit() {
        let sd = SentenceDetector::with_limit(5);
        assert_eq!(sd.get_eos("あいうえおか。", None).unwrap(), -15);
        assert_eq!(sd.get_eos("あい。うえお。", None).unwrap(), 9);
        assert_eq!(sd.get_eos("あいうえ", None).unwrap(), -12);
        assert_eq!(sd.get_eos("あい うえお", None).unwrap(), -7);
        assert_eq!(sd.get_eos("あ い うえお", None).unwrap(), -8);
    }

    #[test]
    fn get_eos_with_period() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいう.えお", None).unwrap(), 10);
        assert_eq!(sd.get_eos("3.141", None).unwrap(), -5);
        assert_eq!(sd.get_eos("四百十.〇", None).unwrap(), -13);
    }

    #[test]
    fn get_eos_with_many_periods() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいうえお!??", None).unwrap(), 18);
    }

    #[test]
    fn get_eos_with_parentheses() {
        let sd = SentenceDetector::new();
        // The parentheses here are full-width (3 bytes each); the expected
        // offsets below depend on that.
        assert_eq!(sd.get_eos("あ（いう。え）お", None).unwrap(), -24);
        assert_eq!(sd.get_eos("（あ（いう）。え）お", None).unwrap(), -30);
        assert_eq!(sd.get_eos("あ（いう）。えお", None).unwrap(), 18);
    }

    #[test]
    fn get_eos_with_itemize_header() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("1. あいう。えお", None).unwrap(), 15);
    }

    #[test]
    fn get_eos_with_prohibited_bos() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいう?えお", None).unwrap(), 10);
        assert_eq!(sd.get_eos("あいう?)えお", None).unwrap(), 11);
        assert_eq!(sd.get_eos("あいう?,えお", None).unwrap(), 11);
    }

    #[test]
    fn get_eos_with_continuous_phrase() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいう?です。", None).unwrap(), 19);
        assert_eq!(sd.get_eos("あいう?って。", None).unwrap(), 19);
        assert_eq!(sd.get_eos("あいう?という。", None).unwrap(), 22);
        assert_eq!(sd.get_eos("あいう?の?です。", None).unwrap(), 10);

        assert_eq!(sd.get_eos("1.と2.が。", None).unwrap(), 13);
        assert_eq!(sd.get_eos("1.やb.から。", None).unwrap(), 16);
        assert_eq!(sd.get_eos("1.の12.が。", None).unwrap(), 14);
    }
}