sudachi/
sentence_splitter.rs

1/*
2 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use crate::dic::lexicon_set::LexiconSet;
18use crate::sentence_detector::{NonBreakChecker, SentenceDetector};
19use std::ops::Range;
20
21pub trait SplitSentences {
22    fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b>;
23}
24
25pub struct SentenceIter<'s, 'x> {
26    splitter: &'x SentenceDetector,
27    checker: Option<&'x NonBreakChecker<'x>>,
28    data: &'s str,
29    position: usize,
30}
31
32impl<'s, 'x> Iterator for SentenceIter<'s, 'x> {
33    type Item = (Range<usize>, &'s str);
34
35    fn next(&mut self) -> Option<Self::Item> {
36        if self.position == self.data.len() {
37            return None;
38        }
39        let slice = &self.data[self.position..];
40        let rv = self.splitter.get_eos(slice, self.checker).unwrap();
41        let end = if rv < 0 {
42            self.data.len()
43        } else {
44            self.position + rv as usize
45        };
46
47        let range = self.position..end;
48        let real_slice = &self.data[range.clone()];
49        self.position = end;
50        Some((range, real_slice))
51    }
52}
53
54pub struct SentenceSplitter<'a> {
55    detector: SentenceDetector,
56    checker: Option<NonBreakChecker<'a>>,
57}
58
59impl Default for SentenceSplitter<'_> {
60    fn default() -> Self {
61        Self::new()
62    }
63}
64
65impl SentenceSplitter<'_> {
66    pub fn new() -> Self {
67        SentenceSplitter {
68            detector: SentenceDetector::new(),
69            checker: None,
70        }
71    }
72
73    pub fn with_limit(limit: usize) -> Self {
74        SentenceSplitter {
75            detector: SentenceDetector::with_limit(limit),
76            checker: None,
77        }
78    }
79
80    pub fn with_checker<'a>(self, lexicon: &'a LexiconSet<'a>) -> SentenceSplitter<'a> {
81        let checker = NonBreakChecker::new(lexicon);
82        SentenceSplitter {
83            detector: self.detector,
84            checker: Some(checker),
85        }
86    }
87}
88
89impl SplitSentences for SentenceSplitter<'_> {
90    fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b> {
91        SentenceIter {
92            data,
93            position: 0,
94            splitter: &self.detector,
95            checker: self.checker.as_ref(),
96        }
97    }
98}
99
#[cfg(test)]
mod tests {
    use super::*;

    /// Two sentences separated by an ideographic full stop; every character
    /// is 3 bytes in UTF-8, hence the 0..12 / 12..21 byte ranges.
    #[test]
    fn split_simple() {
        let splitter = SentenceSplitter::new();
        let mut iter = splitter.split("テスト。テスト");
        assert_eq!(iter.next(), Some((0..12, "テスト。")));
        assert_eq!(iter.next(), Some((12..21, "テスト")));
        assert_eq!(iter.next(), None);
    }

    /// Sentences that start with an ideographic space (U+3000); the first one
    /// ends with a fullwidth exclamation mark (U+FF01).
    ///
    /// Both characters are written as escapes on purpose: they are easily
    /// replaced by their ASCII look-alikes, which are 1 byte instead of 3 and
    /// would no longer match the expected byte ranges
    /// (3 + 11 * 3 + 3 = 39 and 3 + 11 * 3 = 36).
    #[test]
    fn split_longer_sentence() {
        let splitter = SentenceSplitter::new();
        let mut iter = splitter
            .split("\u{3000}振り返って見ると白い物\u{ff01}\u{3000}女が軒下で招いている。");
        assert_eq!(
            iter.next(),
            Some((0..39, "\u{3000}振り返って見ると白い物\u{ff01}"))
        );
        assert_eq!(
            iter.next(),
            Some((39..75, "\u{3000}女が軒下で招いている。"))
        );
        assert_eq!(iter.next(), None);
    }
}