sudachi/
sentence_splitter.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/*
 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

use crate::dic::lexicon_set::LexiconSet;
use crate::sentence_detector::{NonBreakChecker, SentenceDetector};
use std::ops::Range;

/// Something that can split a string into sentences.
pub trait SplitSentences {
    /// Returns an iterator over the sentences of `data`.
    ///
    /// The returned [`SentenceIter`] borrows both `self` (lifetime `'b`) and
    /// `data` (lifetime `'a`); its items are `(byte range, slice)` pairs.
    fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b>;
}

/// Iterator over the sentences of a string.
///
/// `'s` is the lifetime of the text being split, `'x` the lifetime of the
/// borrowed detector and checker.
pub struct SentenceIter<'s, 'x> {
    /// Detector asked for the end of the next sentence.
    splitter: &'x SentenceDetector,
    /// Optional checker that is forwarded to the detector on every call.
    checker: Option<&'x NonBreakChecker<'x>>,
    /// The complete input text.
    data: &'s str,
    /// Byte offset into `data` at which the next sentence starts.
    position: usize,
}

impl<'s, 'x> Iterator for SentenceIter<'s, 'x> {
    /// Byte range of the sentence inside the input, plus the sentence itself.
    type Item = (Range<usize>, &'s str);

    /// Yields the next sentence, or `None` once the whole input was consumed.
    ///
    /// # Panics
    ///
    /// Panics if the detector's `get_eos` call returns an error.
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.position;
        let total = self.data.len();
        if start == total {
            return None;
        }

        let eos = self
            .splitter
            .get_eos(&self.data[start..], self.checker)
            .unwrap();

        // A non-negative result is an offset relative to `start`; a negative
        // one makes the rest of the input a single sentence.
        let stop = if eos >= 0 {
            start + eos as usize
        } else {
            total
        };

        let sentence = &self.data[start..stop];
        self.position = stop;
        Some((start..stop, sentence))
    }
}

/// Sentence splitter that owns a [`SentenceDetector`] and, optionally, a
/// [`NonBreakChecker`] borrowing a lexicon for lifetime `'a`.
pub struct SentenceSplitter<'a> {
    /// Detector used to locate sentence ends.
    detector: SentenceDetector,
    /// Checker handed to the detector while splitting; `None` unless one was
    /// attached with `with_checker`.
    checker: Option<NonBreakChecker<'a>>,
}

impl Default for SentenceSplitter<'_> {
    fn default() -> Self {
        Self::new()
    }
}

impl SentenceSplitter<'_> {
    /// Creates a splitter backed by `SentenceDetector::new()` with no checker.
    pub fn new() -> Self {
        Self {
            detector: SentenceDetector::new(),
            checker: None,
        }
    }

    /// Creates a splitter backed by `SentenceDetector::with_limit(limit)` with
    /// no checker.
    pub fn with_limit(limit: usize) -> Self {
        Self {
            detector: SentenceDetector::with_limit(limit),
            checker: None,
        }
    }

    /// Consumes this splitter and returns one that keeps the same detector and
    /// uses a [`NonBreakChecker`] built from `lexicon`.
    pub fn with_checker<'a>(self, lexicon: &'a LexiconSet<'a>) -> SentenceSplitter<'a> {
        // Only the detector is carried over; any previous checker is dropped.
        let Self { detector, .. } = self;
        SentenceSplitter {
            checker: Some(NonBreakChecker::new(lexicon)),
            detector,
        }
    }
}

impl SplitSentences for SentenceSplitter<'_> {
    /// Returns an iterator over `data` that starts at byte offset 0 and
    /// borrows this splitter's detector and (if present) checker.
    fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b> {
        let splitter = &self.detector;
        let checker = self.checker.as_ref();
        SentenceIter {
            splitter,
            checker,
            data,
            position: 0,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Two sentences: one terminated by `。` and an unterminated tail.
    /// Every character here is 3 bytes in UTF-8, hence ranges 0..12 and 12..21.
    #[test]
    fn split_simple() {
        let splitter = SentenceSplitter::new();
        let mut iter = splitter.split("テスト。テスト");
        assert_eq!(iter.next(), Some((0..12, "テスト。")));
        assert_eq!(iter.next(), Some((12..21, "テスト")));
        assert_eq!(iter.next(), None);
    }

    /// Two sentences, each starting with an ideographic space.
    #[test]
    fn split_longer_sentence() {
        let splitter = SentenceSplitter::new();
        // The ideographic space (U+3000) and the full-width exclamation mark
        // (U+FF01) are written as escapes so they cannot be silently replaced
        // by their 1-byte ASCII look-alikes; the asserted byte ranges
        // (39 and 36 bytes) only hold for the 3-byte full-width characters.
        let text = "\u{3000}振り返って見ると白い物\u{ff01}\u{3000}女が軒下で招いている。";
        let mut iter = splitter.split(text);
        assert_eq!(
            iter.next(),
            Some((0..39, "\u{3000}振り返って見ると白い物\u{ff01}"))
        );
        assert_eq!(
            iter.next(),
            Some((39..75, "\u{3000}女が軒下で招いている。"))
        );
        assert_eq!(iter.next(), None);
    }
}