sudachi/
sentence_splitter.rs1use crate::dic::lexicon_set::LexiconSet;
18use crate::sentence_detector::{NonBreakChecker, SentenceDetector};
19use std::ops::Range;
20
21pub trait SplitSentences {
22 fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b>;
23}
24
25pub struct SentenceIter<'s, 'x> {
26 splitter: &'x SentenceDetector,
27 checker: Option<&'x NonBreakChecker<'x>>,
28 data: &'s str,
29 position: usize,
30}
31
32impl<'s, 'x> Iterator for SentenceIter<'s, 'x> {
33 type Item = (Range<usize>, &'s str);
34
35 fn next(&mut self) -> Option<Self::Item> {
36 if self.position == self.data.len() {
37 return None;
38 }
39 let slice = &self.data[self.position..];
40 let rv = self.splitter.get_eos(slice, self.checker).unwrap();
41 let end = if rv < 0 {
42 self.data.len()
43 } else {
44 self.position + rv as usize
45 };
46
47 let range = self.position..end;
48 let real_slice = &self.data[range.clone()];
49 self.position = end;
50 Some((range, real_slice))
51 }
52}
53
54pub struct SentenceSplitter<'a> {
55 detector: SentenceDetector,
56 checker: Option<NonBreakChecker<'a>>,
57}
58
59impl Default for SentenceSplitter<'_> {
60 fn default() -> Self {
61 Self::new()
62 }
63}
64
65impl SentenceSplitter<'_> {
66 pub fn new() -> Self {
67 SentenceSplitter {
68 detector: SentenceDetector::new(),
69 checker: None,
70 }
71 }
72
73 pub fn with_limit(limit: usize) -> Self {
74 SentenceSplitter {
75 detector: SentenceDetector::with_limit(limit),
76 checker: None,
77 }
78 }
79
80 pub fn with_checker<'a>(self, lexicon: &'a LexiconSet<'a>) -> SentenceSplitter<'a> {
81 let checker = NonBreakChecker::new(lexicon);
82 SentenceSplitter {
83 detector: self.detector,
84 checker: Some(checker),
85 }
86 }
87}
88
89impl SplitSentences for SentenceSplitter<'_> {
90 fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b> {
91 SentenceIter {
92 data,
93 position: 0,
94 splitter: &self.detector,
95 checker: self.checker.as_ref(),
96 }
97 }
98}
99
#[cfg(test)]
mod tests {
    use super::*;

    /// Two sentences separated by an ideographic full stop; the second one
    /// has no terminator and must still be returned.
    #[test]
    fn split_simple() {
        let splitter = SentenceSplitter::new();
        let mut iter = splitter.split("テスト。テスト");
        // Every character here is 3 bytes in UTF-8: 4 chars, then 3 chars.
        assert_eq!(iter.next(), Some((0..12, "テスト。")));
        assert_eq!(iter.next(), Some((12..21, "テスト")));
        assert_eq!(iter.next(), None);
    }

    /// Sentences that start with an ideographic space (U+3000) and where the
    /// first one ends with a full-width exclamation mark (U+FF01).
    #[test]
    fn split_longer_sentence() {
        let splitter = SentenceSplitter::new();
        // The full-width characters are written as escapes so they cannot be
        // silently normalised to ASCII space / '!'. The byte ranges below
        // (39 = 13 chars * 3 bytes, 36 = 12 chars * 3 bytes) only hold when
        // both are 3-byte characters.
        let mut iter = splitter.split(
            "\u{3000}振り返って見ると白い物\u{FF01}\u{3000}女が軒下で招いている。",
        );
        assert_eq!(
            iter.next(),
            Some((0..39, "\u{3000}振り返って見ると白い物\u{FF01}"))
        );
        assert_eq!(
            iter.next(),
            Some((39..75, "\u{3000}女が軒下で招いている。"))
        );
        assert_eq!(iter.next(), None)
    }
}
127}