1use fancy_regex::Regex;
18use lazy_static::lazy_static;
19use std::cmp::Ordering;
20
21use crate::dic::lexicon_set::LexiconSet;
22use crate::prelude::*;
23
24pub struct NonBreakChecker<'a> {
26 lexicon: &'a LexiconSet<'a>,
27 pub bos: usize,
28}
29impl<'a> NonBreakChecker<'a> {
30 pub fn new(lexicon: &'a LexiconSet<'a>) -> Self {
31 NonBreakChecker { lexicon, bos: 0 }
32 }
33}
34
35impl NonBreakChecker<'_> {
36 fn has_non_break_word(&self, input: &str, length: usize) -> bool {
39 let eos_byte = self.bos + length;
41 let input_bytes = input.as_bytes();
42 const LOOKUP_BYTE_LENGTH: usize = 10 * 3; let lookup_start = std::cmp::max(LOOKUP_BYTE_LENGTH, eos_byte) - LOOKUP_BYTE_LENGTH;
44 for i in lookup_start..eos_byte {
45 for entry in self.lexicon.lookup(input_bytes, i) {
46 let end_byte = entry.end;
47 match end_byte.cmp(&eos_byte) {
49 Ordering::Greater => return true,
51 Ordering::Equal => return input[i..].chars().take(2).count() > 1,
54 _ => {}
55 }
56 }
57 }
58 false
59 }
60}
61
// The fragments below are spliced into regular expressions, most of them inside
// a `[...]` character class. Their full-width (U+FFxx) members are written as
// `\u{...}` escapes on purpose: when those characters get folded to their ASCII
// look-alikes the bracket sets end up containing bare `[` / `]`, which opens or
// closes the surrounding class early and corrupts the pattern.

/// Sentence-ending punctuation (character-class content): `。`, full-width
/// `?` and `!`, `♪`, `…`, and ASCII `?` / `!`.
const PERIODS: &str = "。\u{FF1F}\u{FF01}♪…\\?\\!";

/// ASCII `.` and full-width `.` (character-class content).
const DOT: &str = "\\.\u{FF0E}";

/// Three or more consecutive `・` (a complete sub-pattern, not class content).
const CDOTS: &str = "・{3,}";

/// ASCII `,`, full-width `,` and `、` (character-class content).
const COMMA: &str = ",\u{FF0C}、";

/// Two or more consecutive `<br>` / `<BR>` tags (a complete sub-pattern).
const BR_TAG: &str = "(<br>|<BR>){2,}";

/// ASCII and full-width Latin letters and digits plus kanji numerals
/// (character-class content).
const ALPHABET_OR_NUMBER: &str =
    "a-zA-Z0-9\u{FF41}-\u{FF5A}\u{FF21}-\u{FF3A}\u{FF10}-\u{FF19}〇一二三四五六七八九十百千万億兆";

/// Opening brackets and quotes (character-class content). ASCII brackets are
/// backslash-escaped; `{`, `(`, `[` are the full-width forms.
const OPEN_PARENTHESIS: &str = "\\(\\{\u{FF5B}\\[\u{FF08}「【『\u{FF3B}≪〔“";

/// Closing brackets and quotes (character-class content). ASCII brackets are
/// backslash-escaped; `)`, `}`, `]` are the full-width forms.
const CLOSE_PARENTHESIS: &str = "\\)\\}\\]\u{FF09}」\u{FF5D}】』\u{FF3D}〕≫”";

/// Number of characters examined per call by a detector built with
/// `SentenceDetector::new`.
const DEFAULT_LIMIT: usize = 4096;
72
/// Locates the end of the first sentence in a text.
pub struct SentenceDetector {
    /// Upper bound on the number of characters a single `get_eos` call examines.
    limit: usize,
}
78
79impl Default for SentenceDetector {
80 fn default() -> Self {
81 Self::new()
82 }
83}
84
85impl SentenceDetector {
86 pub fn new() -> Self {
87 SentenceDetector {
88 limit: DEFAULT_LIMIT,
89 }
90 }
91 pub fn with_limit(limit: usize) -> Self {
92 SentenceDetector { limit }
93 }
94
95 pub fn get_eos(&self, input: &str, checker: Option<&NonBreakChecker>) -> SudachiResult<isize> {
112 if input.is_empty() {
113 return Ok(0);
114 }
115
116 let s: String = input.chars().take(self.limit).collect();
118 let input_exceeds_limit = s.len() < input.len();
119
120 lazy_static! {
121 static ref SENTENCE_BREAKER: Regex = Regex::new(&format!(
122 "([{}]|{}+|(?<![{}])[{}](?![{}{}]))[{}{}]*|{}",
123 PERIODS,
124 CDOTS,
125 ALPHABET_OR_NUMBER,
126 DOT,
127 ALPHABET_OR_NUMBER,
128 COMMA,
129 DOT,
130 PERIODS,
131 BR_TAG
132 ))
133 .unwrap();
134 static ref ITEMIZE_HEADER: Regex =
135 Regex::new(&format!("^([{}])([{}])$", ALPHABET_OR_NUMBER, DOT)).unwrap();
136 }
137
138 for mat in SENTENCE_BREAKER.find_iter(&s) {
139 let mut eos = mat?.end();
141 if parenthesis_level(&s[..eos])? > 0 {
142 continue;
143 }
144 if eos < s.len() {
145 eos += prohibited_bos(&s[eos..])?;
146 }
147 if ITEMIZE_HEADER.is_match(&s)? {
148 continue;
149 }
150 if eos < s.len() && is_continuous_phrase(&s, eos)? {
151 continue;
152 }
153 if let Some(ck) = checker {
154 if ck.has_non_break_word(input, eos) {
155 continue;
156 }
157 }
158 return Ok(eos as isize);
159 }
160
161 if input_exceeds_limit {
162 lazy_static! {
164 static ref SPACES: Regex = Regex::new(".+\\s+").unwrap();
165 }
166 if let Some(mat) = SPACES.find(&s)? {
167 return Ok(-(mat.end() as isize));
168 }
169 }
170
171 Ok(-(s.len() as isize))
172 }
173}
174
175fn parenthesis_level(s: &str) -> SudachiResult<usize> {
177 lazy_static! {
178 static ref PARENTHESIS: Regex = Regex::new(&format!(
179 "([{}])|([{}])",
180 OPEN_PARENTHESIS, CLOSE_PARENTHESIS
181 ))
182 .unwrap();
183 }
184 let mut level: usize = 0;
185 for caps in PARENTHESIS.captures_iter(s) {
186 if caps?.get(1).is_some() {
187 level += 1;
189 } else {
190 level = level.saturating_sub(1);
191 }
192 }
193 Ok(level)
194}
195
196fn prohibited_bos(s: &str) -> SudachiResult<usize> {
198 lazy_static! {
199 static ref PROHIBITED_BOS: Regex = Regex::new(&format!(
200 "\\A([{}{}{}])+",
201 CLOSE_PARENTHESIS, COMMA, PERIODS
202 ))
203 .unwrap();
204 }
205
206 if let Some(mat) = PROHIBITED_BOS.find(s)? {
207 Ok(mat.end())
208 } else {
209 Ok(0)
210 }
211}
212
213fn is_continuous_phrase(s: &str, eos: usize) -> SudachiResult<bool> {
215 lazy_static! {
216 static ref QUOTE_MARKER: Regex = Regex::new(&format!(
217 "(!|?|\\!|\\?|[{}])(と|っ|です)",
218 CLOSE_PARENTHESIS
219 ))
220 .unwrap();
221 static ref EOS_ITEMIZE_HEADER: Regex =
222 Regex::new(&format!("([{}])([{}])\\z", ALPHABET_OR_NUMBER, DOT)).unwrap();
223 }
224
225 let last_char_len = s[..eos].chars().last().unwrap().to_string().len();
227 if let Some(mat) = QUOTE_MARKER.find(&s[(eos - last_char_len)..])? {
228 if mat.start() == 0 {
229 return Ok(true);
230 }
231 }
232
233 let c = s[eos..].chars().next().unwrap();
235 Ok((c == 'と' || c == 'や' || c == 'の') && EOS_ITEMIZE_HEADER.is_match(&s[..eos])?)
236}
237
#[cfg(test)]
mod tests {
    use super::*;

    // Expected values are byte offsets: kana, kanji and full-width characters
    // take 3 bytes each in UTF-8, ASCII characters take 1.

    #[test]
    fn get_eos() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいうえお。", None).unwrap(), 18);
        assert_eq!(sd.get_eos("あいう。えお。", None).unwrap(), 12);
        assert_eq!(sd.get_eos("あいう。。えお。", None).unwrap(), 15);
        assert_eq!(sd.get_eos("あいうえお", None).unwrap(), -15);
        assert_eq!(sd.get_eos("あいう えお。", None).unwrap(), 19);
        assert_eq!(sd.get_eos("あいう えお", None).unwrap(), -16);
        assert_eq!(sd.get_eos("", None).unwrap(), 0);
    }

    #[test]
    fn get_eos_with_limit() {
        let sd = SentenceDetector::with_limit(5);
        assert_eq!(sd.get_eos("あいうえおか。", None).unwrap(), -15);
        assert_eq!(sd.get_eos("あい。うえお。", None).unwrap(), 9);
        assert_eq!(sd.get_eos("あいうえ", None).unwrap(), -12);
        assert_eq!(sd.get_eos("あい うえお", None).unwrap(), -7);
        assert_eq!(sd.get_eos("あ い うえお", None).unwrap(), -8);
    }

    #[test]
    fn get_eos_with_period() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいう.えお", None).unwrap(), 10);
        assert_eq!(sd.get_eos("3.141", None).unwrap(), -5);
        assert_eq!(sd.get_eos("四百十.〇", None).unwrap(), -13);
    }

    #[test]
    fn get_eos_with_many_periods() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいうえお!??", None).unwrap(), 18);
    }

    #[test]
    fn get_eos_with_parentheses() {
        let sd = SentenceDetector::new();
        // The parentheses are the full-width forms (U+FF08 / U+FF09, 3 bytes
        // each); the expected byte offsets only add up with those.
        assert_eq!(
            sd.get_eos("あ\u{FF08}いう。え\u{FF09}お", None).unwrap(),
            -24
        );
        assert_eq!(
            sd.get_eos("\u{FF08}あ\u{FF08}いう\u{FF09}。え\u{FF09}お", None)
                .unwrap(),
            -30
        );
        assert_eq!(
            sd.get_eos("あ\u{FF08}いう\u{FF09}。えお", None).unwrap(),
            18
        );
    }

    #[test]
    fn get_eos_with_itemize_header() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("1. あいう。えお", None).unwrap(), 15);
    }

    #[test]
    fn get_eos_with_prohibited_bos() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいう?えお", None).unwrap(), 10);
        assert_eq!(sd.get_eos("あいう?)えお", None).unwrap(), 11);
        assert_eq!(sd.get_eos("あいう?,えお", None).unwrap(), 11);
    }

    #[test]
    fn get_eos_with_continuous_phrase() {
        let sd = SentenceDetector::new();
        assert_eq!(sd.get_eos("あいう?です。", None).unwrap(), 19);
        assert_eq!(sd.get_eos("あいう?って。", None).unwrap(), 19);
        assert_eq!(sd.get_eos("あいう?という。", None).unwrap(), 22);
        assert_eq!(sd.get_eos("あいう?の?です。", None).unwrap(), 10);

        assert_eq!(sd.get_eos("1.と2.が。", None).unwrap(), 13);
        assert_eq!(sd.get_eos("1.やb.から。", None).unwrap(), 16);
        assert_eq!(sd.get_eos("1.の12.が。", None).unwrap(), 14);
    }
}