sudachi/plugin/input_text/default_input_text/
mod.rs1use std::collections::{HashMap, HashSet};
18use std::fs;
19use std::io::{BufRead, BufReader};
20use std::path::PathBuf;
21
22use aho_corasick::{
23 AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, Anchored, MatchKind, StartKind,
24};
25use serde::Deserialize;
26use serde_json::Value;
27use unicode_normalization::{is_nfkc_quick, IsNormalized, UnicodeNormalization};
28
29use crate::config::{Config, ConfigError};
30use crate::dic::grammar::Grammar;
31use crate::hash::RoMu;
32use crate::input_text::{InputBuffer, InputEditor};
33use crate::plugin::input_text::InputTextPlugin;
34use crate::prelude::*;
35
36#[cfg(test)]
37mod tests;
38
39const DEFAULT_REWRITE_DEF_FILE: &str = "rewrite.def";
40const DEFAULT_REWRITE_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/rewrite.def");
41
42#[derive(Default)]
44pub struct DefaultInputTextPlugin {
45 ignore_normalize_set: HashSet<char, RoMu>,
47 key_lengths: HashMap<char, usize>,
49 replace_char_map: HashMap<String, String>,
51 checker: Option<AhoCorasick>,
53 replacements: Vec<String>,
54}
55
56#[allow(non_snake_case)]
58#[derive(Deserialize)]
59struct PluginSettings {
60 rewriteDef: Option<PathBuf>,
61}
62
63impl DefaultInputTextPlugin {
64 fn read_rewrite_lists<T: BufRead>(&mut self, reader: T) -> SudachiResult<()> {
75 let mut ignore_normalize_set = HashSet::with_hasher(RoMu::new());
76 let mut key_lengths = HashMap::new();
77 let mut replace_char_map = HashMap::new();
78 for (i, line) in reader.lines().enumerate() {
79 let line = line?;
80 let line = line.trim();
81 if line.is_empty() || line.starts_with('#') {
82 continue;
83 }
84 let cols: Vec<_> = line.split_whitespace().collect();
85
86 if cols.len() == 1 {
88 if cols[0].chars().count() != 1 {
89 return Err(SudachiError::InvalidDataFormat(
90 i,
91 format!("{} is not character", cols[0]),
92 ));
93 }
94 ignore_normalize_set.insert(cols[0].chars().next().unwrap());
95 continue;
96 }
97 if cols.len() == 2 {
99 if replace_char_map.contains_key(cols[0]) {
100 return Err(SudachiError::InvalidDataFormat(
101 i,
102 format!("{} is already defined", cols[0]),
103 ));
104 }
105 let first_char = cols[0].chars().next().unwrap();
106 let n_char = cols[0].chars().count();
107 if key_lengths.get(&first_char).copied().unwrap_or(0) < n_char {
108 key_lengths.insert(first_char, n_char);
109 }
110 replace_char_map.insert(cols[0].to_string(), cols[1].to_string());
111 continue;
112 }
113 return Err(SudachiError::InvalidDataFormat(i, "".to_string()));
114 }
115
116 self.ignore_normalize_set = ignore_normalize_set;
117 self.key_lengths = key_lengths;
118 self.replace_char_map = replace_char_map;
119
120 let mut values: Vec<String> = Vec::new();
121 let mut keys: Vec<String> = Vec::new();
122
123 for (k, v) in self.replace_char_map.iter() {
124 keys.push(k.clone());
125 values.push(v.clone());
126 }
127
128 self.checker = Some(
129 AhoCorasickBuilder::new()
130 .kind(Some(AhoCorasickKind::DFA))
131 .match_kind(MatchKind::LeftmostLongest)
132 .start_kind(StartKind::Both)
133 .build(keys.clone())
134 .map_err(|e| {
135 ConfigError::InvalidFormat(format!("failed to parse rewrite.def: {e:?}"))
136 })?,
137 );
138
139 self.replacements = values;
140
141 Ok(())
142 }
143
144 #[inline]
145 fn should_ignore(&self, ch: char) -> bool {
146 self.ignore_normalize_set.contains(&ch)
147 }
148
149 fn replace_fast<'a>(
155 &'a self,
156 buffer: &InputBuffer,
157 mut replacer: InputEditor<'a>,
158 ) -> SudachiResult<InputEditor<'a>> {
159 let cur = buffer.current();
160 let checker = self.checker.as_ref().unwrap();
161
162 let ac_input = aho_corasick::Input::new(cur).anchored(Anchored::No);
163
164 for m in checker.find_iter(ac_input) {
165 let replacement = self.replacements[m.pattern()].as_str();
166 replacer.replace_ref(m.start()..m.end(), replacement);
167 }
168
169 Ok(replacer)
170 }
171
172 fn replace_slow<'a>(
175 &'a self,
176 buffer: &InputBuffer,
177 mut replacer: InputEditor<'a>,
178 ) -> SudachiResult<InputEditor<'a>> {
179 let cur = buffer.current();
180 let checker = self.checker.as_ref().unwrap();
181 let mut min_offset = 0;
182
183 let mut ac_input = aho_corasick::Input::new(cur)
184 .anchored(Anchored::Yes)
185 .earliest(true);
186
187 for (offset, ch) in cur.char_indices() {
188 if offset < min_offset {
189 continue;
190 }
191 ac_input.set_start(offset);
192 if let Some(m) = checker.find(ac_input.clone()) {
194 let range = m.range();
195 let replacement = self.replacements[m.pattern()].as_str();
196 min_offset = range.end;
197 replacer.replace_ref(range, replacement);
198 continue;
199 }
200
201 let need_lowercase = ch.is_uppercase();
203 let need_nkfc =
204 !self.should_ignore(ch) && is_nfkc_quick(std::iter::once(ch)) != IsNormalized::Yes;
205
206 match (need_lowercase, need_nkfc) {
208 (false, false) => continue,
210 (true, false) => {
212 let chars = ch.to_lowercase();
213 self.handle_normalization_slow(chars, &mut replacer, offset, ch.len_utf8(), ch)
214 }
215 (false, true) => {
217 let chars = std::iter::once(ch).nfkc();
218 self.handle_normalization_slow(chars, &mut replacer, offset, ch.len_utf8(), ch)
219 }
220 (true, true) => {
222 let chars = ch.to_lowercase().nfkc();
223 self.handle_normalization_slow(chars, &mut replacer, offset, ch.len_utf8(), ch)
224 }
225 }
226 }
227 Ok(replacer)
228 }
229
230 fn handle_normalization_slow<'a, I: Iterator<Item = char>>(
231 &'a self,
232 mut data: I,
233 replacer: &mut InputEditor<'a>,
234 start: usize,
235 len: usize,
236 ch: char,
237 ) {
238 if let Some(ch2) = data.next() {
239 if ch2 != ch {
240 replacer.replace_char_iter(start..start + len, ch2, data)
241 }
242 }
243 }
244}
245
246impl InputTextPlugin for DefaultInputTextPlugin {
247 fn set_up(
248 &mut self,
249 settings: &Value,
250 config: &Config,
251 _grammar: &Grammar,
252 ) -> SudachiResult<()> {
253 let settings: PluginSettings = serde_json::from_value(settings.clone())?;
254
255 let rewrite_file_path = config.complete_path(
256 settings
257 .rewriteDef
258 .unwrap_or_else(|| DEFAULT_REWRITE_DEF_FILE.into()),
259 );
260
261 if rewrite_file_path.is_ok() {
262 let reader = BufReader::new(fs::File::open(rewrite_file_path?)?);
263 self.read_rewrite_lists(reader)?;
264 } else {
265 let reader = BufReader::new(DEFAULT_REWRITE_DEF_BYTES);
266 self.read_rewrite_lists(reader)?;
267 }
268
269 Ok(())
270 }
271
272 fn uses_chars(&self) -> bool {
273 true
274 }
275
276 fn rewrite_impl<'a>(
277 &'a self,
278 buffer: &InputBuffer,
279 edit: InputEditor<'a>,
280 ) -> SudachiResult<InputEditor<'a>> {
281 let chars = buffer.current_chars();
282 let need_nkfc = is_nfkc_quick(chars.iter().cloned()) != IsNormalized::Yes;
283
284 let need_lowercase = chars.iter().any(|c| c.is_uppercase());
285
286 if need_nkfc || need_lowercase {
287 self.replace_slow(buffer, edit)
288 } else {
289 self.replace_fast(buffer, edit)
290 }
291 }
292}