sudachi/plugin/input_text/default_input_text/
mod.rs

1/*
2 * Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use std::collections::{HashMap, HashSet};
18use std::fs;
19use std::io::{BufRead, BufReader};
20use std::path::PathBuf;
21
22use aho_corasick::{
23    AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, Anchored, MatchKind, StartKind,
24};
25use serde::Deserialize;
26use serde_json::Value;
27use unicode_normalization::{is_nfkc_quick, IsNormalized, UnicodeNormalization};
28
29use crate::config::{Config, ConfigError};
30use crate::dic::grammar::Grammar;
31use crate::hash::RoMu;
32use crate::input_text::{InputBuffer, InputEditor};
33use crate::plugin::input_text::InputTextPlugin;
34use crate::prelude::*;
35
36#[cfg(test)]
37mod tests;
38
/// File name of the rewrite definition that is looked up when the plugin
/// settings do not provide `rewriteDef`.
const DEFAULT_REWRITE_DEF_FILE: &str = "rewrite.def";
/// Contents of the bundled `resources/rewrite.def`, embedded at compile time.
/// `set_up` parses these bytes when the rewrite definition path cannot be resolved.
const DEFAULT_REWRITE_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/rewrite.def");
41
/// Provides basic normalization of the input text
///
/// A `Default`-constructed value has empty tables and no automaton
/// (`checker` is `None`); `read_rewrite_lists` fills all fields in.
#[derive(Default)]
pub struct DefaultInputTextPlugin {
    /// Set of characters which are exempt from NFKC normalization
    ignore_normalize_set: HashSet<char, RoMu>,
    /// Mapping from the first character of a replacement key to the length
    /// (in chars) of the longest key starting with that character
    key_lengths: HashMap<char, usize>,
    /// Replacement mapping: key string -> replacement string
    replace_char_map: HashMap<String, String>,
    /// Automaton built over the keys of `replace_char_map`;
    /// finds the pieces of text that have to be replaced
    checker: Option<AhoCorasick>,
    /// Replacement strings, indexed by the pattern id reported by `checker`
    replacements: Vec<String>,
}
55
/// Plugin settings as they appear in the raw config json file.
// Field names mirror the JSON keys verbatim, hence the non-snake-case allowance.
#[allow(non_snake_case)]
#[derive(Deserialize)]
struct PluginSettings {
    /// Optional path of the rewrite definition file; when absent, `set_up`
    /// uses `DEFAULT_REWRITE_DEF_FILE` instead.
    rewriteDef: Option<PathBuf>,
}
62
63impl DefaultInputTextPlugin {
64    /// Loads rewrite definition
65    ///
66    /// Definition syntax:
67    ///     Ignored normalize:
68    ///         Each line contains a character
69    ///     Replace char list:
70    ///         Each line contains two strings separated by white spaces
71    ///         Plugin replaces the first by the second
72    ///         Same target string cannot be defined multiple times
73    ///     Empty or line starts with "#" will be ignored
74    fn read_rewrite_lists<T: BufRead>(&mut self, reader: T) -> SudachiResult<()> {
75        let mut ignore_normalize_set = HashSet::with_hasher(RoMu::new());
76        let mut key_lengths = HashMap::new();
77        let mut replace_char_map = HashMap::new();
78        for (i, line) in reader.lines().enumerate() {
79            let line = line?;
80            let line = line.trim();
81            if line.is_empty() || line.starts_with('#') {
82                continue;
83            }
84            let cols: Vec<_> = line.split_whitespace().collect();
85
86            // ignored normalize list
87            if cols.len() == 1 {
88                if cols[0].chars().count() != 1 {
89                    return Err(SudachiError::InvalidDataFormat(
90                        i,
91                        format!("{} is not character", cols[0]),
92                    ));
93                }
94                ignore_normalize_set.insert(cols[0].chars().next().unwrap());
95                continue;
96            }
97            // replace char list
98            if cols.len() == 2 {
99                if replace_char_map.contains_key(cols[0]) {
100                    return Err(SudachiError::InvalidDataFormat(
101                        i,
102                        format!("{} is already defined", cols[0]),
103                    ));
104                }
105                let first_char = cols[0].chars().next().unwrap();
106                let n_char = cols[0].chars().count();
107                if key_lengths.get(&first_char).copied().unwrap_or(0) < n_char {
108                    key_lengths.insert(first_char, n_char);
109                }
110                replace_char_map.insert(cols[0].to_string(), cols[1].to_string());
111                continue;
112            }
113            return Err(SudachiError::InvalidDataFormat(i, "".to_string()));
114        }
115
116        self.ignore_normalize_set = ignore_normalize_set;
117        self.key_lengths = key_lengths;
118        self.replace_char_map = replace_char_map;
119
120        let mut values: Vec<String> = Vec::new();
121        let mut keys: Vec<String> = Vec::new();
122
123        for (k, v) in self.replace_char_map.iter() {
124            keys.push(k.clone());
125            values.push(v.clone());
126        }
127
128        self.checker = Some(
129            AhoCorasickBuilder::new()
130                .kind(Some(AhoCorasickKind::DFA))
131                .match_kind(MatchKind::LeftmostLongest)
132                .start_kind(StartKind::Both)
133                .build(keys.clone())
134                .map_err(|e| {
135                    ConfigError::InvalidFormat(format!("failed to parse rewrite.def: {e:?}"))
136                })?,
137        );
138
139        self.replacements = values;
140
141        Ok(())
142    }
143
144    #[inline]
145    fn should_ignore(&self, ch: char) -> bool {
146        self.ignore_normalize_set.contains(&ch)
147    }
148
149    /// Fast case: lowercasing is not needed and the string is already in NFKC
150    /// Use AhoCorasick automaton to find all replacements and replace them
151    ///
152    /// Ignores are not used here, forced replacements have higher priority
153    /// Fast version does not need to walk every character!
154    fn replace_fast<'a>(
155        &'a self,
156        buffer: &InputBuffer,
157        mut replacer: InputEditor<'a>,
158    ) -> SudachiResult<InputEditor<'a>> {
159        let cur = buffer.current();
160        let checker = self.checker.as_ref().unwrap();
161
162        let ac_input = aho_corasick::Input::new(cur).anchored(Anchored::No);
163
164        for m in checker.find_iter(ac_input) {
165            let replacement = self.replacements[m.pattern()].as_str();
166            replacer.replace_ref(m.start()..m.end(), replacement);
167        }
168
169        Ok(replacer)
170    }
171
172    /// Slow case: need to handle lowercasing or NFKC normalization
173    /// Slow version needs to walk every character
174    fn replace_slow<'a>(
175        &'a self,
176        buffer: &InputBuffer,
177        mut replacer: InputEditor<'a>,
178    ) -> SudachiResult<InputEditor<'a>> {
179        let cur = buffer.current();
180        let checker = self.checker.as_ref().unwrap();
181        let mut min_offset = 0;
182
183        let mut ac_input = aho_corasick::Input::new(cur)
184            .anchored(Anchored::Yes)
185            .earliest(true);
186
187        for (offset, ch) in cur.char_indices() {
188            if offset < min_offset {
189                continue;
190            }
191            ac_input.set_start(offset);
192            // 1. replacement as defined by char.def
193            if let Some(m) = checker.find(ac_input.clone()) {
194                let range = m.range();
195                let replacement = self.replacements[m.pattern()].as_str();
196                min_offset = range.end;
197                replacer.replace_ref(range, replacement);
198                continue;
199            }
200
201            // 2. handle normalization
202            let need_lowercase = ch.is_uppercase();
203            let need_nkfc =
204                !self.should_ignore(ch) && is_nfkc_quick(std::iter::once(ch)) != IsNormalized::Yes;
205
206            // iterator types are incompatible, so calls can't be moved outside branches
207            match (need_lowercase, need_nkfc) {
208                //no need to do anything
209                (false, false) => continue,
210                // only lowercasing
211                (true, false) => {
212                    let chars = ch.to_lowercase();
213                    self.handle_normalization_slow(chars, &mut replacer, offset, ch.len_utf8(), ch)
214                }
215                // only normalization
216                (false, true) => {
217                    let chars = std::iter::once(ch).nfkc();
218                    self.handle_normalization_slow(chars, &mut replacer, offset, ch.len_utf8(), ch)
219                }
220                // both
221                (true, true) => {
222                    let chars = ch.to_lowercase().nfkc();
223                    self.handle_normalization_slow(chars, &mut replacer, offset, ch.len_utf8(), ch)
224                }
225            }
226        }
227        Ok(replacer)
228    }
229
230    fn handle_normalization_slow<'a, I: Iterator<Item = char>>(
231        &'a self,
232        mut data: I,
233        replacer: &mut InputEditor<'a>,
234        start: usize,
235        len: usize,
236        ch: char,
237    ) {
238        if let Some(ch2) = data.next() {
239            if ch2 != ch {
240                replacer.replace_char_iter(start..start + len, ch2, data)
241            }
242        }
243    }
244}
245
246impl InputTextPlugin for DefaultInputTextPlugin {
247    fn set_up(
248        &mut self,
249        settings: &Value,
250        config: &Config,
251        _grammar: &Grammar,
252    ) -> SudachiResult<()> {
253        let settings: PluginSettings = serde_json::from_value(settings.clone())?;
254
255        let rewrite_file_path = config.complete_path(
256            settings
257                .rewriteDef
258                .unwrap_or_else(|| DEFAULT_REWRITE_DEF_FILE.into()),
259        );
260
261        if rewrite_file_path.is_ok() {
262            let reader = BufReader::new(fs::File::open(rewrite_file_path?)?);
263            self.read_rewrite_lists(reader)?;
264        } else {
265            let reader = BufReader::new(DEFAULT_REWRITE_DEF_BYTES);
266            self.read_rewrite_lists(reader)?;
267        }
268
269        Ok(())
270    }
271
    /// Always returns `true`.
    // NOTE(review): `rewrite_impl` reads `buffer.current_chars()`, so this presumably tells
    // the caller that per-character data is required -- confirm against `InputTextPlugin`.
    fn uses_chars(&self) -> bool {
        true
    }
275
276    fn rewrite_impl<'a>(
277        &'a self,
278        buffer: &InputBuffer,
279        edit: InputEditor<'a>,
280    ) -> SudachiResult<InputEditor<'a>> {
281        let chars = buffer.current_chars();
282        let need_nkfc = is_nfkc_quick(chars.iter().cloned()) != IsNormalized::Yes;
283
284        let need_lowercase = chars.iter().any(|c| c.is_uppercase());
285
286        if need_nkfc || need_lowercase {
287            self.replace_slow(buffer, edit)
288        } else {
289            self.replace_fast(buffer, edit)
290        }
291    }
292}