// sudachi/plugin/input_text/default_input_text/mod.rs
use std::collections::{HashMap, HashSet};
use std::fs;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use aho_corasick::{
AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, Anchored, MatchKind, StartKind,
};
use serde::Deserialize;
use serde_json::Value;
use unicode_normalization::{is_nfkc_quick, IsNormalized, UnicodeNormalization};
use crate::config::{Config, ConfigError};
use crate::dic::grammar::Grammar;
use crate::hash::RoMu;
use crate::input_text::{InputBuffer, InputEditor};
use crate::plugin::input_text::InputTextPlugin;
use crate::prelude::*;
#[cfg(test)]
mod tests;
/// File name handed to `Config::complete_path` when the plugin settings do
/// not provide `rewriteDef`.
const DEFAULT_REWRITE_DEF_FILE: &str = "rewrite.def";
/// Contents of the bundled `rewrite.def`, embedded at compile time. `set_up`
/// parses these bytes when the rewrite definition path cannot be resolved.
const DEFAULT_REWRITE_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/rewrite.def");
/// Input text plugin that rewrites the input buffer using the rules of a
/// rewrite definition, plus lowercasing and NFKC normalization.
///
/// A value produced by `Default` has no matcher (`checker` is `None`); the
/// tables are filled by `read_rewrite_lists`, which `set_up` calls.
#[derive(Default)]
pub struct DefaultInputTextPlugin {
/// Characters that appeared alone on a line of the rewrite definition.
/// NFKC normalization is skipped for them.
ignore_normalize_set: HashSet<char, RoMu>,
/// For each first character of a replacement key, the largest key length
/// (counted in characters) among the keys starting with that character.
key_lengths: HashMap<char, usize>,
/// Replacement rules exactly as read: key text -> replacement text.
replace_char_map: HashMap<String, String>,
/// Aho-Corasick automaton built from the keys of `replace_char_map`.
checker: Option<AhoCorasick>,
/// Replacement texts, indexed by the pattern id reported by `checker`.
replacements: Vec<String>,
}
/// Settings of this plugin as deserialized from the plugin's JSON settings.
// The field keeps its camelCase spelling because the derived `Deserialize`
// impl uses the field name as the JSON key; hence the lint is silenced.
#[allow(non_snake_case)]
#[derive(Deserialize)]
struct PluginSettings {
/// Optional path of the rewrite definition file. When absent, `set_up`
/// falls back to `DEFAULT_REWRITE_DEF_FILE`.
rewriteDef: Option<PathBuf>,
}
impl DefaultInputTextPlugin {
/// Parses a rewrite definition and replaces the plugin's rewrite tables.
///
/// Lines are trimmed; empty lines and lines starting with `#` are skipped.
/// Every other line is split on whitespace:
///
/// * one column: a single character for which NFKC normalization is skipped;
/// * two columns: a key and the text that replaces it.
///
/// The fields of `self` are overwritten only after the whole input has been
/// parsed and the matcher has been built, so a failed call leaves the
/// previously loaded tables untouched and consistent with each other.
///
/// # Errors
///
/// * Any I/O error raised while reading `reader`.
/// * `SudachiError::InvalidDataFormat`, carrying the 0-based line index, when
///   a single-column entry is not exactly one character, when a key is
///   defined twice, or when a line has more than two columns.
/// * `ConfigError::InvalidFormat` (converted by `?`) when the Aho-Corasick
///   automaton cannot be built from the keys.
fn read_rewrite_lists<T: BufRead>(&mut self, reader: T) -> SudachiResult<()> {
    let mut ignore_normalize_set = HashSet::with_hasher(RoMu::new());
    let mut key_lengths: HashMap<char, usize> = HashMap::new();
    let mut replace_char_map: HashMap<String, String> = HashMap::new();

    for (i, line) in reader.lines().enumerate() {
        let line = line?;
        let line = line.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        let cols: Vec<&str> = line.split_whitespace().collect();
        match cols.as_slice() {
            // Ignore-normalize entry: must be exactly one character.
            [entry] => {
                let mut chars = entry.chars();
                match (chars.next(), chars.next()) {
                    (Some(ch), None) => {
                        ignore_normalize_set.insert(ch);
                    }
                    _ => {
                        return Err(SudachiError::InvalidDataFormat(
                            i,
                            format!("{} is not character", entry),
                        ));
                    }
                }
            }
            // Replacement rule: key -> replacement.
            [key, replacement] => {
                if replace_char_map.contains_key(*key) {
                    return Err(SudachiError::InvalidDataFormat(
                        i,
                        format!("{} is already defined", key),
                    ));
                }
                // `split_whitespace` never yields an empty token, so the key
                // always has a first character.
                if let Some(first_char) = key.chars().next() {
                    let longest = key_lengths.entry(first_char).or_insert(0);
                    *longest = (*longest).max(key.chars().count());
                }
                replace_char_map.insert((*key).to_owned(), (*replacement).to_owned());
            }
            _ => {
                return Err(SudachiError::InvalidDataFormat(
                    i,
                    format!("expected 1 or 2 columns, found {}", cols.len()),
                ));
            }
        }
    }

    // Keys and values are taken from the same iteration over the map, so the
    // pattern id reported by the automaton indexes `values` directly.
    let (keys, values): (Vec<&str>, Vec<String>) = replace_char_map
        .iter()
        .map(|(key, value)| (key.as_str(), value.clone()))
        .unzip();

    let checker = AhoCorasickBuilder::new()
        .kind(Some(AhoCorasickKind::DFA))
        .match_kind(MatchKind::LeftmostLongest)
        .start_kind(StartKind::Both)
        .build(keys)
        .map_err(|e| {
            ConfigError::InvalidFormat(format!("failed to parse rewrite.def: {e:?}"))
        })?;

    // Nothing can fail past this point: commit all tables together.
    self.ignore_normalize_set = ignore_normalize_set;
    self.key_lengths = key_lengths;
    self.replace_char_map = replace_char_map;
    self.checker = Some(checker);
    self.replacements = values;
    Ok(())
}
/// Returns `true` when `ch` is a member of `ignore_normalize_set`, i.e. it was
/// listed alone on a line of the rewrite definition. `replace_slow` uses this
/// to skip NFKC normalization for the character.
#[inline]
fn should_ignore(&self, ch: char) -> bool {
self.ignore_normalize_set.contains(&ch)
}
/// Applies only the replacement rules to the current buffer text.
///
/// The whole text is scanned once with an unanchored search and every match
/// is recorded in `replacer` together with its replacement. No lowercasing or
/// NFKC normalization is performed on this path.
///
/// # Panics
///
/// Panics if the matcher has not been built yet (`checker` is `None`).
fn replace_fast<'a>(
    &'a self,
    buffer: &InputBuffer,
    mut replacer: InputEditor<'a>,
) -> SudachiResult<InputEditor<'a>> {
    let haystack = aho_corasick::Input::new(buffer.current()).anchored(Anchored::No);
    let automaton = self.checker.as_ref().unwrap();

    for hit in automaton.find_iter(haystack) {
        // The pattern id doubles as the index into the replacement table.
        let substitute = self.replacements[hit.pattern()].as_str();
        replacer.replace_ref(hit.range(), substitute);
    }

    Ok(replacer)
}
/// Rewrites the current buffer text character by character.
///
/// At every character boundary that was not consumed by a previous rule
/// match, an anchored search for a replacement key is tried first. When no
/// key starts at that position, the character is lowercased if it is
/// uppercase and NFKC-normalized unless it is in the ignore set; an edit is
/// recorded only when the result differs from the original character.
///
/// # Panics
///
/// Panics if the matcher has not been built yet (`checker` is `None`).
fn replace_slow<'a>(
    &'a self,
    buffer: &InputBuffer,
    mut replacer: InputEditor<'a>,
) -> SudachiResult<InputEditor<'a>> {
    let cur = buffer.current();
    let checker = self.checker.as_ref().unwrap();
    // Byte offset up to which the text was already consumed by a rule match.
    let mut min_offset = 0;

    // Deliberately NOT an "earliest" search: with `earliest(true)` the
    // automaton reports the first key that matches, i.e. the shortest one,
    // which contradicts the leftmost-longest semantics the automaton is built
    // with and the result `replace_fast` produces for the same keys.
    let mut ac_input = aho_corasick::Input::new(cur).anchored(Anchored::Yes);

    for (offset, ch) in cur.char_indices() {
        if offset < min_offset {
            continue;
        }

        // 1. Replacement rules take priority over normalization.
        ac_input.set_start(offset);
        if let Some(m) = checker.find(ac_input.clone()) {
            let range = m.range();
            let replacement = self.replacements[m.pattern()].as_str();
            min_offset = range.end;
            replacer.replace_ref(range, replacement);
            continue;
        }

        // 2. Lowercasing and/or NFKC normalization of the single character.
        let need_lowercase = ch.is_uppercase();
        let need_nfkc =
            !self.should_ignore(ch) && is_nfkc_quick(std::iter::once(ch)) != IsNormalized::Yes;
        let len = ch.len_utf8();
        match (need_lowercase, need_nfkc) {
            (false, false) => continue,
            (true, false) => {
                let chars = ch.to_lowercase();
                self.handle_normalization_slow(chars, &mut replacer, offset, len, ch)
            }
            (false, true) => {
                let chars = std::iter::once(ch).nfkc();
                self.handle_normalization_slow(chars, &mut replacer, offset, len, ch)
            }
            (true, true) => {
                let chars = ch.to_lowercase().nfkc();
                self.handle_normalization_slow(chars, &mut replacer, offset, len, ch)
            }
        }
    }

    Ok(replacer)
}
/// Records the normalized form of one character in `replacer`.
///
/// `data` yields the normalized characters of `ch`, which occupies the bytes
/// `start..start + len` of the text. Nothing is recorded when `data` is empty
/// or when its first character equals `ch`; otherwise the range is replaced
/// by the first character followed by the rest of `data`.
fn handle_normalization_slow<'a, I: Iterator<Item = char>>(
    &'a self,
    mut data: I,
    replacer: &mut InputEditor<'a>,
    start: usize,
    len: usize,
    ch: char,
) {
    match data.next() {
        Some(head) if head != ch => {
            let span = start..start + len;
            replacer.replace_char_iter(span, head, data)
        }
        _ => {}
    }
}
}
impl InputTextPlugin for DefaultInputTextPlugin {
fn set_up(
&mut self,
settings: &Value,
config: &Config,
_grammar: &Grammar,
) -> SudachiResult<()> {
let settings: PluginSettings = serde_json::from_value(settings.clone())?;
let rewrite_file_path = config.complete_path(
settings
.rewriteDef
.unwrap_or_else(|| DEFAULT_REWRITE_DEF_FILE.into()),
);
if rewrite_file_path.is_ok() {
let reader = BufReader::new(fs::File::open(rewrite_file_path?)?);
self.read_rewrite_lists(reader)?;
} else {
let reader = BufReader::new(DEFAULT_REWRITE_DEF_BYTES);
self.read_rewrite_lists(reader)?;
}
Ok(())
}
/// Always returns `true`.
// NOTE(review): presumably this tells the caller that the plugin needs the
// buffer's character view (`rewrite_impl` reads `buffer.current_chars()`);
// confirm against the `InputTextPlugin` trait documentation.
fn uses_chars(&self) -> bool {
true
}
/// Chooses the rewrite strategy for the current buffer content.
///
/// When the quick NFKC check reports the text as normalized and no character
/// is uppercase, only the replacement rules are applied (`replace_fast`).
/// In every other case the per-character path (`replace_slow`) is taken.
fn rewrite_impl<'a>(
    &'a self,
    buffer: &InputBuffer,
    edit: InputEditor<'a>,
) -> SudachiResult<InputEditor<'a>> {
    let chars = buffer.current_chars();
    let is_normalized = is_nfkc_quick(chars.iter().copied()) == IsNormalized::Yes;
    let has_uppercase = chars.iter().any(|c| c.is_uppercase());

    if is_normalized && !has_uppercase {
        return self.replace_fast(buffer, edit);
    }
    self.replace_slow(buffer, edit)
}
}