1use std::convert::TryFrom;
18use std::env::current_exe;
19use std::fs::File;
20use std::io::BufReader;
21use std::path::{Path, PathBuf};
22
23use crate::dic::subset::InfoSubset;
24use crate::error::SudachiError;
25use lazy_static::lazy_static;
26use serde::Deserialize;
27use serde_json::Value;
28use thiserror::Error;
29
/// Name of the resource directory; `default_resource_dir` appends it to the
/// parent of the crate manifest directory.
const DEFAULT_RESOURCE_DIR: &str = "resources";
/// File name of the settings file that `default_config_location` looks for
/// inside the resource directory.
const DEFAULT_SETTING_FILE: &str = "sudachi.json";
/// Contents of the default settings file, embedded at compile time; used by
/// `Config::new_embedded`.
const DEFAULT_SETTING_BYTES: &[u8] = include_bytes!("../../resources/sudachi.json");
/// Character definition file name used when the configuration does not name one.
const DEFAULT_CHAR_DEF_FILE: &str = "char.def";
34
35#[derive(Error, Debug)]
37pub enum ConfigError {
38 #[error("IO Error: {0}")]
39 Io(#[from] std::io::Error),
40
41 #[error("Serde error: {0}")]
42 SerdeError(#[from] serde_json::Error),
43
44 #[error("Config file not found")]
45 FileNotFound(String),
46
47 #[error("Invalid format: {0}")]
48 InvalidFormat(String),
49
50 #[error("Argument {0} is missing")]
51 MissingArgument(String),
52
53 #[error("Failed to resolve relative path {0}, tried: {1:?}")]
54 PathResolution(String, Vec<String>),
55}
56
57#[derive(Default, Debug, Clone)]
58struct PathResolver {
59 roots: Vec<PathBuf>,
60}
61
62impl PathResolver {
63 fn with_capacity(capacity: usize) -> PathResolver {
64 PathResolver {
65 roots: Vec::with_capacity(capacity),
66 }
67 }
68
69 fn add<P: Into<PathBuf>>(&mut self, path: P) {
70 self.roots.push(path.into())
71 }
72
73 fn contains<P: AsRef<Path>>(&self, path: P) -> bool {
74 let query = path.as_ref();
75 return self.roots.iter().any(|p| p.as_path() == query);
76 }
77
78 pub fn first_existing<P: AsRef<Path> + Clone>(&self, path: P) -> Option<PathBuf> {
79 self.all_candidates(path).find(|p| p.exists())
80 }
81
82 pub fn resolution_failure<P: AsRef<Path> + Clone>(&self, path: P) -> ConfigError {
83 let candidates = self
84 .all_candidates(path.clone())
85 .map(|p| p.to_string_lossy().into_owned())
86 .collect();
87
88 ConfigError::PathResolution(path.as_ref().to_string_lossy().into_owned(), candidates)
89 }
90
91 pub fn all_candidates<'a, P: AsRef<Path> + Clone + 'a>(
92 &'a self,
93 path: P,
94 ) -> impl Iterator<Item = PathBuf> + 'a {
95 self.roots.iter().map(move |root| root.join(path.clone()))
96 }
97
98 pub fn roots(&self) -> &[PathBuf] {
99 &self.roots
100 }
101}
102
/// Surface projection mode.
///
/// Deserialized from the snake_case name of the variant (for example
/// `normalized_and_surface`); the default is [`SurfaceProjection::Surface`].
///
/// NOTE(review): the output each mode actually produces is implemented
/// outside this file; the per-variant notes below only state which
/// `InfoSubset` value `required_subset` returns for it.
#[derive(Deserialize, Clone, Copy, Debug, Eq, PartialEq, Default)]
#[serde(rename_all = "snake_case")]
pub enum SurfaceProjection {
    /// Default mode; `required_subset` returns the empty subset.
    #[default]
    Surface,
    /// `required_subset` returns `NORMALIZED_FORM`.
    Normalized,
    /// `required_subset` returns `READING_FORM`.
    Reading,
    /// `required_subset` returns `DIC_FORM_WORD_ID`.
    Dictionary,
    /// `required_subset` returns `DIC_FORM_WORD_ID`.
    DictionaryAndSurface,
    /// `required_subset` returns `NORMALIZED_FORM`.
    NormalizedAndSurface,
    /// `required_subset` returns `NORMALIZED_FORM`.
    NormalizedNouns,
}
115
116impl SurfaceProjection {
117 pub fn required_subset(&self) -> InfoSubset {
119 match *self {
120 SurfaceProjection::Surface => InfoSubset::empty(),
121 SurfaceProjection::Normalized => InfoSubset::NORMALIZED_FORM,
122 SurfaceProjection::Reading => InfoSubset::READING_FORM,
123 SurfaceProjection::Dictionary => InfoSubset::DIC_FORM_WORD_ID,
124 SurfaceProjection::DictionaryAndSurface => InfoSubset::DIC_FORM_WORD_ID,
125 SurfaceProjection::NormalizedAndSurface => InfoSubset::NORMALIZED_FORM,
126 SurfaceProjection::NormalizedNouns => InfoSubset::NORMALIZED_FORM,
127 }
128 }
129}
130
131impl TryFrom<&str> for SurfaceProjection {
132 type Error = SudachiError;
133
134 fn try_from(value: &str) -> Result<Self, Self::Error> {
135 match value {
136 "surface" => Ok(SurfaceProjection::Surface),
137 "normalized" => Ok(SurfaceProjection::Normalized),
138 "reading" => Ok(SurfaceProjection::Reading),
139 "dictionary" => Ok(SurfaceProjection::Dictionary),
140 "dictionary_and_surface" => Ok(SurfaceProjection::DictionaryAndSurface),
141 "normalized_and_surface" => Ok(SurfaceProjection::NormalizedAndSurface),
142 "normalized_nouns" => Ok(SurfaceProjection::NormalizedNouns),
143 _ => Err(ConfigError::InvalidFormat(format!("unknown projection: {value}")).into()),
144 }
145 }
146}
147
/// Fully built configuration, produced by `ConfigBuilder::build` or by one of
/// the `Config` constructors.
#[derive(Debug, Default, Clone)]
pub struct Config {
    /// Root directories used by `complete_path` and by the `$cfg/` expansion
    /// in `resolve_paths`.
    resolver: PathResolver,
    /// Path of the system dictionary, if one was configured; may be relative
    /// (see `resolved_system_dict`).
    pub system_dict: Option<PathBuf>,
    /// Paths of the user dictionaries; may be relative (see `resolved_user_dicts`).
    pub user_dicts: Vec<PathBuf>,
    /// Path of the character definition file; may be relative.
    pub character_definition_file: PathBuf,

    /// Raw JSON settings taken from the `connectionCostPlugin` key.
    pub connection_cost_plugins: Vec<Value>,
    /// Raw JSON settings taken from the `inputTextPlugin` key.
    pub input_text_plugins: Vec<Value>,
    /// Raw JSON settings taken from the `oovProviderPlugin` key.
    pub oov_provider_plugins: Vec<Value>,
    /// Raw JSON settings taken from the `pathRewritePlugin` key.
    pub path_rewrite_plugins: Vec<Value>,
    /// Selected surface projection mode.
    pub projection: SurfaceProjection,
}
164
// Field names are camelCase on purpose: the derived `Deserialize` uses them
// verbatim as the JSON keys of the settings file.
#[allow(non_snake_case)]
/// Partially specified configuration, as read from a settings JSON.
///
/// Every field is optional; `build` fills in defaults and `fallback` merges
/// two builders.
#[derive(Deserialize, Debug, Clone)]
pub struct ConfigBuilder {
    /// Additional root for resolving relative paths; `build` registers it first.
    path: Option<PathBuf>,
    /// Resource directory. Never read from JSON (`serde(skip)`); set through
    /// `resource_path`.
    #[serde(skip)]
    resourcePath: Option<PathBuf>,
    /// Root directory. Never read from JSON (`serde(skip)`); set through
    /// `root_directory`, which `from_file` calls with the settings file's parent.
    #[serde(skip)]
    rootDirectory: Option<PathBuf>,
    /// System dictionary path; the JSON key `system` is accepted as an alias.
    #[serde(alias = "system")]
    systemDict: Option<PathBuf>,
    /// User dictionary paths; the JSON key `user` is accepted as an alias.
    #[serde(alias = "user")]
    userDict: Option<Vec<PathBuf>>,
    /// Character definition file; `build` falls back to `char.def` when absent.
    characterDefinitionFile: Option<PathBuf>,
    /// Raw settings of connection cost plugins.
    connectionCostPlugin: Option<Vec<Value>>,
    /// Raw settings of input text plugins.
    inputTextPlugin: Option<Vec<Value>>,
    /// Raw settings of OOV provider plugins.
    oovProviderPlugin: Option<Vec<Value>>,
    /// Raw settings of path rewrite plugins.
    pathRewritePlugin: Option<Vec<Value>>,
    /// Surface projection mode; `build` falls back to `Surface` when absent.
    projection: Option<SurfaceProjection>,
}
191
192pub fn default_resource_dir() -> PathBuf {
193 let mut src_root_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
194 if !src_root_path.pop() {
195 src_root_path.push("..");
196 }
197 src_root_path.push(DEFAULT_RESOURCE_DIR);
198 src_root_path
199}
200
201pub fn default_config_location() -> PathBuf {
202 let mut resdir = default_resource_dir();
203 resdir.push(DEFAULT_SETTING_FILE);
204 resdir
205}
206
// Fills the optional field `$name` of `$base` from `$o` only when it is unset:
// a `Some` already stored in `$base` is kept, otherwise the value of
// `$o.$name` is cloned in (which may itself be `None`).
macro_rules! merge_cfg_value {
    ($base: ident, $o: ident, $name: tt) => {
        $base.$name = $base.$name.or_else(|| $o.$name.clone())
    };
}
212
213impl ConfigBuilder {
214 pub fn from_opt_file(config_file: Option<&Path>) -> Result<Self, ConfigError> {
215 match config_file {
216 None => {
217 let default_config = default_config_location();
218 Self::from_file(&default_config)
219 }
220 Some(cfg) => Self::from_file(cfg),
221 }
222 }
223
224 pub fn from_file(config_file: &Path) -> Result<Self, ConfigError> {
225 let file = File::open(config_file)?;
226 let reader = BufReader::new(file);
227 serde_json::from_reader(reader)
228 .map_err(|e| e.into())
229 .map(|cfg: ConfigBuilder| match config_file.parent() {
230 Some(p) => cfg.root_directory(p),
231 None => cfg,
232 })
233 }
234
235 pub fn from_bytes(data: &[u8]) -> Result<Self, ConfigError> {
236 serde_json::from_slice(data).map_err(|e| e.into())
237 }
238
239 pub fn empty() -> Self {
240 serde_json::from_slice(b"{}").unwrap()
241 }
242
243 pub fn system_dict(mut self, dict: impl Into<PathBuf>) -> Self {
244 self.systemDict = Some(dict.into());
245 self
246 }
247
248 pub fn user_dict(mut self, dict: impl Into<PathBuf>) -> Self {
249 let dicts = match self.userDict.as_mut() {
250 None => {
251 self.userDict = Some(Default::default());
252 self.userDict.as_mut().unwrap()
253 }
254 Some(dicts) => dicts,
255 };
256 dicts.push(dict.into());
257 self
258 }
259
260 pub fn resource_path(mut self, path: impl Into<PathBuf>) -> Self {
261 self.resourcePath = Some(path.into());
262 self
263 }
264
265 pub fn root_directory(mut self, path: impl Into<PathBuf>) -> Self {
266 self.rootDirectory = Some(path.into());
267 self
268 }
269
270 pub fn build(self) -> Config {
271 let default_resource_dir = default_resource_dir();
272 let resource_dir = self.resourcePath.unwrap_or(default_resource_dir);
273
274 let mut resolver = PathResolver::with_capacity(3);
275 let mut add_path = |buf: PathBuf| {
276 if !resolver.contains(&buf) {
277 resolver.add(buf);
278 }
279 };
280 self.path.map(&mut add_path);
281 add_path(resource_dir);
282 self.rootDirectory.map(&mut add_path);
283
284 let character_definition_file = self
285 .characterDefinitionFile
286 .unwrap_or(PathBuf::from(DEFAULT_CHAR_DEF_FILE));
287
288 Config {
289 resolver,
290 system_dict: self.systemDict,
291 user_dicts: self.userDict.unwrap_or_default(),
292 character_definition_file,
293
294 connection_cost_plugins: self.connectionCostPlugin.unwrap_or_default(),
295 input_text_plugins: self.inputTextPlugin.unwrap_or_default(),
296 oov_provider_plugins: self.oovProviderPlugin.unwrap_or_default(),
297 path_rewrite_plugins: self.pathRewritePlugin.unwrap_or_default(),
298 projection: self.projection.unwrap_or(SurfaceProjection::Surface),
299 }
300 }
301
302 pub fn fallback(mut self, other: &ConfigBuilder) -> ConfigBuilder {
303 merge_cfg_value!(self, other, path);
304 merge_cfg_value!(self, other, resourcePath);
305 merge_cfg_value!(self, other, rootDirectory);
306 merge_cfg_value!(self, other, systemDict);
307 merge_cfg_value!(self, other, userDict);
308 merge_cfg_value!(self, other, characterDefinitionFile);
309 merge_cfg_value!(self, other, connectionCostPlugin);
310 merge_cfg_value!(self, other, inputTextPlugin);
311 merge_cfg_value!(self, other, oovProviderPlugin);
312 merge_cfg_value!(self, other, pathRewritePlugin);
313 merge_cfg_value!(self, other, projection);
314 self
315 }
316}
317
318impl Config {
319 pub fn new(
320 config_file: Option<PathBuf>,
321 resource_dir: Option<PathBuf>,
322 dictionary_path: Option<PathBuf>,
323 ) -> Result<Self, ConfigError> {
324 let raw_config = ConfigBuilder::from_opt_file(config_file.as_deref())?;
326
327 let raw_config = match resource_dir {
329 None => raw_config,
330 Some(p) => raw_config.resource_path(p),
331 };
332
333 let raw_config = match dictionary_path {
335 None => raw_config,
336 Some(p) => raw_config.system_dict(p),
337 };
338
339 Ok(raw_config.build())
340 }
341
342 pub fn new_embedded() -> Result<Self, ConfigError> {
343 let raw_config = ConfigBuilder::from_bytes(DEFAULT_SETTING_BYTES)?;
344
345 Ok(raw_config.build())
346 }
347
348 pub fn minimal_at(resource_dir: impl Into<PathBuf>) -> Config {
350 let mut cfg = Config::default();
351 let resource = resource_dir.into();
352 cfg.character_definition_file = resource.join(DEFAULT_CHAR_DEF_FILE);
353 let mut resolver = PathResolver::with_capacity(1);
354 resolver.add(resource);
355 cfg.resolver = resolver;
356 cfg.oov_provider_plugins = vec![serde_json::json!(
357 { "class" : "com.worksap.nlp.sudachi.SimpleOovPlugin",
358 "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
359 "leftId" : 0,
360 "rightId" : 0,
361 "cost" : 30000 }
362 )];
363 cfg
364 }
365
366 pub fn with_system_dic(mut self, system: impl Into<PathBuf>) -> Config {
368 self.system_dict = Some(system.into());
369 self
370 }
371
372 pub fn resolve_paths(&self, mut path: String) -> Vec<String> {
373 if path.starts_with("$exe") {
374 path.replace_range(0..4, &CURRENT_EXE_DIR);
375
376 let mut path2 = path.clone();
377 path2.insert_str(CURRENT_EXE_DIR.len(), "/deps");
378 return vec![path2, path];
379 }
380
381 if path.starts_with("$cfg/") || path.starts_with("$cfg\\") {
382 let roots = self.resolver.roots();
383 let mut result = Vec::with_capacity(roots.len());
384 path.replace_range(0..5, "");
385 for root in roots {
386 let subpath = root.join(&path);
387 result.push(subpath.to_string_lossy().into_owned());
388 }
389 return result;
390 }
391
392 vec![path]
393 }
394
395 pub fn complete_path<P: AsRef<Path> + Into<PathBuf>>(
401 &self,
402 file_path: P,
403 ) -> Result<PathBuf, ConfigError> {
404 let pref = file_path.as_ref();
405 if pref.is_absolute() {
407 return Ok(file_path.into());
408 }
409
410 if let Some(p) = self.resolver.first_existing(pref) {
412 return Ok(p);
413 }
414
415 if pref.exists() {
417 return Ok(file_path.into());
418 }
419
420 Err(self.resolver.resolution_failure(&file_path))
422 }
423
424 pub fn resolved_system_dict(&self) -> Result<PathBuf, ConfigError> {
425 match self.system_dict.as_ref() {
426 Some(p) => self.complete_path(p),
427 None => Err(ConfigError::MissingArgument("systemDict".to_owned())),
428 }
429 }
430
431 pub fn resolved_user_dicts(&self) -> Result<Vec<PathBuf>, ConfigError> {
432 self.user_dicts
433 .iter()
434 .map(|p| self.complete_path(p))
435 .collect()
436 }
437}
438
/// Returns the directory containing the running executable as a UTF-8 string.
///
/// # Panics
/// Panics if the executable path is unavailable, has no parent directory, or
/// the directory is not valid UTF-8.
fn current_exe_dir() -> String {
    let exe = match current_exe() {
        Ok(path) => path,
        Err(e) => panic!("Current exe is not available {:?}", e),
    };

    let parent = match exe.parent() {
        Some(dir) => dir,
        None => panic!("Path to executable must have a parent"),
    };

    match parent.to_str() {
        Some(utf8) => utf8.to_owned(),
        None => panic!("placing Sudachi in directories with non-utf paths is not supported"),
    }
}
450
// Directory of the running executable, initialized through `lazy_static` by
// calling `current_exe_dir` (which panics when the directory cannot be
// determined or is not valid UTF-8). Used to expand the `$exe` prefix.
lazy_static! {
    static ref CURRENT_EXE_DIR: String = current_exe_dir();
}
454
#[cfg(test)]
mod tests {
    use super::*;
    use crate::prelude::SudachiResult;

    use super::CURRENT_EXE_DIR;

    /// `$exe` expands to two candidates; the first starts with the executable directory.
    #[test]
    fn resolve_exe() -> SudachiResult<()> {
        let config = Config::new(None, None, None)?;
        let candidates = config.resolve_paths(String::from("$exe/data"));
        assert_eq!(candidates.len(), 2);
        assert!(candidates[0].starts_with(CURRENT_EXE_DIR.as_str()));
        Ok(())
    }

    /// With the default settings, `$cfg/` expands to exactly one candidate
    /// located under the default resource directory.
    #[test]
    fn resolve_cfg() -> SudachiResult<()> {
        let config = Config::new(None, None, None)?;
        let candidates = config.resolve_paths(String::from("$cfg/data"));
        let resource_dir = default_resource_dir();
        let expected_prefix = resource_dir.to_str().unwrap();
        assert_eq!(1, candidates.len());
        assert!(candidates[0].starts_with(expected_prefix));
        Ok(())
    }

    /// `fallback` copies a value from the other builder when it is unset locally.
    #[test]
    fn config_builder_fallback() {
        let mut donor = ConfigBuilder::empty();
        donor.path = Some(PathBuf::from("test"));
        let merged = ConfigBuilder::empty().fallback(&donor);
        assert_eq!(merged.path, Some(PathBuf::from("test")));
    }

    /// The string `surface` parses to `SurfaceProjection::Surface`.
    #[test]
    fn surface_projection_tryfrom() {
        let parsed = SurfaceProjection::try_from("surface").unwrap();
        assert_eq!(SurfaceProjection::Surface, parsed);
    }
}