sudachi/
config.rs

1/*
2 * Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use std::convert::TryFrom;
18use std::env::current_exe;
19use std::fs::File;
20use std::io::BufReader;
21use std::path::{Path, PathBuf};
22
23use crate::dic::subset::InfoSubset;
24use crate::error::SudachiError;
25use lazy_static::lazy_static;
26use serde::Deserialize;
27use serde_json::Value;
28use thiserror::Error;
29
/// Name of the resource directory appended by `default_resource_dir`.
const DEFAULT_RESOURCE_DIR: &str = "resources";
/// File name of the settings file appended by `default_config_location`.
const DEFAULT_SETTING_FILE: &str = "sudachi.json";
/// Contents of the bundled settings file, embedded at compile time and
/// consumed by `Config::new_embedded`.
const DEFAULT_SETTING_BYTES: &[u8] = include_bytes!("../../resources/sudachi.json");
/// Character definition file name used when the configuration does not name one.
const DEFAULT_CHAR_DEF_FILE: &str = "char.def";
34
35/// Sudachi Error
36#[derive(Error, Debug)]
37pub enum ConfigError {
38    #[error("IO Error: {0}")]
39    Io(#[from] std::io::Error),
40
41    #[error("Serde error: {0}")]
42    SerdeError(#[from] serde_json::Error),
43
44    #[error("Config file not found")]
45    FileNotFound(String),
46
47    #[error("Invalid format: {0}")]
48    InvalidFormat(String),
49
50    #[error("Argument {0} is missing")]
51    MissingArgument(String),
52
53    #[error("Failed to resolve relative path {0}, tried: {1:?}")]
54    PathResolution(String, Vec<String>),
55}
56
57#[derive(Default, Debug, Clone)]
58struct PathResolver {
59    roots: Vec<PathBuf>,
60}
61
62impl PathResolver {
63    fn with_capacity(capacity: usize) -> PathResolver {
64        PathResolver {
65            roots: Vec::with_capacity(capacity),
66        }
67    }
68
69    fn add<P: Into<PathBuf>>(&mut self, path: P) {
70        self.roots.push(path.into())
71    }
72
73    fn contains<P: AsRef<Path>>(&self, path: P) -> bool {
74        let query = path.as_ref();
75        return self.roots.iter().any(|p| p.as_path() == query);
76    }
77
78    pub fn first_existing<P: AsRef<Path> + Clone>(&self, path: P) -> Option<PathBuf> {
79        self.all_candidates(path).find(|p| p.exists())
80    }
81
82    pub fn resolution_failure<P: AsRef<Path> + Clone>(&self, path: P) -> ConfigError {
83        let candidates = self
84            .all_candidates(path.clone())
85            .map(|p| p.to_string_lossy().into_owned())
86            .collect();
87
88        ConfigError::PathResolution(path.as_ref().to_string_lossy().into_owned(), candidates)
89    }
90
91    pub fn all_candidates<'a, P: AsRef<Path> + Clone + 'a>(
92        &'a self,
93        path: P,
94    ) -> impl Iterator<Item = PathBuf> + 'a {
95        self.roots.iter().map(move |root| root.join(path.clone()))
96    }
97
98    pub fn roots(&self) -> &[PathBuf] {
99        &self.roots
100    }
101}
102
103#[derive(Deserialize, Clone, Copy, Debug, Eq, PartialEq, Default)]
104#[serde(rename_all = "snake_case")]
105pub enum SurfaceProjection {
106    #[default]
107    Surface,
108    Normalized,
109    Reading,
110    Dictionary,
111    DictionaryAndSurface,
112    NormalizedAndSurface,
113    NormalizedNouns,
114}
115
116impl SurfaceProjection {
117    /// Return required InfoSubset for the current projection type
118    pub fn required_subset(&self) -> InfoSubset {
119        match *self {
120            SurfaceProjection::Surface => InfoSubset::empty(),
121            SurfaceProjection::Normalized => InfoSubset::NORMALIZED_FORM,
122            SurfaceProjection::Reading => InfoSubset::READING_FORM,
123            SurfaceProjection::Dictionary => InfoSubset::DIC_FORM_WORD_ID,
124            SurfaceProjection::DictionaryAndSurface => InfoSubset::DIC_FORM_WORD_ID,
125            SurfaceProjection::NormalizedAndSurface => InfoSubset::NORMALIZED_FORM,
126            SurfaceProjection::NormalizedNouns => InfoSubset::NORMALIZED_FORM,
127        }
128    }
129}
130
131impl TryFrom<&str> for SurfaceProjection {
132    type Error = SudachiError;
133
134    fn try_from(value: &str) -> Result<Self, Self::Error> {
135        match value {
136            "surface" => Ok(SurfaceProjection::Surface),
137            "normalized" => Ok(SurfaceProjection::Normalized),
138            "reading" => Ok(SurfaceProjection::Reading),
139            "dictionary" => Ok(SurfaceProjection::Dictionary),
140            "dictionary_and_surface" => Ok(SurfaceProjection::DictionaryAndSurface),
141            "normalized_and_surface" => Ok(SurfaceProjection::NormalizedAndSurface),
142            "normalized_nouns" => Ok(SurfaceProjection::NormalizedNouns),
143            _ => Err(ConfigError::InvalidFormat(format!("unknown projection: {value}")).into()),
144        }
145    }
146}
147
/// Setting data loaded from a config file.
///
/// Produced by `ConfigBuilder::build`; dictionary paths are stored as given
/// and only resolved on demand (see `complete_path`).
#[derive(Debug, Default, Clone)]
pub struct Config {
    /// Paths are resolved against these roots, in order, until an existing file is found
    resolver: PathResolver,
    /// System dictionary path, if one was configured
    pub system_dict: Option<PathBuf>,
    /// User dictionary paths; empty when none were configured
    pub user_dicts: Vec<PathBuf>,
    /// Character definition file; `ConfigBuilder::build` falls back to `char.def`
    pub character_definition_file: PathBuf,

    /// Raw JSON values taken from the `connectionCostPlugin` setting
    pub connection_cost_plugins: Vec<Value>,
    /// Raw JSON values taken from the `inputTextPlugin` setting
    pub input_text_plugins: Vec<Value>,
    /// Raw JSON values taken from the `oovProviderPlugin` setting
    pub oov_provider_plugins: Vec<Value>,
    /// Raw JSON values taken from the `pathRewritePlugin` setting
    pub path_rewrite_plugins: Vec<Value>,
    /// Selected surface projection.
    /// This option is Python-only and is ignored in Rust APIs.
    pub projection: SurfaceProjection,
}
164
/// Struct that corresponds to the raw config JSON file.
/// You must use the field names defined here as JSON object keys.
/// For plugins, refer to each plugin.
// Field names double as JSON keys, hence the camelCase identifiers.
#[allow(non_snake_case)]
#[derive(Deserialize, Debug, Clone)]
pub struct ConfigBuilder {
    /// Analogue to the Java implementation's path override.
    /// When set, `build` registers it as the first resolution root.
    path: Option<PathBuf>,
    /// User-passed resourcePath (never read from JSON)
    #[serde(skip)]
    resourcePath: Option<PathBuf>,
    /// User-passed root directory (never read from JSON).
    /// Is also automatically set on from_file
    #[serde(skip)]
    rootDirectory: Option<PathBuf>,
    /// System dictionary path; the JSON key `system` is accepted as an alias
    #[serde(alias = "system")]
    systemDict: Option<PathBuf>,
    /// User dictionary paths; the JSON key `user` is accepted as an alias
    #[serde(alias = "user")]
    userDict: Option<Vec<PathBuf>>,
    /// Character definition file; `build` falls back to `char.def` when unset
    characterDefinitionFile: Option<PathBuf>,
    /// Raw settings of connection cost plugins
    connectionCostPlugin: Option<Vec<Value>>,
    /// Raw settings of input text plugins
    inputTextPlugin: Option<Vec<Value>>,
    /// Raw settings of OOV provider plugins
    oovProviderPlugin: Option<Vec<Value>>,
    /// Raw settings of path rewrite plugins
    pathRewritePlugin: Option<Vec<Value>>,
    /// Surface projection; `build` falls back to `SurfaceProjection::Surface`
    projection: Option<SurfaceProjection>,
}
191
192pub fn default_resource_dir() -> PathBuf {
193    let mut src_root_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
194    if !src_root_path.pop() {
195        src_root_path.push("..");
196    }
197    src_root_path.push(DEFAULT_RESOURCE_DIR);
198    src_root_path
199}
200
201pub fn default_config_location() -> PathBuf {
202    let mut resdir = default_resource_dir();
203    resdir.push(DEFAULT_SETTING_FILE);
204    resdir
205}
206
/// Fills the `Option` field `$name` of `$base` with a clone of the same field
/// of `$o`, but only when `$base` does not already hold a value.
macro_rules! merge_cfg_value {
    ($base: ident, $o: ident, $name: tt) => {
        if $base.$name.is_none() {
            $base.$name = $o.$name.clone();
        }
    };
}
212
213impl ConfigBuilder {
214    pub fn from_opt_file(config_file: Option<&Path>) -> Result<Self, ConfigError> {
215        match config_file {
216            None => {
217                let default_config = default_config_location();
218                Self::from_file(&default_config)
219            }
220            Some(cfg) => Self::from_file(cfg),
221        }
222    }
223
224    pub fn from_file(config_file: &Path) -> Result<Self, ConfigError> {
225        let file = File::open(config_file)?;
226        let reader = BufReader::new(file);
227        serde_json::from_reader(reader)
228            .map_err(|e| e.into())
229            .map(|cfg: ConfigBuilder| match config_file.parent() {
230                Some(p) => cfg.root_directory(p),
231                None => cfg,
232            })
233    }
234
235    pub fn from_bytes(data: &[u8]) -> Result<Self, ConfigError> {
236        serde_json::from_slice(data).map_err(|e| e.into())
237    }
238
239    pub fn empty() -> Self {
240        serde_json::from_slice(b"{}").unwrap()
241    }
242
243    pub fn system_dict(mut self, dict: impl Into<PathBuf>) -> Self {
244        self.systemDict = Some(dict.into());
245        self
246    }
247
248    pub fn user_dict(mut self, dict: impl Into<PathBuf>) -> Self {
249        let dicts = match self.userDict.as_mut() {
250            None => {
251                self.userDict = Some(Default::default());
252                self.userDict.as_mut().unwrap()
253            }
254            Some(dicts) => dicts,
255        };
256        dicts.push(dict.into());
257        self
258    }
259
260    pub fn resource_path(mut self, path: impl Into<PathBuf>) -> Self {
261        self.resourcePath = Some(path.into());
262        self
263    }
264
265    pub fn root_directory(mut self, path: impl Into<PathBuf>) -> Self {
266        self.rootDirectory = Some(path.into());
267        self
268    }
269
270    pub fn build(self) -> Config {
271        let default_resource_dir = default_resource_dir();
272        let resource_dir = self.resourcePath.unwrap_or(default_resource_dir);
273
274        let mut resolver = PathResolver::with_capacity(3);
275        let mut add_path = |buf: PathBuf| {
276            if !resolver.contains(&buf) {
277                resolver.add(buf);
278            }
279        };
280        self.path.map(&mut add_path);
281        add_path(resource_dir);
282        self.rootDirectory.map(&mut add_path);
283
284        let character_definition_file = self
285            .characterDefinitionFile
286            .unwrap_or(PathBuf::from(DEFAULT_CHAR_DEF_FILE));
287
288        Config {
289            resolver,
290            system_dict: self.systemDict,
291            user_dicts: self.userDict.unwrap_or_default(),
292            character_definition_file,
293
294            connection_cost_plugins: self.connectionCostPlugin.unwrap_or_default(),
295            input_text_plugins: self.inputTextPlugin.unwrap_or_default(),
296            oov_provider_plugins: self.oovProviderPlugin.unwrap_or_default(),
297            path_rewrite_plugins: self.pathRewritePlugin.unwrap_or_default(),
298            projection: self.projection.unwrap_or(SurfaceProjection::Surface),
299        }
300    }
301
302    pub fn fallback(mut self, other: &ConfigBuilder) -> ConfigBuilder {
303        merge_cfg_value!(self, other, path);
304        merge_cfg_value!(self, other, resourcePath);
305        merge_cfg_value!(self, other, rootDirectory);
306        merge_cfg_value!(self, other, systemDict);
307        merge_cfg_value!(self, other, userDict);
308        merge_cfg_value!(self, other, characterDefinitionFile);
309        merge_cfg_value!(self, other, connectionCostPlugin);
310        merge_cfg_value!(self, other, inputTextPlugin);
311        merge_cfg_value!(self, other, oovProviderPlugin);
312        merge_cfg_value!(self, other, pathRewritePlugin);
313        merge_cfg_value!(self, other, projection);
314        self
315    }
316}
317
318impl Config {
319    pub fn new(
320        config_file: Option<PathBuf>,
321        resource_dir: Option<PathBuf>,
322        dictionary_path: Option<PathBuf>,
323    ) -> Result<Self, ConfigError> {
324        // prioritize arg (cli option) > default
325        let raw_config = ConfigBuilder::from_opt_file(config_file.as_deref())?;
326
327        // prioritize arg (cli option) > config file
328        let raw_config = match resource_dir {
329            None => raw_config,
330            Some(p) => raw_config.resource_path(p),
331        };
332
333        // prioritize arg (cli option) > config file
334        let raw_config = match dictionary_path {
335            None => raw_config,
336            Some(p) => raw_config.system_dict(p),
337        };
338
339        Ok(raw_config.build())
340    }
341
342    pub fn new_embedded() -> Result<Self, ConfigError> {
343        let raw_config = ConfigBuilder::from_bytes(DEFAULT_SETTING_BYTES)?;
344
345        Ok(raw_config.build())
346    }
347
348    /// Creates a minimal config with the provided resource directory
349    pub fn minimal_at(resource_dir: impl Into<PathBuf>) -> Config {
350        let mut cfg = Config::default();
351        let resource = resource_dir.into();
352        cfg.character_definition_file = resource.join(DEFAULT_CHAR_DEF_FILE);
353        let mut resolver = PathResolver::with_capacity(1);
354        resolver.add(resource);
355        cfg.resolver = resolver;
356        cfg.oov_provider_plugins = vec![serde_json::json!(
357            { "class" : "com.worksap.nlp.sudachi.SimpleOovPlugin",
358              "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
359              "leftId" : 0,
360              "rightId" : 0,
361              "cost" : 30000 }
362        )];
363        cfg
364    }
365
366    /// Sets the system dictionary to the provided path
367    pub fn with_system_dic(mut self, system: impl Into<PathBuf>) -> Config {
368        self.system_dict = Some(system.into());
369        self
370    }
371
372    pub fn resolve_paths(&self, mut path: String) -> Vec<String> {
373        if path.starts_with("$exe") {
374            path.replace_range(0..4, &CURRENT_EXE_DIR);
375
376            let mut path2 = path.clone();
377            path2.insert_str(CURRENT_EXE_DIR.len(), "/deps");
378            return vec![path2, path];
379        }
380
381        if path.starts_with("$cfg/") || path.starts_with("$cfg\\") {
382            let roots = self.resolver.roots();
383            let mut result = Vec::with_capacity(roots.len());
384            path.replace_range(0..5, "");
385            for root in roots {
386                let subpath = root.join(&path);
387                result.push(subpath.to_string_lossy().into_owned());
388            }
389            return result;
390        }
391
392        vec![path]
393    }
394
395    /// Resolves a possibly relative path with regards to all possible anchors:
396    /// 1. Absolute paths stay as they are
397    /// 2. Paths are resolved wrt to anchors, returning the first existing one
398    /// 3. Path are checked wrt to CWD
399    /// 4. If all fail, return an error with all candidate paths listed
400    pub fn complete_path<P: AsRef<Path> + Into<PathBuf>>(
401        &self,
402        file_path: P,
403    ) -> Result<PathBuf, ConfigError> {
404        let pref = file_path.as_ref();
405        // 1. absolute paths are not normalized
406        if pref.is_absolute() {
407            return Ok(file_path.into());
408        }
409
410        // 2. try to resolve paths wrt anchors
411        if let Some(p) = self.resolver.first_existing(pref) {
412            return Ok(p);
413        }
414
415        // 3. try to resolve path wrt CWD
416        if pref.exists() {
417            return Ok(file_path.into());
418        }
419
420        // Report an error
421        Err(self.resolver.resolution_failure(&file_path))
422    }
423
424    pub fn resolved_system_dict(&self) -> Result<PathBuf, ConfigError> {
425        match self.system_dict.as_ref() {
426            Some(p) => self.complete_path(p),
427            None => Err(ConfigError::MissingArgument("systemDict".to_owned())),
428        }
429    }
430
431    pub fn resolved_user_dicts(&self) -> Result<Vec<PathBuf>, ConfigError> {
432        self.user_dicts
433            .iter()
434            .map(|p| self.complete_path(p))
435            .collect()
436    }
437}
438
/// Returns the directory that contains the running executable as a UTF-8 string.
///
/// # Panics
/// Panics when the executable path is unavailable, has no parent directory,
/// or when that directory is not valid UTF-8.
fn current_exe_dir() -> String {
    let exe = match current_exe() {
        Ok(path) => path,
        Err(e) => panic!("Current exe is not available {:?}", e),
    };

    let dir = match exe.parent() {
        Some(dir) => dir,
        None => panic!("Path to executable must have a parent"),
    };

    match dir.to_str() {
        Some(utf8) => utf8.to_owned(),
        None => panic!("placing Sudachi in directories with non-utf paths is not supported"),
    }
}
450
lazy_static! {
    /// Directory of the running executable, obtained from `current_exe_dir()`
    /// the first time the value is accessed. That function panics when the
    /// directory cannot be determined or is not valid UTF-8.
    static ref CURRENT_EXE_DIR: String = current_exe_dir();
}
454
#[cfg(test)]
mod tests {
    use super::*;
    use crate::prelude::SudachiResult;

    use super::CURRENT_EXE_DIR;

    /// `$exe` expands to two candidates, the first one under the executable directory.
    #[test]
    fn resolve_exe() -> SudachiResult<()> {
        let exe_dir: &str = &CURRENT_EXE_DIR;
        let resolved = Config::new(None, None, None)?.resolve_paths(String::from("$exe/data"));
        assert_eq!(resolved.len(), 2);
        assert!(resolved[0].starts_with(exe_dir));
        Ok(())
    }

    /// `$cfg` expands to a single candidate under the default resource directory.
    #[test]
    fn resolve_cfg() -> SudachiResult<()> {
        let resource_dir = default_resource_dir();
        let expected_prefix: &str = resource_dir.to_str().unwrap();
        let resolved = Config::new(None, None, None)?.resolve_paths(String::from("$cfg/data"));
        assert_eq!(1, resolved.len());
        assert!(resolved[0].starts_with(expected_prefix));
        Ok(())
    }

    /// An unset value is taken over from the fallback builder.
    #[test]
    fn config_builder_fallback() {
        let mut donor = ConfigBuilder::empty();
        donor.path = Some("test".into());
        let merged = ConfigBuilder::empty().fallback(&donor);
        assert_eq!(merged.path, Some("test".into()));
    }

    /// The string `surface` parses to `SurfaceProjection::Surface`.
    #[test]
    fn surface_projection_tryfrom() {
        let parsed = SurfaceProjection::try_from("surface").unwrap();
        assert_eq!(SurfaceProjection::Surface, parsed);
    }
}
497        );
498    }
499}