// sudachi/analysis/mod.rs

/*
 *  Copyright (c) 2021 Works Applications Co., Ltd.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

17use std::fmt::{Display, Formatter};
18use std::str::FromStr;
19
20use mlist::MorphemeList;
21
22use crate::error::SudachiResult;
23
24pub mod created;
25mod inner;
26pub mod lattice;
27pub mod mlist;
28pub mod morpheme;
29pub mod node;
30pub mod stateful_tokenizer;
31pub mod stateless_tokenizer;
32
33pub use inner::Node;
34
/// Unit to split text
///
/// The three modes differ only in how finely a compound is cut.
/// Some examples:
/// ```text
/// A:選挙/管理/委員/会
/// B:選挙/管理/委員会
/// C:選挙管理委員会
///
/// A:客室/乗務/員
/// B:客室/乗務員
/// C:客室乗務員
///
/// A:労働/者/協同/組合
/// B:労働者/協同/組合
/// C:労働者協同組合
///
/// A:機能/性/食品
/// B:機能性/食品
/// C:機能性食品
/// ```
///
/// See [Sudachi documentation](https://github.com/WorksApplications/Sudachi#the-modes-of-splitting)
/// for more details
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Mode {
    /// Short: the finest split of the three (e.g. `選挙/管理/委員/会`)
    A,

    /// Middle (similar to "word"), e.g. `選挙/管理/委員会`
    B,

    /// Named Entity: the compound is kept whole (e.g. `選挙管理委員会`)
    C,
}

70impl FromStr for Mode {
71    type Err = &'static str;
72
73    fn from_str(s: &str) -> Result<Self, Self::Err> {
74        match s {
75            "A" | "a" => Ok(Mode::A),
76            "B" | "b" => Ok(Mode::B),
77            "C" | "c" => Ok(Mode::C),
78            _ => Err("Mode must be one of \"A\", \"B\", or \"C\" (in lower or upper case)."),
79        }
80    }
81}
82
83impl Display for Mode {
84    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
85        let repr = match self {
86            Mode::A => "A",
87            Mode::B => "B",
88            Mode::C => "C",
89        };
90        f.write_str(repr)
91    }
92}
93
94/// Able to tokenize Japanese text
95pub trait Tokenize {
96    type Dictionary;
97
98    /// Break text into `Morpheme`s
99    fn tokenize(
100        &self,
101        input: &str,
102        mode: Mode,
103        enable_debug: bool,
104    ) -> SudachiResult<MorphemeList<Self::Dictionary>>;
105}