sudachi/analysis/mod.rs
1/*
2 * Copyright (c) 2021 Works Applications Co., Ltd.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use std::fmt::{Display, Formatter};
18use std::str::FromStr;
19
20use mlist::MorphemeList;
21
22use crate::error::SudachiResult;
23
24pub mod created;
25mod inner;
26pub mod lattice;
27pub mod mlist;
28pub mod morpheme;
29pub mod node;
30pub mod stateful_tokenizer;
31pub mod stateless_tokenizer;
32
33pub use inner::Node;
34
/// Granularity of the units that text is split into.
///
/// The same text is cut into progressively longer units when going from
/// `A` to `C`, as the following examples show:
/// ```text
/// A:選挙/管理/委員/会
/// B:選挙/管理/委員会
/// C:選挙管理委員会
///
/// A:客室/乗務/員
/// B:客室/乗務員
/// C:客室乗務員
///
/// A:労働/者/協同/組合
/// B:労働者/協同/組合
/// C:労働者協同組合
///
/// A:機能/性/食品
/// B:機能性/食品
/// C:機能性食品
/// ```
///
/// See [Sudachi documentation](https://github.com/WorksApplications/Sudachi#the-modes-of-splitting)
/// for more details.
///
/// `Hash` is derived alongside `Eq`, so a `Mode` can be used as a key in
/// hash-based collections such as `HashMap` and `HashSet`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Mode {
    /// Short
    A,

    /// Middle (similar to "word")
    B,

    /// Named Entity
    C,
}
69
70impl FromStr for Mode {
71 type Err = &'static str;
72
73 fn from_str(s: &str) -> Result<Self, Self::Err> {
74 match s {
75 "A" | "a" => Ok(Mode::A),
76 "B" | "b" => Ok(Mode::B),
77 "C" | "c" => Ok(Mode::C),
78 _ => Err("Mode must be one of \"A\", \"B\", or \"C\" (in lower or upper case)."),
79 }
80 }
81}
82
83impl Display for Mode {
84 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
85 let repr = match self {
86 Mode::A => "A",
87 Mode::B => "B",
88 Mode::C => "C",
89 };
90 f.write_str(repr)
91 }
92}
93
94/// Able to tokenize Japanese text
95pub trait Tokenize {
96 type Dictionary;
97
98 /// Break text into `Morpheme`s
99 fn tokenize(
100 &self,
101 input: &str,
102 mode: Mode,
103 enable_debug: bool,
104 ) -> SudachiResult<MorphemeList<Self::Dictionary>>;
105}