// sudachi/analysis/stateless_tokenizer.rs

/*
 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
16
17use crate::analysis::node::ResultNode;
18use crate::analysis::stateful_tokenizer::StatefulTokenizer;
19use std::ops::Deref;
20
21use crate::dic::grammar::Grammar;
22use crate::dic::lexicon_set::LexiconSet;
23use crate::dic::subset::InfoSubset;
24use crate::error::SudachiResult;
25use crate::input_text::InputBuffer;
26use crate::plugin::input_text::InputTextPlugin;
27use crate::plugin::oov::OovProviderPlugin;
28use crate::plugin::path_rewrite::PathRewritePlugin;
29
30use super::mlist::MorphemeList;
31use super::{Mode, Tokenize};
32
/// Provides access to dictionary data
///
/// Implemented directly by dictionary types, and — via the blanket impl
/// below — by anything that derefs to an implementor (`&`, `Rc`, `Arc`, …).
pub trait DictionaryAccess {
    /// Grammar data of the dictionary
    fn grammar(&self) -> &Grammar<'_>;
    /// Lexicon set backing the dictionary
    fn lexicon(&self) -> &LexiconSet<'_>;
    /// Plugins applied to the input text before analysis
    fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>];
    /// Plugins that provide out-of-vocabulary entries
    fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>];
    /// Plugins that rewrite the analysis path
    fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>];
}
41
42impl<T> DictionaryAccess for T
43where
44    T: Deref,
45    <T as Deref>::Target: DictionaryAccess,
46{
47    fn grammar(&self) -> &Grammar<'_> {
48        <T as Deref>::deref(self).grammar()
49    }
50
51    fn lexicon(&self) -> &LexiconSet<'_> {
52        <T as Deref>::deref(self).lexicon()
53    }
54
55    fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>] {
56        <T as Deref>::deref(self).input_text_plugins()
57    }
58
59    fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>] {
60        <T as Deref>::deref(self).oov_provider_plugins()
61    }
62
63    fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>] {
64        <T as Deref>::deref(self).path_rewrite_plugins()
65    }
66}
67
/// Implementation of a Tokenizer which does not have tokenization state.
///
/// This is a wrapper which is generic over dictionary pointers.
/// Usable where dictionary is a struct itself, &, &mut, Rc<.>, Arc<.>.
pub struct StatelessTokenizer<T> {
    // Dictionary value or pointer; cloned for each tokenize call
    dict: T,
}
75
76impl<T: DictionaryAccess> StatelessTokenizer<T> {
77    pub fn new(dict: T) -> StatelessTokenizer<T> {
78        StatelessTokenizer { dict }
79    }
80}
81
82impl<T> StatelessTokenizer<T>
83where
84    T: Deref,
85    <T as Deref>::Target: DictionaryAccess,
86{
87    pub fn as_dict(&self) -> &<T as Deref>::Target {
88        return Deref::deref(&self.dict);
89    }
90}
91
impl<T> Tokenize for StatelessTokenizer<T>
where
    T: DictionaryAccess + Clone,
{
    type Dictionary = T;

    /// Tokenizes `input` by creating a fresh [`StatefulTokenizer`] per call,
    /// which is what keeps this wrapper stateless.
    ///
    /// The dictionary handle is cloned for each call, so `T` is expected to be
    /// a cheap-to-clone pointer type (see the struct-level docs).
    fn tokenize<'a>(
        &'a self,
        input: &'a str,
        mode: Mode,
        enable_debug: bool,
    ) -> SudachiResult<MorphemeList<Self::Dictionary>> {
        let mut tok = StatefulTokenizer::create(self.dict.clone(), enable_debug, mode);
        // reset() before push_str: the call order mirrors the stateful API's
        // expected usage (clear state, load text, analyze, extract result)
        tok.reset().push_str(input);
        tok.do_tokenize()?;
        tok.into_morpheme_list()
    }
}
110
111pub(super) fn split_path<T: DictionaryAccess + ?Sized>(
112    dict: &T,
113    path: Vec<ResultNode>,
114    mode: Mode,
115    subset: InfoSubset,
116    input: &InputBuffer,
117) -> SudachiResult<Vec<ResultNode>> {
118    if mode == Mode::C {
119        return Ok(path);
120    }
121
122    let mut new_path = Vec::with_capacity(path.len() * 3 / 2);
123    for node in path {
124        let split_len = node.num_splits(mode);
125        if split_len <= 1 {
126            new_path.push(node);
127        } else {
128            new_path.extend(node.split(mode, dict.lexicon(), subset, input));
129        }
130    }
131
132    Ok(new_path)
133}
134
135pub(super) fn dump_path(path: &Vec<ResultNode>) {
136    for (i, node) in path.iter().enumerate() {
137        println!("{}: {}", i, node);
138    }
139}