// sudachi/analysis/stateless_tokenizer.rs
use crate::analysis::node::ResultNode;
use crate::analysis::stateful_tokenizer::StatefulTokenizer;
use std::ops::Deref;

use crate::dic::grammar::Grammar;
use crate::dic::lexicon_set::LexiconSet;
use crate::dic::subset::InfoSubset;
use crate::error::SudachiResult;
use crate::input_text::InputBuffer;
use crate::plugin::input_text::InputTextPlugin;
use crate::plugin::oov::OovProviderPlugin;
use crate::plugin::path_rewrite::PathRewritePlugin;

use super::mlist::MorphemeList;
use super::{Mode, Tokenize};

33pub trait DictionaryAccess {
35 fn grammar(&self) -> &Grammar<'_>;
36 fn lexicon(&self) -> &LexiconSet<'_>;
37 fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>];
38 fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>];
39 fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>];
40}
41
42impl<T> DictionaryAccess for T
43where
44 T: Deref,
45 <T as Deref>::Target: DictionaryAccess,
46{
47 fn grammar(&self) -> &Grammar<'_> {
48 <T as Deref>::deref(self).grammar()
49 }
50
51 fn lexicon(&self) -> &LexiconSet<'_> {
52 <T as Deref>::deref(self).lexicon()
53 }
54
55 fn input_text_plugins(&self) -> &[Box<dyn InputTextPlugin + Sync + Send>] {
56 <T as Deref>::deref(self).input_text_plugins()
57 }
58
59 fn oov_provider_plugins(&self) -> &[Box<dyn OovProviderPlugin + Sync + Send>] {
60 <T as Deref>::deref(self).oov_provider_plugins()
61 }
62
63 fn path_rewrite_plugins(&self) -> &[Box<dyn PathRewritePlugin + Sync + Send>] {
64 <T as Deref>::deref(self).path_rewrite_plugins()
65 }
66}
67
/// Tokenizer that keeps no mutable state between calls: each `tokenize`
/// invocation builds a fresh `StatefulTokenizer` internally, so a single
/// instance can be shared freely (e.g. across threads).
pub struct StatelessTokenizer<T> {
    // Dictionary handle; cloned for each tokenization call.
    dict: T,
}
75
76impl<T: DictionaryAccess> StatelessTokenizer<T> {
77 pub fn new(dict: T) -> StatelessTokenizer<T> {
78 StatelessTokenizer { dict }
79 }
80}
81
82impl<T> StatelessTokenizer<T>
83where
84 T: Deref,
85 <T as Deref>::Target: DictionaryAccess,
86{
87 pub fn as_dict(&self) -> &<T as Deref>::Target {
88 return Deref::deref(&self.dict);
89 }
90}
91
92impl<T> Tokenize for StatelessTokenizer<T>
93where
94 T: DictionaryAccess + Clone,
95{
96 type Dictionary = T;
97
98 fn tokenize<'a>(
99 &'a self,
100 input: &'a str,
101 mode: Mode,
102 enable_debug: bool,
103 ) -> SudachiResult<MorphemeList<Self::Dictionary>> {
104 let mut tok = StatefulTokenizer::create(self.dict.clone(), enable_debug, mode);
105 tok.reset().push_str(input);
106 tok.do_tokenize()?;
107 tok.into_morpheme_list()
108 }
109}
110
111pub(super) fn split_path<T: DictionaryAccess + ?Sized>(
112 dict: &T,
113 path: Vec<ResultNode>,
114 mode: Mode,
115 subset: InfoSubset,
116 input: &InputBuffer,
117) -> SudachiResult<Vec<ResultNode>> {
118 if mode == Mode::C {
119 return Ok(path);
120 }
121
122 let mut new_path = Vec::with_capacity(path.len() * 3 / 2);
123 for node in path {
124 let split_len = node.num_splits(mode);
125 if split_len <= 1 {
126 new_path.push(node);
127 } else {
128 new_path.extend(node.split(mode, dict.lexicon(), subset, input));
129 }
130 }
131
132 Ok(new_path)
133}
134
135pub(super) fn dump_path(path: &Vec<ResultNode>) {
136 for (i, node) in path.iter().enumerate() {
137 println!("{}: {}", i, node);
138 }
139}