sudachi/dic/build/
resolve.rs

1/*
2 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use crate::analysis::stateless_tokenizer::DictionaryAccess;
18use crate::dic::build::lexicon::{RawLexiconEntry, SplitUnitResolver};
19use crate::dic::lexicon::word_infos::WordInfoData;
20use crate::dic::subset::InfoSubset;
21use crate::dic::word_id::WordId;
22use crate::error::SudachiResult;
23use crate::util::fxhash::FxBuildHasher;
24use std::collections::HashMap;
25
26// HashMap from surface to (pos_id, reading_form, word-id)s
27type ResolutionCandidateMap<T> = HashMap<T, Vec<(u16, Option<T>, WordId)>, FxBuildHasher>;
28
29/// We can't use trie to resolve splits because it is possible that refs are not in trie
30/// This resolver has to be owning because the dictionary content is lazily loaded and transient
31pub struct BinDictResolver {
32    index: ResolutionCandidateMap<String>,
33}
34
35impl BinDictResolver {
36    pub fn new<D: DictionaryAccess>(dict: D) -> SudachiResult<Self> {
37        let lex = dict.lexicon();
38        let size = lex.size();
39        let mut index: ResolutionCandidateMap<String> = HashMap::default();
40        for id in 0..size {
41            let wid = WordId::new(0, id);
42            let winfo: WordInfoData = lex
43                .get_word_info_subset(
44                    wid,
45                    InfoSubset::SURFACE | InfoSubset::READING_FORM | InfoSubset::POS_ID,
46                )?
47                .into();
48            let surface = winfo.surface;
49            let reading = winfo.reading_form;
50            let pos_id = winfo.pos_id;
51
52            let rdfield = if reading.is_empty() || surface == reading {
53                None
54            } else {
55                Some(reading)
56            };
57
58            index
59                .entry(surface)
60                .or_default()
61                .push((pos_id, rdfield, wid));
62        }
63
64        Ok(Self { index })
65    }
66}
67
68impl SplitUnitResolver for BinDictResolver {
69    fn resolve_inline(&self, surface: &str, pos: u16, reading: Option<&str>) -> Option<WordId> {
70        self.index.get(surface).and_then(|v| {
71            for (p, rd, wid) in v {
72                if *p == pos && reading.eq(&rd.as_deref()) {
73                    return Some(*wid);
74                }
75            }
76            None
77        })
78    }
79}
80
81pub struct RawDictResolver<'a> {
82    data: ResolutionCandidateMap<&'a str>,
83}
84
85impl<'a> RawDictResolver<'a> {
86    pub(crate) fn new(entries: &'a [RawLexiconEntry], user: bool) -> Self {
87        let mut data: ResolutionCandidateMap<&'a str> = HashMap::default();
88
89        let dic_id = if user { 1 } else { 0 };
90
91        for (i, e) in entries.iter().enumerate() {
92            let surface: &'a str = e.surface();
93            let reading: &'a str = e.reading();
94            let wid = WordId::new(dic_id, i as u32);
95
96            let read_opt = if surface == reading {
97                None
98            } else {
99                Some(reading)
100            };
101
102            data.entry(surface)
103                .or_default()
104                .push((e.pos, read_opt, wid));
105        }
106
107        Self { data }
108    }
109}
110
111impl SplitUnitResolver for RawDictResolver<'_> {
112    fn resolve_inline(&self, surface: &str, pos: u16, reading: Option<&str>) -> Option<WordId> {
113        self.data.get(surface).and_then(|data| {
114            for (p, rd, wid) in data {
115                if *p == pos && *rd == reading {
116                    return Some(*wid);
117                }
118            }
119            None
120        })
121    }
122}
123
/// Pair of resolvers combined into a single one.
///
/// When used as a `SplitUnitResolver`, `a` is consulted first and `b` only when `a`
/// finds nothing.
pub(crate) struct ChainedResolver<A, B> {
    /// Resolver consulted first.
    a: A,
    /// Resolver consulted when `a` yields no result.
    b: B,
}
128
129impl<A: SplitUnitResolver, B: SplitUnitResolver> ChainedResolver<A, B> {
130    pub(crate) fn new(a: A, b: B) -> Self {
131        Self { a, b }
132    }
133}
134
135impl<A: SplitUnitResolver, B: SplitUnitResolver> SplitUnitResolver for ChainedResolver<A, B> {
136    fn resolve_inline(&self, surface: &str, pos: u16, reading: Option<&str>) -> Option<WordId> {
137        self.a
138            .resolve_inline(surface, pos, reading)
139            .or_else(|| self.b.resolve_inline(surface, pos, reading))
140    }
141}