sudachi/dic/build/
lexicon.rs

1/*
2 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use std::borrow::{Borrow, Cow};
18use std::fmt::{Debug, Formatter};
19use std::fs::File;
20use std::io::Write;
21use std::path::Path;
22
23use csv::{StringRecord, Trim};
24use indexmap::map::IndexMap;
25use indexmap::Equivalent;
26use memmap2::Mmap;
27
28use crate::analysis::Mode;
29use crate::dic::build::error::{BuildFailure, DicCompilationCtx, DicWriteResult};
30use crate::dic::build::parse::{
31    it_next, none_if_equal, parse_dic_form, parse_i16, parse_mode, parse_slash_list,
32    parse_u32_list, parse_wordid, parse_wordid_list, unescape, unescape_cow, WORD_ID_LITERAL,
33};
34use crate::dic::build::primitives::{write_u32_array, Utf16Writer};
35use crate::dic::build::report::{ReportBuilder, Reporter};
36use crate::dic::build::MAX_POS_IDS;
37use crate::dic::grammar::Grammar;
38use crate::dic::word_id::WordId;
39use crate::dic::POS_DEPTH;
40use crate::error::SudachiResult;
41
42#[cfg(test)]
43mod test;
44
45#[cfg(test)]
46mod wordinfo_test;
47
48#[derive(Hash, Eq, PartialEq)]
49pub struct StrPosEntry {
50    data: [Cow<'static, str>; POS_DEPTH],
51}
52
53impl<'a> Borrow<[Cow<'a, str>; POS_DEPTH]> for StrPosEntry {
54    fn borrow(&self) -> &[Cow<'a, str>; POS_DEPTH] {
55        &self.data
56    }
57}
58
59impl<'a> Equivalent<[Cow<'a, str>; POS_DEPTH]> for StrPosEntry {
60    fn equivalent(&self, key: &[Cow<'_, str>; POS_DEPTH]) -> bool {
61        self.data.eq(key)
62    }
63}
64
65impl StrPosEntry {
66    /// owning means 'static
67    fn rewrap(data: Cow<str>) -> Cow<'static, str> {
68        match data {
69            Cow::Borrowed(b) => Cow::Owned(b.to_owned()),
70            Cow::Owned(s) => Cow::Owned(s),
71        }
72    }
73
74    pub fn new(data: [Cow<str>; POS_DEPTH]) -> Self {
75        let [d1, d2, d3, d4, d5, d6] = data;
76        let owned: [Cow<'static, str>; POS_DEPTH] = [
77            Self::rewrap(d1),
78            Self::rewrap(d2),
79            Self::rewrap(d3),
80            Self::rewrap(d4),
81            Self::rewrap(d5),
82            Self::rewrap(d6),
83        ];
84        Self { data: owned }
85    }
86
87    pub fn from_built_pos(data: &Vec<String>) -> Self {
88        let mut iter = data.iter().map(|x| x.as_str());
89        let p1 = Cow::Borrowed(iter.next().unwrap());
90        let p2 = Cow::Borrowed(iter.next().unwrap());
91        let p3 = Cow::Borrowed(iter.next().unwrap());
92        let p4 = Cow::Borrowed(iter.next().unwrap());
93        let p5 = Cow::Borrowed(iter.next().unwrap());
94        let p6 = Cow::Borrowed(iter.next().unwrap());
95        Self::new([p1, p2, p3, p4, p5, p6])
96    }
97
98    pub fn fields(&self) -> &[Cow<'static, str>; 6] {
99        &self.data
100    }
101}
102
103impl Debug for StrPosEntry {
104    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
105        write!(
106            f,
107            "{},{},{},{},{},{}",
108            self.data[0], self.data[1], self.data[2], self.data[3], self.data[4], self.data[5]
109        )
110    }
111}
112
113#[derive(PartialEq, Eq, Debug)]
114pub(crate) enum SplitUnit {
115    Ref(WordId),
116    Inline {
117        surface: String,
118        pos: u16,
119        reading: Option<String>,
120    },
121}
122
123impl SplitUnit {
124    pub fn format(&self, lexicon: &LexiconReader) -> String {
125        match self {
126            SplitUnit::Ref(id) => id.as_raw().to_string(),
127            SplitUnit::Inline {
128                surface,
129                pos,
130                reading,
131            } => format!(
132                "{},{:?},{}",
133                surface,
134                lexicon.pos_obj(*pos).unwrap(),
135                reading.as_ref().unwrap_or(surface)
136            ),
137        }
138    }
139}
140
141pub(crate) trait SplitUnitResolver {
142    fn resolve(&self, unit: &SplitUnit) -> Option<WordId> {
143        match unit {
144            SplitUnit::Ref(wid) => Some(*wid),
145            SplitUnit::Inline {
146                surface,
147                pos,
148                reading,
149            } => self.resolve_inline(surface, *pos, reading.as_deref()),
150        }
151    }
152
153    fn resolve_inline(&self, surface: &str, pos: u16, reading: Option<&str>) -> Option<WordId>;
154}
155
156pub(crate) struct RawLexiconEntry {
157    pub left_id: i16,
158    pub right_id: i16,
159    pub cost: i16,
160    pub surface: String,
161    pub headword: Option<String>,
162    pub dic_form: WordId,
163    pub norm_form: Option<String>,
164    pub pos: u16,
165    pub splits_a: Vec<SplitUnit>,
166    pub splits_b: Vec<SplitUnit>,
167    pub reading: Option<String>,
168    #[allow(unused)]
169    pub splitting: Mode,
170    pub word_structure: Vec<WordId>,
171    pub synonym_groups: Vec<u32>,
172}
173
174impl RawLexiconEntry {
175    pub fn surface(&self) -> &str {
176        &self.surface
177    }
178
179    pub fn headword(&self) -> &str {
180        self.headword.as_deref().unwrap_or_else(|| self.surface())
181    }
182
183    pub fn norm_form(&self) -> &str {
184        self.norm_form.as_deref().unwrap_or_else(|| self.headword())
185    }
186
187    pub fn reading(&self) -> &str {
188        self.reading.as_deref().unwrap_or_else(|| self.headword())
189    }
190
191    pub fn should_index(&self) -> bool {
192        self.left_id >= 0
193    }
194
195    pub fn write_params<W: Write>(&self, w: &mut W) -> DicWriteResult<usize> {
196        w.write_all(&self.left_id.to_le_bytes())?;
197        w.write_all(&self.right_id.to_le_bytes())?;
198        w.write_all(&self.cost.to_le_bytes())?;
199        Ok(6)
200    }
201
202    pub fn write_word_info<W: Write>(
203        &self,
204        u16w: &mut Utf16Writer,
205        w: &mut W,
206    ) -> DicWriteResult<usize> {
207        let mut size = 0;
208
209        size += u16w.write(w, self.headword())?; // surface of WordInfo
210        size += u16w.write_len(w, self.surface.len())?; // surface for trie
211        w.write_all(&self.pos.to_le_bytes())?;
212        size += 2;
213        size += u16w.write_empty_if_equal(w, self.norm_form(), self.headword())?;
214        w.write_all(&self.dic_form.as_raw().to_le_bytes())?;
215        size += 4;
216        size += u16w.write_empty_if_equal(w, self.reading(), self.headword())?;
217        size += write_u32_array(w, &self.splits_a)?;
218        size += write_u32_array(w, &self.splits_b)?;
219        size += write_u32_array(w, &self.word_structure)?;
220        size += write_u32_array(w, &self.synonym_groups)?;
221
222        Ok(size)
223    }
224}
225
226pub struct LexiconReader {
227    pos: IndexMap<StrPosEntry, u16>,
228    ctx: DicCompilationCtx,
229    entries: Vec<RawLexiconEntry>,
230    unresolved: usize,
231    start_pos: usize,
232    max_left: i16,
233    max_right: i16,
234    num_system: usize,
235}
236
237impl LexiconReader {
238    pub fn new() -> Self {
239        Self {
240            pos: IndexMap::new(),
241            ctx: DicCompilationCtx::default(),
242            entries: Vec::new(),
243            unresolved: 0,
244            start_pos: 0,
245            max_left: i16::MAX,
246            max_right: i16::MAX,
247            num_system: usize::MAX,
248        }
249    }
250
251    pub(crate) fn entries(&self) -> &[RawLexiconEntry] {
252        &self.entries
253    }
254
255    pub fn needs_split_resolution(&self) -> bool {
256        self.unresolved > 0
257    }
258
259    pub fn set_max_conn_sizes(&mut self, left: i16, right: i16) {
260        self.max_left = left;
261        self.max_right = right;
262    }
263
264    pub fn set_num_system_words(&mut self, num: usize) {
265        self.num_system = num;
266    }
267
268    pub fn preload_pos(&mut self, grammar: &Grammar) {
269        assert_eq!(self.pos.len(), 0);
270        for (i, pos) in grammar.pos_list.iter().enumerate() {
271            let key = StrPosEntry::from_built_pos(pos);
272            self.pos.insert(key, i as u16);
273        }
274        self.start_pos = self.pos.len();
275    }
276
277    pub(crate) fn pos_obj(&self, pos_id: u16) -> Option<&StrPosEntry> {
278        self.pos.get_index(pos_id as usize).map(|(k, v)| {
279            assert_eq!(v, &pos_id);
280            k
281        })
282    }
283
284    pub fn read_file(&mut self, path: &Path) -> SudachiResult<usize> {
285        let file = File::open(path)?;
286        let map = unsafe { Mmap::map(&file) }?;
287        let filename = path.to_str().unwrap_or("<invalid-utf8>").to_owned();
288        let old_name = self.ctx.set_filename(filename);
289        let res = self.read_bytes(&map);
290        self.ctx.set_filename(old_name);
291        res
292    }
293
294    pub fn read_bytes(&mut self, data: &[u8]) -> SudachiResult<usize> {
295        let mut reader = csv::ReaderBuilder::new()
296            .has_headers(false)
297            .trim(Trim::None)
298            .flexible(true)
299            .from_reader(data);
300        let mut nread = 0;
301        for record in reader.records() {
302            match record {
303                Ok(r) => {
304                    let line = r.position().map_or(0, |p| p.line()) as usize;
305                    self.ctx.set_line(line);
306                    self.read_record(&r)?;
307                    nread += 1;
308                }
309                Err(e) => {
310                    let line = e.position().map_or(0, |p| p.line()) as usize;
311                    self.ctx.set_line(line);
312                    return Err(self.ctx.to_sudachi_err(BuildFailure::CsvError(e)));
313                }
314            }
315        }
316        Ok(nread)
317    }
318
319    fn read_record(&mut self, data: &StringRecord) -> SudachiResult<()> {
320        self.parse_record(data).map(|r| self.entries.push(r))
321    }
322
323    fn parse_record(&mut self, data: &StringRecord) -> SudachiResult<RawLexiconEntry> {
324        let ctx = std::mem::take(&mut self.ctx);
325        let rec = RecordWrapper { record: data, ctx };
326        let surface = rec.get(0, "(0) surface", unescape)?;
327        let left_id = rec.get(1, "(1) left_id", parse_i16)?;
328        let right_id = rec.get(2, "(2) right_id", parse_i16)?;
329        let cost = rec.get(3, "(3) cost", parse_i16)?;
330
331        let headword = rec.get(4, "(4) headword", unescape_cow)?;
332
333        let p1 = rec.get(5, "(5) pos-1", unescape_cow)?;
334        let p2 = rec.get(6, "(6) pos-2", unescape_cow)?;
335        let p3 = rec.get(7, "(7) pos-3", unescape_cow)?;
336        let p4 = rec.get(8, "(8) pos-4", unescape_cow)?;
337        let p5 = rec.get(9, "(9) pos-conj-1", unescape_cow)?;
338        let p6 = rec.get(10, "(10) pos-conj-2", unescape_cow)?;
339
340        let reading = rec.get(11, "(11) reading", unescape_cow)?;
341        let normalized = rec.get(12, "(12) normalized", unescape_cow)?;
342        let dic_form_id = rec.get(13, "(13) dic-form", parse_dic_form)?;
343        let splitting = rec.get(14, "(14) splitting", parse_mode)?;
344        let (split_a, resolve_a) = rec.get(15, "(15) split-a", |s| self.parse_splits(s))?;
345        let (split_b, resolve_b) = rec.get(16, "(16) split-b", |s| self.parse_splits(s))?;
346        let parts = rec.get(17, "(17) word-structure", parse_wordid_list)?;
347        let synonyms = rec.get_or_default(18, "(18) synonym-group", parse_u32_list)?;
348
349        let pos = rec.ctx.transform(self.pos_of([p1, p2, p3, p4, p5, p6]))?;
350
351        if splitting == Mode::A && (!split_a.is_empty() || !split_b.is_empty()) {
352            return rec.ctx.err(BuildFailure::InvalidSplit(
353                "A-mode tokens can't have splits".to_owned(),
354            ));
355        }
356
357        self.unresolved += resolve_a + resolve_b;
358
359        if surface.is_empty() {
360            return rec.ctx.err(BuildFailure::EmptySurface);
361        }
362
363        self.ctx = rec.ctx;
364
365        let entry = RawLexiconEntry {
366            left_id,
367            right_id,
368            cost,
369            dic_form: dic_form_id,
370            norm_form: none_if_equal(&headword, normalized),
371            reading: none_if_equal(&headword, reading),
372            headword: none_if_equal(&surface, headword),
373            surface,
374            pos,
375            splitting,
376            splits_a: split_a,
377            splits_b: split_b,
378            word_structure: parts,
379            synonym_groups: synonyms,
380        };
381
382        Ok(entry)
383    }
384
385    fn pos_of(&mut self, data: [Cow<str>; POS_DEPTH]) -> DicWriteResult<u16> {
386        match self.pos.get(&data) {
387            Some(pos) => Ok(*pos),
388            None => {
389                let key = StrPosEntry::new(data);
390                let pos_id = self.pos.len();
391                if pos_id > MAX_POS_IDS {
392                    Err(BuildFailure::PosLimitExceeded(format!("{:?}", key)))
393                } else {
394                    let pos_id = pos_id as u16;
395                    self.pos.insert(key, pos_id);
396                    Ok(pos_id)
397                }
398            }
399        }
400    }
401
402    pub fn validate_entries(&self) -> SudachiResult<()> {
403        let mut ctx = DicCompilationCtx::default();
404        ctx.set_filename("<entry id>".to_owned());
405        ctx.set_line(0);
406        let (max_0, max_1) = match self.num_system {
407            // means that we compile system dictionary, there must not be user words
408            usize::MAX => (self.entries.len(), 0),
409            // compiling user dictionary
410            x => (x, self.entries.len()),
411        };
412        for e in self.entries.iter() {
413            if e.left_id >= self.max_left {
414                return ctx.err(BuildFailure::InvalidFieldSize {
415                    actual: e.left_id as _,
416                    expected: self.max_left as _,
417                    field: "left_id",
418                });
419            }
420
421            if e.right_id >= self.max_right {
422                return ctx.err(BuildFailure::InvalidFieldSize {
423                    actual: e.right_id as _,
424                    expected: self.max_right as _,
425                    field: "right_id",
426                });
427            }
428
429            if e.dic_form != WordId::INVALID {
430                ctx.transform(Self::validate_wid(e.dic_form, max_0, max_1, "dic_form"))?;
431            }
432
433            for s in e.splits_a.iter() {
434                match s {
435                    SplitUnit::Ref(wid) => {
436                        ctx.transform(Self::validate_wid(*wid, max_0, max_1, "splits_a"))?;
437                    }
438                    _ => panic!("at this point there must not be unresolved splits"),
439                }
440            }
441
442            for s in e.splits_b.iter() {
443                match s {
444                    SplitUnit::Ref(wid) => {
445                        ctx.transform(Self::validate_wid(*wid, max_0, max_1, "splits_b"))?;
446                    }
447                    _ => panic!("at this point there must not be unresolved splits"),
448                }
449            }
450
451            for wid in e.word_structure.iter() {
452                ctx.transform(Self::validate_wid(*wid, max_0, max_1, "word_structure"))?;
453            }
454
455            ctx.add_line(1);
456        }
457        Ok(())
458    }
459
460    fn validate_wid(
461        wid: WordId,
462        dic0_max: usize,
463        dic1_max: usize,
464        label: &'static str,
465    ) -> DicWriteResult<()> {
466        let max = match wid.dic() {
467            0 => dic0_max,
468            1 => dic1_max,
469            x => panic!("invalid dictionary ID={}, should not happen", x),
470        };
471        if wid.word() >= max as u32 {
472            return Err(BuildFailure::InvalidFieldSize {
473                actual: wid.word() as _,
474                expected: max,
475                field: label,
476            });
477        }
478        Ok(())
479    }
480
481    fn parse_splits(&mut self, data: &str) -> DicWriteResult<(Vec<SplitUnit>, usize)> {
482        if data.is_empty() || data == "*" {
483            return Ok((Vec::new(), 0));
484        }
485
486        parse_slash_list(data, |s| self.parse_split(s)).map(|splits| {
487            let unresolved = splits
488                .iter()
489                .map(|s| match s {
490                    SplitUnit::Inline { .. } => 1,
491                    _ => 0,
492                })
493                .sum();
494            (splits, unresolved)
495        })
496    }
497
498    fn parse_split(&mut self, data: &str) -> DicWriteResult<SplitUnit> {
499        if WORD_ID_LITERAL.is_match(data) {
500            Ok(SplitUnit::Ref(parse_wordid(data)?))
501        } else {
502            let mut iter = data.splitn(8, ',');
503            let surface = it_next(data, &mut iter, "(1) surface", unescape)?;
504            let p1 = it_next(data, &mut iter, "(2) pos-1", unescape_cow)?;
505            let p2 = it_next(data, &mut iter, "(3) pos-2", unescape_cow)?;
506            let p3 = it_next(data, &mut iter, "(4) pos-3", unescape_cow)?;
507            let p4 = it_next(data, &mut iter, "(5) pos-4", unescape_cow)?;
508            let p5 = it_next(data, &mut iter, "(6) pos-conj-1", unescape_cow)?;
509            let p6 = it_next(data, &mut iter, "(7) pos-conj-2", unescape_cow)?;
510            let reading = it_next(data, &mut iter, "(8) surface", unescape_cow)?;
511
512            let pos = self.pos_of([p1, p2, p3, p4, p5, p6])?;
513            Ok(SplitUnit::Inline {
514                pos,
515                reading: none_if_equal(&surface, reading),
516                surface,
517            })
518        }
519    }
520
521    pub fn write_pos_table<W: Write>(&self, w: &mut W) -> SudachiResult<usize> {
522        let mut u16w = Utf16Writer::new();
523        let real_count = self.pos.len() - self.start_pos;
524        w.write_all(&u16::to_le_bytes(real_count as u16))?;
525        let mut written_bytes = 2;
526        let mut ctx = DicCompilationCtx::default();
527        ctx.set_filename("<pos-table>".to_owned());
528        for (row, pos_id) in self.pos.iter() {
529            if (*pos_id as usize) < self.start_pos {
530                continue;
531            }
532            for field in row.fields() {
533                ctx.apply(|| u16w.write(w, field).map(|written| written_bytes += written))?;
534            }
535            ctx.add_line(1);
536        }
537        Ok(written_bytes)
538    }
539
540    //noinspection DuplicatedCode
541    pub(crate) fn resolve_splits<R: SplitUnitResolver>(
542        &mut self,
543        resolver: &R,
544    ) -> Result<usize, (String, usize)> {
545        let mut total = 0;
546        for (line, e) in self.entries.iter_mut().enumerate() {
547            for s in e.splits_a.iter_mut() {
548                match Self::resolve_split(s, resolver) {
549                    Some(val) => total += val,
550                    None => {
551                        // at this point s is a read only borrow,
552                        // but borrow checker does not allow to do this cleanly
553                        // self conflicts with splits_a borrow
554                        let s: &SplitUnit = unsafe { std::mem::transmute(&*s) };
555                        let split_info = s.format(self);
556                        return Err((split_info, line));
557                    }
558                }
559            }
560            for s in e.splits_b.iter_mut() {
561                match Self::resolve_split(s, resolver) {
562                    Some(val) => total += val,
563                    None => {
564                        // at this point s is a read only borrow,
565                        // but borrow checker does not allow to do this cleanly
566                        // self conflicts with splits_b borrow
567                        let s: &SplitUnit = unsafe { std::mem::transmute(&*s) };
568                        let split_info = s.format(self);
569                        return Err((split_info, line));
570                    }
571                }
572            }
573        }
574        Ok(total)
575    }
576
577    fn resolve_split<R: SplitUnitResolver>(unit: &mut SplitUnit, resolver: &R) -> Option<usize> {
578        match unit {
579            SplitUnit::Ref(_) => Some(0),
580            _ => {
581                let wid = resolver.resolve(&*unit)?;
582                *unit = SplitUnit::Ref(wid);
583                Some(1)
584            }
585        }
586    }
587}
588
589struct RecordWrapper<'a> {
590    pub record: &'a StringRecord,
591    pub ctx: DicCompilationCtx,
592}
593
594impl<'a> RecordWrapper<'a> {
595    #[inline(always)]
596    fn get<T, F>(&self, idx: usize, name: &'static str, f: F) -> SudachiResult<T>
597    where
598        F: FnOnce(&'a str) -> DicWriteResult<T>,
599    {
600        match self.record.get(idx) {
601            Some(s) => self.ctx.transform(f(s)),
602            None => self.ctx.err(BuildFailure::NoRawField(name)),
603        }
604    }
605
606    #[inline(always)]
607    fn get_or_default<T, F>(&self, idx: usize, _name: &'static str, f: F) -> SudachiResult<T>
608    where
609        F: FnOnce(&'a str) -> DicWriteResult<T>,
610        T: Default,
611    {
612        match self.record.get(idx) {
613            Some(s) => self.ctx.transform(f(s)),
614            None => Ok(<T as Default>::default()),
615        }
616    }
617}
618
619pub struct LexiconWriter<'a> {
620    entries: &'a [RawLexiconEntry],
621    u16: Utf16Writer,
622    buffer: Vec<u8>,
623    offset: usize,
624    reporter: &'a mut Reporter,
625}
626
627impl<'a> LexiconWriter<'a> {
628    pub(crate) fn new(
629        entries: &'a [RawLexiconEntry],
630        offset: usize,
631        reporter: &'a mut Reporter,
632    ) -> Self {
633        Self {
634            buffer: Vec::with_capacity(entries.len() * 32),
635            entries,
636            u16: Utf16Writer::new(),
637            offset,
638            reporter,
639        }
640    }
641
642    pub fn write<W: Write>(&mut self, w: &mut W) -> SudachiResult<usize> {
643        let mut ctx = DicCompilationCtx::memory();
644        ctx.set_filename("<write entries>".to_owned());
645        let mut total = 4;
646
647        let num_entries = self.entries.len() as u32;
648        w.write_all(&num_entries.to_le_bytes())?;
649
650        let rep = ReportBuilder::new("word_params");
651        ctx.set_line(0);
652        for e in self.entries {
653            total += ctx.transform(e.write_params(w))?;
654            ctx.add_line(1);
655        }
656        self.reporter.collect(total, rep);
657        let start = total;
658
659        let rep = ReportBuilder::new("wordinfo_offsets");
660        ctx.set_line(0);
661        let offset_base = self.offset + (6 + 4) * self.entries.len() + 4;
662        let mut word_offset = 0;
663        for e in self.entries {
664            let u32_offset = (offset_base + word_offset) as u32;
665            w.write_all(&u32_offset.to_le_bytes())?;
666            let size = ctx.transform(e.write_word_info(&mut self.u16, &mut self.buffer))?;
667            word_offset += size;
668            total += 4;
669            ctx.add_line(1);
670        }
671        self.reporter.collect(total - start, rep);
672
673        let rep = ReportBuilder::new("wordinfos (copy only)");
674        let info_size = self.buffer.len();
675        w.write_all(&self.buffer)?;
676        self.reporter.collect(info_size, rep);
677
678        Ok(total + info_size)
679    }
680}