1use std::borrow::{Borrow, Cow};
18use std::fmt::{Debug, Formatter};
19use std::fs::File;
20use std::io::Write;
21use std::path::Path;
22
23use csv::{StringRecord, Trim};
24use indexmap::map::IndexMap;
25use indexmap::Equivalent;
26use memmap2::Mmap;
27
28use crate::analysis::Mode;
29use crate::dic::build::error::{BuildFailure, DicCompilationCtx, DicWriteResult};
30use crate::dic::build::parse::{
31 it_next, none_if_equal, parse_dic_form, parse_i16, parse_mode, parse_slash_list,
32 parse_u32_list, parse_wordid, parse_wordid_list, unescape, unescape_cow, WORD_ID_LITERAL,
33};
34use crate::dic::build::primitives::{write_u32_array, Utf16Writer};
35use crate::dic::build::report::{ReportBuilder, Reporter};
36use crate::dic::build::MAX_POS_IDS;
37use crate::dic::grammar::Grammar;
38use crate::dic::word_id::WordId;
39use crate::dic::POS_DEPTH;
40use crate::error::SudachiResult;
41
42#[cfg(test)]
43mod test;
44
45#[cfg(test)]
46mod wordinfo_test;
47
/// An owned part-of-speech tag used as a key in the POS deduplication map.
///
/// Holds `POS_DEPTH` (= 6, per the CSV columns parsed below: pos-1..pos-4 plus
/// two conjugation fields) string fields as `Cow<'static, str>` so the entry
/// owns its data and can outlive the CSV buffer it was parsed from.
#[derive(Hash, Eq, PartialEq)]
pub struct StrPosEntry {
    // Always fully owned (`Cow::Owned`) after construction; see `rewrap`.
    data: [Cow<'static, str>; POS_DEPTH],
}
52
// Allows map lookups keyed by a borrowed field array without first building an
// owned `StrPosEntry`. Sound for any `'a` because `&[Cow<'static, str>; N]`
// coerces to `&[Cow<'a, str>; N]` (lifetimes in `Cow` are covariant).
impl<'a> Borrow<[Cow<'a, str>; POS_DEPTH]> for StrPosEntry {
    fn borrow(&self) -> &[Cow<'a, str>; POS_DEPTH] {
        &self.data
    }
}
58
// IndexMap's `Equivalent` counterpart to the `Borrow` impl above: lets
// `self.pos.get(&[Cow<str>; POS_DEPTH])` compare a borrowed candidate key
// against stored entries without allocation.
impl<'a> Equivalent<[Cow<'a, str>; POS_DEPTH]> for StrPosEntry {
    fn equivalent(&self, key: &[Cow<'_, str>; POS_DEPTH]) -> bool {
        self.data.eq(key)
    }
}
64
65impl StrPosEntry {
66 fn rewrap(data: Cow<str>) -> Cow<'static, str> {
68 match data {
69 Cow::Borrowed(b) => Cow::Owned(b.to_owned()),
70 Cow::Owned(s) => Cow::Owned(s),
71 }
72 }
73
74 pub fn new(data: [Cow<str>; POS_DEPTH]) -> Self {
75 let [d1, d2, d3, d4, d5, d6] = data;
76 let owned: [Cow<'static, str>; POS_DEPTH] = [
77 Self::rewrap(d1),
78 Self::rewrap(d2),
79 Self::rewrap(d3),
80 Self::rewrap(d4),
81 Self::rewrap(d5),
82 Self::rewrap(d6),
83 ];
84 Self { data: owned }
85 }
86
87 pub fn from_built_pos(data: &Vec<String>) -> Self {
88 let mut iter = data.iter().map(|x| x.as_str());
89 let p1 = Cow::Borrowed(iter.next().unwrap());
90 let p2 = Cow::Borrowed(iter.next().unwrap());
91 let p3 = Cow::Borrowed(iter.next().unwrap());
92 let p4 = Cow::Borrowed(iter.next().unwrap());
93 let p5 = Cow::Borrowed(iter.next().unwrap());
94 let p6 = Cow::Borrowed(iter.next().unwrap());
95 Self::new([p1, p2, p3, p4, p5, p6])
96 }
97
98 pub fn fields(&self) -> &[Cow<'static, str>; 6] {
99 &self.data
100 }
101}
102
103impl Debug for StrPosEntry {
104 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
105 write!(
106 f,
107 "{},{},{},{},{},{}",
108 self.data[0], self.data[1], self.data[2], self.data[3], self.data[4], self.data[5]
109 )
110 }
111}
112
/// One element of an A/B split description from the lexicon CSV.
#[derive(PartialEq, Eq, Debug)]
pub(crate) enum SplitUnit {
    /// Already resolved to a concrete word id.
    Ref(WordId),
    /// Referenced indirectly by (surface, POS id, reading); resolved to a
    /// `Ref` later via `SplitUnitResolver` (see `resolve_splits`).
    Inline {
        surface: String,
        pos: u16,
        /// `None` when the reading equals the surface (see `none_if_equal`
        /// usage in `parse_split`).
        reading: Option<String>,
    },
}
122
123impl SplitUnit {
124 pub fn format(&self, lexicon: &LexiconReader) -> String {
125 match self {
126 SplitUnit::Ref(id) => id.as_raw().to_string(),
127 SplitUnit::Inline {
128 surface,
129 pos,
130 reading,
131 } => format!(
132 "{},{:?},{}",
133 surface,
134 lexicon.pos_obj(*pos).unwrap(),
135 reading.as_ref().unwrap_or(surface)
136 ),
137 }
138 }
139}
140
/// Resolves inline split references into concrete word ids.
pub(crate) trait SplitUnitResolver {
    /// Returns the word id for `unit`, or `None` when it cannot be resolved.
    /// `Ref` units are already resolved and are returned as-is.
    fn resolve(&self, unit: &SplitUnit) -> Option<WordId> {
        match unit {
            SplitUnit::Ref(wid) => Some(*wid),
            SplitUnit::Inline {
                surface,
                pos,
                reading,
            } => self.resolve_inline(surface, *pos, reading.as_deref()),
        }
    }

    /// Looks up a word by surface, POS id and optional reading.
    fn resolve_inline(&self, surface: &str, pos: u16, reading: Option<&str>) -> Option<WordId>;
}
155
/// One lexicon entry parsed from a CSV row, before binary serialization.
///
/// Several string fields are stored as `Option` and set to `None` when equal
/// to their fallback (see the accessors in the `impl` below) to save memory.
pub(crate) struct RawLexiconEntry {
    // Connection-matrix ids and word cost; a negative left_id marks an entry
    // that is not indexed in the trie (see `should_index`).
    pub left_id: i16,
    pub right_id: i16,
    pub cost: i16,
    // Lookup (trie) key of the entry.
    pub surface: String,
    // Display form; None when identical to `surface`.
    pub headword: Option<String>,
    // Word id of the dictionary form; may be WordId::INVALID (see
    // `validate_entries`).
    pub dic_form: WordId,
    // Normalized form; None when identical to the headword.
    pub norm_form: Option<String>,
    // Index into the reader's POS table.
    pub pos: u16,
    // A-mode and B-mode split descriptions.
    pub splits_a: Vec<SplitUnit>,
    pub splits_b: Vec<SplitUnit>,
    // Reading; None when identical to the headword.
    pub reading: Option<String>,
    #[allow(unused)]
    pub splitting: Mode,
    pub word_structure: Vec<WordId>,
    pub synonym_groups: Vec<u32>,
}
173
impl RawLexiconEntry {
    /// Lookup (trie) key of the entry.
    pub fn surface(&self) -> &str {
        &self.surface
    }

    /// Display form; falls back to `surface` when not stored separately.
    pub fn headword(&self) -> &str {
        self.headword.as_deref().unwrap_or_else(|| self.surface())
    }

    /// Normalized form; falls back to `headword` when not stored separately.
    pub fn norm_form(&self) -> &str {
        self.norm_form.as_deref().unwrap_or_else(|| self.headword())
    }

    /// Reading form; falls back to `headword` when not stored separately.
    pub fn reading(&self) -> &str {
        self.reading.as_deref().unwrap_or_else(|| self.headword())
    }

    /// Entries with a negative left connection id are not indexed in the trie.
    pub fn should_index(&self) -> bool {
        self.left_id >= 0
    }

    /// Writes the three connection parameters (left, right, cost) as
    /// little-endian `i16`s. Returns the number of bytes written (always 6).
    pub fn write_params<W: Write>(&self, w: &mut W) -> DicWriteResult<usize> {
        w.write_all(&self.left_id.to_le_bytes())?;
        w.write_all(&self.right_id.to_le_bytes())?;
        w.write_all(&self.cost.to_le_bytes())?;
        Ok(6)
    }

    /// Serializes the WordInfo portion of the entry and returns the number of
    /// bytes written.
    ///
    /// Field order must match the runtime WordInfo reader exactly; fields that
    /// equal their fallback are written as empty strings
    /// (`write_empty_if_equal`) to keep the binary compact.
    pub fn write_word_info<W: Write>(
        &self,
        u16w: &mut Utf16Writer,
        w: &mut W,
    ) -> DicWriteResult<usize> {
        let mut size = 0;

        // Headword string, then the surface length, then the POS id.
        // NOTE(review): the length written is `surface.len()` (bytes of the
        // trie key) while the string written is the headword — presumably the
        // runtime reader expects exactly this pair; confirm against the
        // WordInfo deserializer before touching the order.
        size += u16w.write(w, self.headword())?;
        size += u16w.write_len(w, self.surface.len())?;
        w.write_all(&self.pos.to_le_bytes())?;
        size += 2;
        size += u16w.write_empty_if_equal(w, self.norm_form(), self.headword())?;
        w.write_all(&self.dic_form.as_raw().to_le_bytes())?;
        size += 4;
        size += u16w.write_empty_if_equal(w, self.reading(), self.headword())?;
        size += write_u32_array(w, &self.splits_a)?;
        size += write_u32_array(w, &self.splits_b)?;
        size += write_u32_array(w, &self.word_structure)?;
        size += write_u32_array(w, &self.synonym_groups)?;

        Ok(size)
    }
}
225
/// Reads lexicon CSV files and accumulates entries plus a deduplicated POS
/// table for later binary serialization.
pub struct LexiconReader {
    // POS row -> id; insertion order defines the id (see `pos_of`/`pos_obj`).
    pos: IndexMap<StrPosEntry, u16>,
    // Current filename/line context for error reporting.
    ctx: DicCompilationCtx,
    entries: Vec<RawLexiconEntry>,
    // Number of `SplitUnit::Inline` references still awaiting resolution.
    unresolved: usize,
    // Count of POS rows preloaded from an existing grammar; those are skipped
    // by `write_pos_table`.
    start_pos: usize,
    // Connection-matrix bounds for validation; i16::MAX until configured via
    // `set_max_conn_sizes`.
    max_left: i16,
    max_right: i16,
    // System-dictionary word count when building a user dictionary;
    // usize::MAX (sentinel) means "building the system dictionary itself".
    num_system: usize,
}
236
impl LexiconReader {
    /// Creates an empty reader.
    ///
    /// `max_left`/`max_right`/`num_system` start at their MAX values as
    /// "not yet configured" sentinels; see `set_max_conn_sizes`,
    /// `set_num_system_words` and `validate_entries`.
    pub fn new() -> Self {
        Self {
            pos: IndexMap::new(),
            ctx: DicCompilationCtx::default(),
            entries: Vec::new(),
            unresolved: 0,
            start_pos: 0,
            max_left: i16::MAX,
            max_right: i16::MAX,
            num_system: usize::MAX,
        }
    }

    /// All entries parsed so far, in input order.
    pub(crate) fn entries(&self) -> &[RawLexiconEntry] {
        &self.entries
    }

    /// True when at least one inline split still needs resolution to a word
    /// id (see `resolve_splits`).
    pub fn needs_split_resolution(&self) -> bool {
        self.unresolved > 0
    }

    /// Sets the exclusive upper bounds for connection ids, checked by
    /// `validate_entries`.
    pub fn set_max_conn_sizes(&mut self, left: i16, right: i16) {
        self.max_left = left;
        self.max_right = right;
    }

    /// Sets the number of words in the system dictionary. Once set, validation
    /// treats the parsed entries as a user dictionary (dic id 1).
    pub fn set_num_system_words(&mut self, num: usize) {
        self.num_system = num;
    }

    /// Seeds the POS table from an already-built grammar so POS ids stay
    /// consistent with it. Must be called before any POS is registered (the
    /// table must still be empty). `start_pos` records the boundary so
    /// `write_pos_table` emits only newly added rows.
    pub fn preload_pos(&mut self, grammar: &Grammar) {
        assert_eq!(self.pos.len(), 0);
        for (i, pos) in grammar.pos_list.iter().enumerate() {
            let key = StrPosEntry::from_built_pos(pos);
            self.pos.insert(key, i as u16);
        }
        self.start_pos = self.pos.len();
    }

    /// Looks up a POS row by id. The assert checks the invariant that map
    /// insertion order equals the stored id (maintained by `pos_of`).
    pub(crate) fn pos_obj(&self, pos_id: u16) -> Option<&StrPosEntry> {
        self.pos.get_index(pos_id as usize).map(|(k, v)| {
            assert_eq!(v, &pos_id);
            k
        })
    }

    /// Reads a lexicon CSV file by memory-mapping it, temporarily swapping the
    /// error-reporting filename in and restoring the old one afterwards.
    ///
    /// Returns the number of records read.
    pub fn read_file(&mut self, path: &Path) -> SudachiResult<usize> {
        let file = File::open(path)?;
        // SAFETY(review): `Mmap::map` is unsafe because the mapping becomes
        // invalid if the file is truncated/modified concurrently — the build
        // pipeline presumably does not touch input files while building;
        // confirm.
        let map = unsafe { Mmap::map(&file) }?;
        let filename = path.to_str().unwrap_or("<invalid-utf8>").to_owned();
        let old_name = self.ctx.set_filename(filename);
        let res = self.read_bytes(&map);
        self.ctx.set_filename(old_name);
        res
    }

    /// Parses lexicon CSV data from a byte buffer. Returns the number of
    /// records read; stops at the first malformed CSV record.
    pub fn read_bytes(&mut self, data: &[u8]) -> SudachiResult<usize> {
        let mut reader = csv::ReaderBuilder::new()
            .has_headers(false)
            // Whitespace is significant in fields.
            .trim(Trim::None)
            // Rows may have a variable number of columns (the synonym-group
            // column is optional; see `get_or_default` in `parse_record`).
            .flexible(true)
            .from_reader(data);
        let mut nread = 0;
        for record in reader.records() {
            match record {
                Ok(r) => {
                    // Keep the error context pointed at the current CSV line.
                    let line = r.position().map_or(0, |p| p.line()) as usize;
                    self.ctx.set_line(line);
                    self.read_record(&r)?;
                    nread += 1;
                }
                Err(e) => {
                    let line = e.position().map_or(0, |p| p.line()) as usize;
                    self.ctx.set_line(line);
                    return Err(self.ctx.to_sudachi_err(BuildFailure::CsvError(e)));
                }
            }
        }
        Ok(nread)
    }

    /// Parses a single record and appends it to `entries`.
    fn read_record(&mut self, data: &StringRecord) -> SudachiResult<()> {
        self.parse_record(data).map(|r| self.entries.push(r))
    }

    /// Parses one CSV row into a `RawLexiconEntry`.
    ///
    /// The compilation context is moved out of `self` (`mem::take`) into the
    /// `RecordWrapper` so field accessors can attach context while `self` is
    /// still borrowable for `parse_splits`/`pos_of`; it is moved back before
    /// building the entry.
    /// NOTE(review): on the early error returns below, `self.ctx` is left
    /// default-initialized (the real ctx is consumed by `rec.ctx.err`) —
    /// presumably fine because compilation aborts on error; confirm.
    fn parse_record(&mut self, data: &StringRecord) -> SudachiResult<RawLexiconEntry> {
        let ctx = std::mem::take(&mut self.ctx);
        let rec = RecordWrapper { record: data, ctx };
        let surface = rec.get(0, "(0) surface", unescape)?;
        let left_id = rec.get(1, "(1) left_id", parse_i16)?;
        let right_id = rec.get(2, "(2) right_id", parse_i16)?;
        let cost = rec.get(3, "(3) cost", parse_i16)?;

        let headword = rec.get(4, "(4) headword", unescape_cow)?;

        let p1 = rec.get(5, "(5) pos-1", unescape_cow)?;
        let p2 = rec.get(6, "(6) pos-2", unescape_cow)?;
        let p3 = rec.get(7, "(7) pos-3", unescape_cow)?;
        let p4 = rec.get(8, "(8) pos-4", unescape_cow)?;
        let p5 = rec.get(9, "(9) pos-conj-1", unescape_cow)?;
        let p6 = rec.get(10, "(10) pos-conj-2", unescape_cow)?;

        let reading = rec.get(11, "(11) reading", unescape_cow)?;
        let normalized = rec.get(12, "(12) normalized", unescape_cow)?;
        let dic_form_id = rec.get(13, "(13) dic-form", parse_dic_form)?;
        let splitting = rec.get(14, "(14) splitting", parse_mode)?;
        let (split_a, resolve_a) = rec.get(15, "(15) split-a", |s| self.parse_splits(s))?;
        let (split_b, resolve_b) = rec.get(16, "(16) split-b", |s| self.parse_splits(s))?;
        let parts = rec.get(17, "(17) word-structure", parse_wordid_list)?;
        // Trailing optional column: absent means "no synonym groups".
        let synonyms = rec.get_or_default(18, "(18) synonym-group", parse_u32_list)?;

        // Registers the POS combination if unseen; may fail on overflow of
        // the POS id space.
        let pos = rec.ctx.transform(self.pos_of([p1, p2, p3, p4, p5, p6]))?;

        if splitting == Mode::A && (!split_a.is_empty() || !split_b.is_empty()) {
            return rec.ctx.err(BuildFailure::InvalidSplit(
                "A-mode tokens can't have splits".to_owned(),
            ));
        }

        self.unresolved += resolve_a + resolve_b;

        if surface.is_empty() {
            return rec.ctx.err(BuildFailure::EmptySurface);
        }

        // Restore the context taken at the top.
        self.ctx = rec.ctx;

        // Fields equal to their fallback are collapsed to None; the order
        // matters: `headword` is still borrowed by the first two
        // `none_if_equal` calls before being consumed by the third.
        let entry = RawLexiconEntry {
            left_id,
            right_id,
            cost,
            dic_form: dic_form_id,
            norm_form: none_if_equal(&headword, normalized),
            reading: none_if_equal(&headword, reading),
            headword: none_if_equal(&surface, headword),
            surface,
            pos,
            splitting,
            splits_a: split_a,
            splits_b: split_b,
            word_structure: parts,
            synonym_groups: synonyms,
        };

        Ok(entry)
    }

    /// Returns the id of the given POS combination, registering it if unseen.
    /// Fails when the table would exceed `MAX_POS_IDS`.
    fn pos_of(&mut self, data: [Cow<str>; POS_DEPTH]) -> DicWriteResult<u16> {
        // Lookup by borrowed array via the `Equivalent` impl — no allocation
        // on the hit path.
        match self.pos.get(&data) {
            Some(pos) => Ok(*pos),
            None => {
                let key = StrPosEntry::new(data);
                let pos_id = self.pos.len();
                // NOTE(review): `>` admits pos_id == MAX_POS_IDS, i.e.
                // MAX_POS_IDS + 1 distinct ids in total — confirm whether the
                // constant is a max id or a max count.
                if pos_id > MAX_POS_IDS {
                    Err(BuildFailure::PosLimitExceeded(format!("{:?}", key)))
                } else {
                    let pos_id = pos_id as u16;
                    self.pos.insert(key, pos_id);
                    Ok(pos_id)
                }
            }
        }
    }

    /// Validates connection ids and every word-id reference of all parsed
    /// entries. Requires splits to be fully resolved (panics otherwise).
    pub fn validate_entries(&self) -> SudachiResult<()> {
        let mut ctx = DicCompilationCtx::default();
        ctx.set_filename("<entry id>".to_owned());
        ctx.set_line(0);
        // Bounds for word-id validation per dictionary part:
        // - building the system dictionary (num_system still the sentinel):
        //   dic 0 refs must point into our own entries, dic 1 is invalid;
        // - building a user dictionary: dic 0 refs point into the system
        //   dictionary (num_system words), dic 1 refs into our own entries.
        let (max_0, max_1) = match self.num_system {
            usize::MAX => (self.entries.len(), 0),
            x => (x, self.entries.len()),
        };
        for e in self.entries.iter() {
            if e.left_id >= self.max_left {
                return ctx.err(BuildFailure::InvalidFieldSize {
                    actual: e.left_id as _,
                    expected: self.max_left as _,
                    field: "left_id",
                });
            }

            if e.right_id >= self.max_right {
                return ctx.err(BuildFailure::InvalidFieldSize {
                    actual: e.right_id as _,
                    expected: self.max_right as _,
                    field: "right_id",
                });
            }

            // INVALID means "no separate dictionary form" and is allowed.
            if e.dic_form != WordId::INVALID {
                ctx.transform(Self::validate_wid(e.dic_form, max_0, max_1, "dic_form"))?;
            }

            for s in e.splits_a.iter() {
                match s {
                    SplitUnit::Ref(wid) => {
                        ctx.transform(Self::validate_wid(*wid, max_0, max_1, "splits_a"))?;
                    }
                    _ => panic!("at this point there must not be unresolved splits"),
                }
            }

            for s in e.splits_b.iter() {
                match s {
                    SplitUnit::Ref(wid) => {
                        ctx.transform(Self::validate_wid(*wid, max_0, max_1, "splits_b"))?;
                    }
                    _ => panic!("at this point there must not be unresolved splits"),
                }
            }

            for wid in e.word_structure.iter() {
                ctx.transform(Self::validate_wid(*wid, max_0, max_1, "word_structure"))?;
            }

            // Error context reports the entry index, not a file line.
            ctx.add_line(1);
        }
        Ok(())
    }

    /// Checks that a word id points below the bound of its dictionary part
    /// (dic 0 = system, dic 1 = user; anything else is a builder bug).
    fn validate_wid(
        wid: WordId,
        dic0_max: usize,
        dic1_max: usize,
        label: &'static str,
    ) -> DicWriteResult<()> {
        let max = match wid.dic() {
            0 => dic0_max,
            1 => dic1_max,
            x => panic!("invalid dictionary ID={}, should not happen", x),
        };
        if wid.word() >= max as u32 {
            return Err(BuildFailure::InvalidFieldSize {
                actual: wid.word() as _,
                expected: max,
                field: label,
            });
        }
        Ok(())
    }

    /// Parses a slash-separated split list. Returns the units together with
    /// the number of still-unresolved (inline) references among them.
    /// Empty string and `"*"` both mean "no splits".
    fn parse_splits(&mut self, data: &str) -> DicWriteResult<(Vec<SplitUnit>, usize)> {
        if data.is_empty() || data == "*" {
            return Ok((Vec::new(), 0));
        }

        parse_slash_list(data, |s| self.parse_split(s)).map(|splits| {
            let unresolved = splits
                .iter()
                .map(|s| match s {
                    SplitUnit::Inline { .. } => 1,
                    _ => 0,
                })
                .sum();
            (splits, unresolved)
        })
    }

    /// Parses a single split element: either a word-id literal (`Ref`) or an
    /// inline 8-field reference `surface,pos*6,reading`.
    fn parse_split(&mut self, data: &str) -> DicWriteResult<SplitUnit> {
        if WORD_ID_LITERAL.is_match(data) {
            Ok(SplitUnit::Ref(parse_wordid(data)?))
        } else {
            // splitn(8, ...) keeps any extra commas inside the final (reading)
            // field.
            let mut iter = data.splitn(8, ',');
            let surface = it_next(data, &mut iter, "(1) surface", unescape)?;
            let p1 = it_next(data, &mut iter, "(2) pos-1", unescape_cow)?;
            let p2 = it_next(data, &mut iter, "(3) pos-2", unescape_cow)?;
            let p3 = it_next(data, &mut iter, "(4) pos-3", unescape_cow)?;
            let p4 = it_next(data, &mut iter, "(5) pos-4", unescape_cow)?;
            let p5 = it_next(data, &mut iter, "(6) pos-conj-1", unescape_cow)?;
            let p6 = it_next(data, &mut iter, "(7) pos-conj-2", unescape_cow)?;
            // NOTE(review): the error label says "(8) surface" but this is
            // the reading field — looks like a copy-paste in the label only;
            // confirm and fix the string separately.
            let reading = it_next(data, &mut iter, "(8) surface", unescape_cow)?;

            let pos = self.pos_of([p1, p2, p3, p4, p5, p6])?;
            Ok(SplitUnit::Inline {
                pos,
                reading: none_if_equal(&surface, reading),
                surface,
            })
        }
    }

    /// Serializes the POS table: a u16 row count followed by the six UTF-16
    /// string fields of each row. Rows preloaded from an existing grammar
    /// (`id < start_pos`) are skipped — they already live in that grammar.
    /// Returns the number of bytes written.
    pub fn write_pos_table<W: Write>(&self, w: &mut W) -> SudachiResult<usize> {
        let mut u16w = Utf16Writer::new();
        let real_count = self.pos.len() - self.start_pos;
        w.write_all(&u16::to_le_bytes(real_count as u16))?;
        let mut written_bytes = 2;
        let mut ctx = DicCompilationCtx::default();
        ctx.set_filename("<pos-table>".to_owned());
        for (row, pos_id) in self.pos.iter() {
            if (*pos_id as usize) < self.start_pos {
                continue;
            }
            for field in row.fields() {
                ctx.apply(|| u16w.write(w, field).map(|written| written_bytes += written))?;
            }
            ctx.add_line(1);
        }
        Ok(written_bytes)
    }

    /// Rewrites every inline split reference into a resolved `Ref` using
    /// `resolver`. Returns the number of newly resolved references, or
    /// `Err((formatted split, entry index))` for the first unresolvable one.
    pub(crate) fn resolve_splits<R: SplitUnitResolver>(
        &mut self,
        resolver: &R,
    ) -> Result<usize, (String, usize)> {
        let mut total = 0;
        for (line, e) in self.entries.iter_mut().enumerate() {
            for s in e.splits_a.iter_mut() {
                match Self::resolve_split(s, resolver) {
                    Some(val) => total += val,
                    None => {
                        // SAFETY(review): the transmute only erases the borrow
                        // lifetime of `s` (the type is unchanged) so that
                        // `self` — currently mutably borrowed through
                        // `entries.iter_mut()` — can be re-borrowed immutably
                        // for the diagnostic `format` call, which reads only
                        // the POS table; we return immediately afterwards.
                        let s: &SplitUnit = unsafe { std::mem::transmute(&*s) };
                        let split_info = s.format(self);
                        return Err((split_info, line));
                    }
                }
            }
            for s in e.splits_b.iter_mut() {
                match Self::resolve_split(s, resolver) {
                    Some(val) => total += val,
                    None => {
                        // SAFETY(review): same lifetime-erasing transmute as
                        // above; read-only use, immediate return.
                        let s: &SplitUnit = unsafe { std::mem::transmute(&*s) };
                        let split_info = s.format(self);
                        return Err((split_info, line));
                    }
                }
            }
        }
        Ok(total)
    }

    /// Resolves one unit in place. Returns Some(0) if it was already a `Ref`,
    /// Some(1) after resolving an inline unit, or None on failure.
    fn resolve_split<R: SplitUnitResolver>(unit: &mut SplitUnit, resolver: &R) -> Option<usize> {
        match unit {
            SplitUnit::Ref(_) => Some(0),
            _ => {
                let wid = resolver.resolve(&*unit)?;
                *unit = SplitUnit::Ref(wid);
                Some(1)
            }
        }
    }
}
588
/// Pairs a CSV record with the compilation context for error reporting.
///
/// The context is moved in from `LexiconReader` (via `mem::take`) for the
/// duration of parsing one record and moved back afterwards; see
/// `LexiconReader::parse_record`.
struct RecordWrapper<'a> {
    pub record: &'a StringRecord,
    pub ctx: DicCompilationCtx,
}
593
594impl<'a> RecordWrapper<'a> {
595 #[inline(always)]
596 fn get<T, F>(&self, idx: usize, name: &'static str, f: F) -> SudachiResult<T>
597 where
598 F: FnOnce(&'a str) -> DicWriteResult<T>,
599 {
600 match self.record.get(idx) {
601 Some(s) => self.ctx.transform(f(s)),
602 None => self.ctx.err(BuildFailure::NoRawField(name)),
603 }
604 }
605
606 #[inline(always)]
607 fn get_or_default<T, F>(&self, idx: usize, _name: &'static str, f: F) -> SudachiResult<T>
608 where
609 F: FnOnce(&'a str) -> DicWriteResult<T>,
610 T: Default,
611 {
612 match self.record.get(idx) {
613 Some(s) => self.ctx.transform(f(s)),
614 None => Ok(<T as Default>::default()),
615 }
616 }
617}
618
/// Serializes parsed lexicon entries into the binary dictionary format.
pub struct LexiconWriter<'a> {
    entries: &'a [RawLexiconEntry],
    // UTF-16 string encoder shared across all word infos.
    u16: Utf16Writer,
    // Word-info blob accumulated in memory so absolute offsets can be written
    // before the blob itself (see `write`).
    buffer: Vec<u8>,
    // Absolute offset of this lexicon block within the dictionary file.
    offset: usize,
    reporter: &'a mut Reporter,
}
626
impl<'a> LexiconWriter<'a> {
    /// Creates a writer for `entries`. `offset` is the absolute position of
    /// this lexicon block in the output file (needed to compute word-info
    /// offsets).
    pub(crate) fn new(
        entries: &'a [RawLexiconEntry],
        offset: usize,
        reporter: &'a mut Reporter,
    ) -> Self {
        Self {
            // Heuristic preallocation: ~32 bytes of word info per entry.
            buffer: Vec::with_capacity(entries.len() * 32),
            entries,
            u16: Utf16Writer::new(),
            offset,
            reporter,
        }
    }

    /// Writes the lexicon block and returns the number of bytes written.
    ///
    /// Layout: [u32 entry count][connection params, 6 bytes per entry]
    ///         [word-info offsets, u32 per entry][word-info blob].
    /// Word infos are serialized into `self.buffer` while the offset table is
    /// being written, then the blob is copied out in one go. Per-section sizes
    /// are reported to `self.reporter`.
    pub fn write<W: Write>(&mut self, w: &mut W) -> SudachiResult<usize> {
        let mut ctx = DicCompilationCtx::memory();
        ctx.set_filename("<write entries>".to_owned());
        // Starts at 4 to account for the entry-count header written below.
        let mut total = 4;

        let num_entries = self.entries.len() as u32;
        w.write_all(&num_entries.to_le_bytes())?;

        let rep = ReportBuilder::new("word_params");
        ctx.set_line(0);
        for e in self.entries {
            total += ctx.transform(e.write_params(w))?;
            ctx.add_line(1);
        }
        self.reporter.collect(total, rep);
        let start = total;

        let rep = ReportBuilder::new("wordinfo_offsets");
        ctx.set_line(0);
        // Absolute file offset where the word-info blob will start:
        // lexicon start + (6 param + 4 offset) bytes per entry + 4-byte count.
        let offset_base = self.offset + (6 + 4) * self.entries.len() + 4;
        let mut word_offset = 0;
        for e in self.entries {
            let u32_offset = (offset_base + word_offset) as u32;
            w.write_all(&u32_offset.to_le_bytes())?;
            // Serialize the word info into the in-memory blob; its size
            // advances the offset of the next entry.
            let size = ctx.transform(e.write_word_info(&mut self.u16, &mut self.buffer))?;
            word_offset += size;
            total += 4;
            ctx.add_line(1);
        }
        self.reporter.collect(total - start, rep);

        let rep = ReportBuilder::new("wordinfos (copy only)");
        let info_size = self.buffer.len();
        w.write_all(&self.buffer)?;
        self.reporter.collect(info_size, rep);

        Ok(total + info_size)
    }
}