sudachi/dic/build/
primitives.rs

1/*
2 *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use crate::dic::build::error::BuildFailure::InvalidSize;
18use crate::dic::build::error::DicWriteResult;
19use crate::dic::build::lexicon::SplitUnit;
20use crate::dic::word_id::WordId;
21use std::io::Write;
22
/// Serializer for strings stored as length-prefixed UTF-16 (little-endian).
///
/// The writer owns a scratch buffer that is cleared and refilled on every
/// `write` call, so one instance can be reused for many strings without
/// allocating a fresh buffer each time.
#[derive(Debug)]
pub struct Utf16Writer {
    /// Scratch space holding the little-endian UTF-16 bytes of the string
    /// that is currently being written.
    buffer: Vec<u8>,
}
26
27impl Utf16Writer {
28    pub fn new() -> Self {
29        Utf16Writer {
30            buffer: Vec::with_capacity(256),
31        }
32    }
33
34    pub fn write_len<W: Write>(&self, w: &mut W, length: usize) -> DicWriteResult<usize> {
35        if length > i16::MAX as _ {
36            return Err(InvalidSize {
37                actual: length,
38                expected: i16::MAX as _,
39            });
40        }
41
42        let length = length as u16;
43
44        let prefix = if length < 127 {
45            w.write_all(&[length as u8])?;
46            1
47        } else {
48            let b0 = length as u8;
49            let b1 = ((length >> 8) as u8) | 0x80;
50            w.write_all(&[b1, b0])?;
51            2
52        };
53
54        Ok(prefix)
55    }
56
57    pub fn write<W: Write, T: AsRef<str>>(&mut self, w: &mut W, data: T) -> DicWriteResult<usize> {
58        let str_data: &str = data.as_ref();
59        if str_data.len() > 4 * 64 * 1024 {
60            return Err(InvalidSize {
61                actual: str_data.len(),
62                expected: 4 * 64 * 1024,
63            });
64        }
65
66        let mut scratch: [u16; 2] = [0; 2];
67        let mut length: usize = 0;
68        self.buffer.clear();
69
70        for c in str_data.chars() {
71            for u16c in c.encode_utf16(&mut scratch) {
72                self.buffer.extend_from_slice(&u16c.to_le_bytes());
73                length += 1;
74            }
75        }
76
77        let prefix = self.write_len(w, length)?;
78        w.write_all(&self.buffer)?;
79        Ok(prefix + self.buffer.len())
80    }
81
82    pub fn write_empty_if_equal<W, T1, T2>(
83        &mut self,
84        w: &mut W,
85        data: T1,
86        other: T2,
87    ) -> DicWriteResult<usize>
88    where
89        W: Write,
90        T1: AsRef<str> + PartialEq<T2>,
91    {
92        if data == other {
93            self.write(w, "")
94        } else {
95            self.write(w, data)
96        }
97    }
98}
99
/// Conversion of a value into the raw `u32` that is serialized by
/// `write_u32_array`.
pub(crate) trait ToU32 {
    /// Returns the `u32` representation of `self`.
    fn to_u32(&self) -> u32;
}
103
104impl ToU32 for u32 {
105    fn to_u32(&self) -> u32 {
106        *self
107    }
108}
109
110impl ToU32 for i32 {
111    fn to_u32(&self) -> u32 {
112        *self as u32
113    }
114}
115
116impl ToU32 for WordId {
117    fn to_u32(&self) -> u32 {
118        self.as_raw()
119    }
120}
121
122impl ToU32 for SplitUnit {
123    fn to_u32(&self) -> u32 {
124        match self {
125            SplitUnit::Ref(w) => w.to_u32(),
126            SplitUnit::Inline { .. } => panic!("splits must be resolved before writing"),
127        }
128    }
129}
130
131pub(crate) fn write_u32_array<W: Write, T: ToU32>(w: &mut W, data: &[T]) -> DicWriteResult<usize> {
132    let len = data.len();
133    if len > 127 {
134        return Err(InvalidSize {
135            expected: 127,
136            actual: len,
137        });
138    }
139    w.write_all(&[len as u8])?;
140    let mut written = 1;
141
142    for o in data {
143        let i = o.to_u32();
144        w.write_all(&i.to_le_bytes())?;
145        written += 4;
146    }
147
148    Ok(written)
149}
150
#[cfg(test)]
mod test {
    use crate::dic::build::error::DicWriteResult;
    use crate::dic::build::primitives::{write_u32_array, Utf16Writer};
    use crate::dic::read::u16str::utf16_string_parser;
    use crate::dic::read::u32_array_parser;
    use claim::assert_matches;

    /// A single string survives a write/parse round trip with nothing left over.
    #[test]
    fn write_utf16() {
        let mut bytes: Vec<u8> = Vec::new();
        let mut writer = Utf16Writer::new();
        writer
            .write(&mut bytes, "これはテスト文です")
            .expect("success");

        let (rest, decoded) = utf16_string_parser(&bytes).expect("parsed");
        assert_eq!(0, rest.len());
        assert_eq!("これはテスト文です", decoded);
    }

    /// Several strings written back to back are parsed in order, and the
    /// reported byte counts always match the size of the output.
    #[test]
    fn write_strings() -> DicWriteResult<()> {
        let mut writer = Utf16Writer::new();
        let mut bytes: Vec<u8> = Vec::new();

        let empty = "";
        // The second character lies outside the BMP and needs a surrogate pair.
        let astral = "あ𠮟";
        // 150 code units: too long for the one-byte length prefix.
        let long = "0123456789".repeat(15);

        let mut total = writer.write(&mut bytes, empty)?;
        assert_eq!(bytes.len(), total);
        total += writer.write(&mut bytes, astral)?;
        assert_eq!(bytes.len(), total);
        total += writer.write(&mut bytes, &long)?;
        assert_eq!(bytes.len(), total);

        let (rest, decoded) = utf16_string_parser(&bytes).expect("ok");
        assert_eq!(decoded, empty);
        let (rest, decoded) = utf16_string_parser(rest).expect("ok");
        assert_eq!(decoded, astral);
        let (rest, decoded) = utf16_string_parser(rest).expect("ok");
        assert_eq!(decoded, long);
        assert_eq!(rest.len(), 0);

        Ok(())
    }

    /// An empty array is encoded as a lone zero count byte.
    #[test]
    fn write_ints_empty() {
        let mut bytes: Vec<u8> = Vec::new();
        let count = write_u32_array(&mut bytes, &[0u32; 0]).expect("ok");
        assert_eq!(count, 1);
        assert_eq!(bytes, b"\0");
    }

    /// A small array round-trips, including both extremes of `u32`.
    #[test]
    fn write_ints_full() {
        let values = [0, 5, u32::MAX, u32::MIN];
        let mut bytes: Vec<u8> = Vec::new();
        let count = write_u32_array(&mut bytes, &values).expect("ok");

        let (rest, decoded) = u32_array_parser(&bytes).expect("ok");
        assert_eq!(rest, b"");
        assert_eq!(decoded, values);
        // One count byte plus four bytes for each of the four values.
        assert_eq!(count, 4 * 4 + 1);
    }

    /// Arrays longer than the 127-element limit are rejected.
    #[test]
    fn write_ints_over_length() {
        let too_many = [0u32; 130];
        let mut bytes: Vec<u8> = Vec::new();
        let outcome = write_u32_array(&mut bytes, &too_many);
        assert_matches!(outcome, Err(_));
    }
}
222}