sudachi/dic/
header.rs

1/*
2 * Copyright (c) 2021-2024 Works Applications Co., Ltd.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use nom::{bytes::complete::take, number::complete::le_u64};
18use std::io::Write;
19use std::time::{Duration, SystemTime};
20use thiserror::Error;
21
22use crate::error::{SudachiError, SudachiNomResult, SudachiResult};
23
24/// Sudachi error
25#[derive(Error, Debug, Eq, PartialEq)]
26#[non_exhaustive]
27pub enum HeaderError {
28    #[error("Invalid header version")]
29    InvalidVersion,
30
31    #[error("Invalid system dictionary version")]
32    InvalidSystemDictVersion,
33
34    #[error("Invalid user dictionary version")]
35    InvalidUserDictVersion,
36
37    #[error("Unable to parse")]
38    CannotParse,
39}
40
41/// Header version
42#[derive(Debug, Clone, PartialEq, Eq)]
43pub enum HeaderVersion {
44    SystemDict(SystemDictVersion),
45    UserDict(UserDictVersion),
46}
/// Format revision of a system dictionary.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum SystemDictVersion {
    // Discriminants are left implicit: the on-disk identifiers are u64 values
    // that can be larger than isize, so the mapping to and from those
    // identifiers is done by `HeaderVersion::to_u64` / `HeaderVersion::from_u64`.
    /// First system dictionary format.
    Version1,
    /// Second system dictionary format.
    Version2,
}
53
54impl HeaderVersion {
55    pub fn to_u64(&self) -> u64 {
56        #[allow(unreachable_patterns)]
57        match self {
58            HeaderVersion::SystemDict(SystemDictVersion::Version1) => {
59                HeaderVersion::SYSTEM_DICT_VERSION_1
60            }
61            HeaderVersion::SystemDict(SystemDictVersion::Version2) => {
62                HeaderVersion::SYSTEM_DICT_VERSION_2
63            }
64            HeaderVersion::UserDict(UserDictVersion::Version1) => {
65                HeaderVersion::USER_DICT_VERSION_1
66            }
67            HeaderVersion::UserDict(UserDictVersion::Version2) => {
68                HeaderVersion::USER_DICT_VERSION_2
69            }
70            HeaderVersion::UserDict(UserDictVersion::Version3) => {
71                HeaderVersion::USER_DICT_VERSION_3
72            }
73            _ => panic!("unknown version {:?}", self),
74        }
75    }
76}
77
/// Format revision of a user dictionary.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UserDictVersion {
    /// First user dictionary format; `Header::has_grammar` reports no grammar for it.
    Version1,
    /// Second user dictionary format; `Header::has_grammar` reports grammar for it.
    Version2,
    /// Third user dictionary format; `Header::has_synonym_group_ids` also holds for it.
    Version3,
}
84impl HeaderVersion {
85    /// the first version of system dictionaries
86    const SYSTEM_DICT_VERSION_1: u64 = 0x7366d3f18bd111e7;
87    /// the second version of system dictionaries
88    const SYSTEM_DICT_VERSION_2: u64 = 0xce9f011a92394434;
89    /// the first version of user dictionaries
90    const USER_DICT_VERSION_1: u64 = 0xa50f31188bd211e7;
91    /// the second version of user dictionaries
92    const USER_DICT_VERSION_2: u64 = 0x9fdeb5a90168d868;
93    /// the third version of user dictionaries
94    const USER_DICT_VERSION_3: u64 = 0xca9811756ff64fb0;
95
96    pub fn from_u64(v: u64) -> Option<Self> {
97        match v {
98            HeaderVersion::SYSTEM_DICT_VERSION_1 => {
99                Some(Self::SystemDict(SystemDictVersion::Version1))
100            }
101            HeaderVersion::SYSTEM_DICT_VERSION_2 => {
102                Some(Self::SystemDict(SystemDictVersion::Version2))
103            }
104            HeaderVersion::USER_DICT_VERSION_1 => Some(Self::UserDict(UserDictVersion::Version1)),
105            HeaderVersion::USER_DICT_VERSION_2 => Some(Self::UserDict(UserDictVersion::Version2)),
106            HeaderVersion::USER_DICT_VERSION_3 => Some(Self::UserDict(UserDictVersion::Version3)),
107            _ => None,
108        }
109    }
110}
111
112/// Dictionary header
113///
114/// Contains version, create_time, and description
115#[derive(Debug, Clone, Eq, PartialEq)]
116pub struct Header {
117    pub version: HeaderVersion,
118    pub create_time: u64,
119    pub description: String,
120}
121
122impl Default for Header {
123    fn default() -> Self {
124        Self::new()
125    }
126}
127
128impl Header {
129    const DESCRIPTION_SIZE: usize = 256;
130    pub const STORAGE_SIZE: usize = 8 + 8 + Header::DESCRIPTION_SIZE;
131
132    /// Creates new system dictionary header
133    /// Its version field should be modified to create user dictionary header
134    pub fn new() -> Self {
135        let unix_time = SystemTime::now()
136            .duration_since(SystemTime::UNIX_EPOCH)
137            .expect("unix time error");
138
139        Self {
140            version: HeaderVersion::SystemDict(SystemDictVersion::Version2),
141            create_time: unix_time.as_secs(),
142            description: String::new(),
143        }
144    }
145
146    /// Set creation time
147    pub fn set_time(&mut self, time: SystemTime) -> SystemTime {
148        let unix_time = time
149            .duration_since(SystemTime::UNIX_EPOCH)
150            .expect("unix time error");
151
152        let old_unix_secs = std::mem::replace(&mut self.create_time, unix_time.as_secs());
153
154        SystemTime::UNIX_EPOCH + Duration::from_secs(old_unix_secs)
155    }
156
157    /// Creates a new header from a dictionary bytes
158    pub fn parse(bytes: &[u8]) -> Result<Header, HeaderError> {
159        let (_rest, (version, create_time, description)) =
160            header_parser(bytes).map_err(|_| HeaderError::CannotParse)?;
161
162        let version = HeaderVersion::from_u64(version).ok_or(HeaderError::InvalidVersion)?;
163
164        Ok(Header {
165            version,
166            create_time,
167            description,
168        })
169    }
170
171    /// Returns if this header version has grammar
172    pub fn has_grammar(&self) -> bool {
173        matches!(
174            self.version,
175            HeaderVersion::SystemDict(_)
176                | HeaderVersion::UserDict(UserDictVersion::Version2)
177                | HeaderVersion::UserDict(UserDictVersion::Version3)
178        )
179    }
180
181    /// Returns if this header version has synonym group ids
182    pub fn has_synonym_group_ids(&self) -> bool {
183        matches!(
184            self.version,
185            HeaderVersion::SystemDict(SystemDictVersion::Version2)
186                | HeaderVersion::UserDict(UserDictVersion::Version3)
187        )
188    }
189
190    pub fn write_to<W: Write>(&self, w: &mut W) -> SudachiResult<usize> {
191        if self.description.len() > Header::DESCRIPTION_SIZE {
192            return Err(SudachiError::InvalidDataFormat(
193                Header::DESCRIPTION_SIZE,
194                self.description.clone(),
195            ));
196        }
197
198        w.write_all(&self.version.to_u64().to_le_bytes())?;
199        w.write_all(&self.create_time.to_le_bytes())?;
200        w.write_all(self.description.as_bytes())?;
201        for _ in 0..Header::DESCRIPTION_SIZE - self.description.len() {
202            w.write_all(&[0])?;
203        }
204        Ok(Header::STORAGE_SIZE)
205    }
206}
207
/// Decodes the bytes of `buf` that precede its first NUL byte as UTF-8.
///
/// When `buf` contains no NUL byte the whole slice is decoded. Invalid UTF-8
/// sequences are replaced with U+FFFD rather than reported as an error.
fn nul_terminated_str_from_slice(buf: &[u8]) -> String {
    let end = buf
        .iter()
        .position(|&byte| byte == 0)
        .unwrap_or(buf.len());
    String::from_utf8_lossy(&buf[..end]).into_owned()
}
217
218fn description_parser(input: &[u8]) -> SudachiNomResult<&[u8], String> {
219    let (rest, description_bytes) = take(Header::DESCRIPTION_SIZE)(input)?;
220    Ok((rest, nul_terminated_str_from_slice(description_bytes)))
221}
222
223fn header_parser(input: &[u8]) -> SudachiNomResult<&[u8], (u64, u64, String)> {
224    nom::sequence::tuple((le_u64, le_u64, description_parser))(input)
225}
226
#[cfg(test)]
mod tests {
    use super::*;

    /// Lays the three header fields out back-to-back and parses the result.
    fn header_from_parts<T: AsRef<[u8]>>(
        version: u64,
        create_time: u64,
        description: T,
    ) -> Result<Header, HeaderError> {
        let raw = [
            &version.to_le_bytes()[..],
            &create_time.to_le_bytes()[..],
            description.as_ref(),
        ]
        .concat();

        Header::parse(&raw)
    }

    /// Writes `header` to a buffer and checks that parsing it yields an equal header.
    fn assert_round_trips(header: &Header) {
        let mut buffer: Vec<u8> = Vec::new();
        let written = header.write_to(&mut buffer).unwrap();
        assert_eq!(written, Header::STORAGE_SIZE);

        let reparsed = Header::parse(&buffer).unwrap();
        assert_eq!(header, &reparsed);
    }

    #[test]
    fn graceful_failure() {
        // Too small
        assert_eq!(Header::parse(&[]), Err(HeaderError::CannotParse));

        // Large enough, but the version identifier is unknown
        let zeroed = [0u8; Header::DESCRIPTION_SIZE];
        assert_eq!(
            header_from_parts(42, 0, zeroed),
            Err(HeaderError::InvalidVersion)
        );
    }

    #[test]
    fn simple_header() {
        let description_str = "My Description";
        // The text followed by a full field worth of NUL bytes; the parser
        // only looks at the first DESCRIPTION_SIZE bytes of it.
        let mut field = description_str.as_bytes().to_vec();
        field.resize(description_str.len() + Header::DESCRIPTION_SIZE, 0);

        let expected = Header {
            version: HeaderVersion::SystemDict(SystemDictVersion::Version1),
            create_time: 1337,
            description: description_str.to_string(),
        };

        assert_eq!(
            header_from_parts(HeaderVersion::SYSTEM_DICT_VERSION_1, 1337, &field),
            Ok(expected)
        );
    }

    #[test]
    fn write_system() {
        assert_round_trips(&Header::new());
    }

    #[test]
    fn write_user() {
        let header = Header {
            version: HeaderVersion::UserDict(UserDictVersion::Version3),
            description: String::from("some great header"),
            ..Header::new()
        };
        assert_round_trips(&header);
    }
}