sudachi/dic/read/
u16str.rs

1/*
2 *  Copyright (c) 2021 Works Applications Co., Ltd.
3 *
4 *  Licensed under the Apache License, Version 2.0 (the "License");
5 *  you may not use this file except in compliance with the License.
6 *  You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *   Unless required by applicable law or agreed to in writing, software
11 *  distributed under the License is distributed on an "AS IS" BASIS,
12 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 *  See the License for the specific language governing permissions and
14 *  limitations under the License.
15 */
16
17use crate::error::{SudachiNomError, SudachiNomResult};
18use nom::number::complete::le_u8;
19use std::iter::FusedIterator;
20
21pub fn utf16_string_parser(input: &[u8]) -> SudachiNomResult<&[u8], String> {
22    utf16_string_data(input).and_then(|(rest, data)| {
23        if data.is_empty() {
24            Ok((rest, String::new()))
25        } else {
26            // most Japanese chars are 3-bytes in utf-8 and 2 in utf-16
27            let capacity = (data.len() + 1) * 3 / 2;
28            let mut result = String::with_capacity(capacity);
29            let iter = U16CodeUnits::new(data);
30            for c in char::decode_utf16(iter) {
31                match c {
32                    Err(_) => return Err(nom::Err::Failure(SudachiNomError::Utf16String)),
33                    Ok(c) => result.push(c),
34                }
35            }
36            Ok((rest, result))
37        }
38    })
39}
40
41pub fn skip_u16_string(input: &[u8]) -> SudachiNomResult<&[u8], String> {
42    utf16_string_data(input).map(|(rest, _)| (rest, String::new()))
43}
44
45#[inline]
46pub fn utf16_string_data(input: &[u8]) -> SudachiNomResult<&[u8], &[u8]> {
47    let (rest, length) = string_length_parser(input)?;
48    if length == 0 {
49        return Ok((rest, &[]));
50    }
51    let num_bytes = (length * 2) as usize;
52    if rest.len() < num_bytes {
53        return Err(nom::Err::Failure(SudachiNomError::Utf16String));
54    }
55
56    let (data, rest) = rest.split_at(num_bytes);
57
58    Ok((rest, data))
59}
60
61pub fn string_length_parser(input: &[u8]) -> SudachiNomResult<&[u8], u16> {
62    let (rest, length) = le_u8(input)?;
63    // word length can be 1 or 2 bytes
64    let (rest, opt_low) = nom::combinator::cond(length >= 128, le_u8)(rest)?;
65    Ok((
66        rest,
67        match opt_low {
68            Some(low) => ((length as u16 & 0x7F) << 8) | low as u16,
69            None => length as u16,
70        },
71    ))
72}
73
/// Reads UTF-16 code units from non-aligned storage.
///
/// The wrapped byte slice is interpreted as a sequence of little-endian
/// 16-bit values. Because the bytes are combined one pair at a time, the
/// slice does not need any particular alignment.
///
/// If the slice has an odd length, the trailing byte cannot form a code unit
/// and is ignored.
#[derive(Debug, Clone)]
pub struct U16CodeUnits<'a> {
    /// Raw little-endian bytes being decoded.
    data: &'a [u8],
    /// Byte index of the next code unit to read.
    offset: usize,
}

impl<'a> U16CodeUnits<'a> {
    /// Creates an iterator positioned at the start of `data`.
    pub fn new(data: &'a [u8]) -> Self {
        U16CodeUnits { data, offset: 0 }
    }
}

impl Iterator for U16CodeUnits<'_> {
    type Item = u16;

    /// Returns the next code unit, or `None` once fewer than two bytes remain.
    fn next(&mut self) -> Option<Self::Item> {
        // `get` yields `None` instead of panicking when only a single
        // trailing byte (or nothing) is left.
        let pair = self.data.get(self.offset..self.offset + 2)?;
        self.offset += 2;
        Some(u16::from_le_bytes([pair[0], pair[1]]))
    }

    /// Reports the exact number of code units (not bytes) still to be yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // `offset` never exceeds `data.len()`, so the subtraction cannot wrap.
        let remaining_units = (self.data.len() - self.offset) / 2;
        (remaining_units, Some(remaining_units))
    }
}

// Once `next` returns `None` the offset is no longer advanced, so every later
// call returns `None` as well.
impl FusedIterator for U16CodeUnits<'_> {}