use crate::array::DictionaryKey;
use crate::error::{Error, Result};
use crate::offset::{Offset, Offsets, OffsetsBuffer};
/// Common view over the offset containers accepted by the validation helpers
/// in this file, so that a single generic routine can serve each of them.
pub(crate) trait OffsetsContainer<O> {
/// Returns the last offset of the container, converted to `usize`.
fn last(&self) -> usize;
/// Returns all offsets as a contiguous slice.
fn as_slice(&self) -> &[O];
}
/// Lets the validation helpers read offsets directly from an [`OffsetsBuffer`].
impl<O: Offset> OffsetsContainer<O> for OffsetsBuffer<O> {
    /// Delegates to `self.last()` and converts the offset with `to_usize`.
    #[inline]
    fn last(&self) -> usize {
        let final_offset = self.last();
        final_offset.to_usize()
    }

    /// Exposes what `self.buffer()` returns as a plain slice of offsets.
    #[inline]
    fn as_slice(&self) -> &[O] {
        let offsets: &[O] = self.buffer();
        offsets
    }
}
/// Lets the validation helpers read offsets directly from an [`Offsets`].
impl<O: Offset> OffsetsContainer<O> for Offsets<O> {
    /// Delegates to `self.last()` and converts the offset with `to_usize`.
    #[inline]
    fn last(&self) -> usize {
        let final_offset = self.last();
        final_offset.to_usize()
    }

    /// Exposes what `self.as_slice()` returns as the slice of offsets.
    #[inline]
    fn as_slice(&self) -> &[O] {
        let offsets: &[O] = self.as_slice();
        offsets
    }
}
/// Verifies that the last offset of `offsets` does not point past `values_len`.
///
/// # Errors
/// Returns `Error::oos` when `offsets.last()` is greater than `values_len`.
/// A last offset equal to `values_len` is accepted.
pub(crate) fn try_check_offsets_bounds<O: Offset, C: OffsetsContainer<O>>(
    offsets: &C,
    values_len: usize,
) -> Result<()> {
    let last_offset = offsets.last();
    if last_offset <= values_len {
        return Ok(());
    }
    Err(Error::oos("offsets must not exceed the values length"))
}
/// Checks that `offsets` split `values` into valid UTF-8 strings.
///
/// # Errors
/// * the last offset exceeds `values.len()`;
/// * `values`, taken as a whole, is not valid UTF-8;
/// * any offset smaller than `values.len()` (the first one included) points
///   at a UTF-8 continuation byte, i.e. a string would start mid code point.
pub(crate) fn try_check_utf8<O: Offset, C: OffsetsContainer<O>>(
    offsets: &C,
    values: &[u8],
) -> Result<()> {
    // A single offset delimits no string, so there is nothing to validate.
    if offsets.as_slice().len() == 1 {
        return Ok(());
    }

    try_check_offsets_bounds(offsets, values.len())?;

    // ASCII is always valid UTF-8 and every byte position is a char boundary.
    if values.is_ascii() {
        return Ok(());
    }

    simdutf8::basic::from_utf8(values)?;

    // `values` is valid as a whole, so each individual string is valid iff
    // every offset sits on a char boundary. Offsets equal to `values.len()`
    // are boundaries by definition (and cannot be used to index `values`), so
    // locate the last offset that is strictly smaller.
    // Example: values.len() = 10, offsets = [0, 5, 10, 10] => last = 1.
    //
    // The search covers index 0 as well: the first offset is not necessarily
    // zero, and when all later offsets equal `values.len()` it is the only
    // position where a string can start inside a code point.
    let offsets = offsets.as_slice();
    let last = match offsets
        .iter()
        .rposition(|offset| offset.to_usize() < values.len())
    {
        Some(last) => last,
        // Every offset equals `values.len()`: only empty strings, all valid.
        None => return Ok(()),
    };

    // Following the example: starts = [0, 5].
    let starts = &offsets[..=last];

    // No early exit: the flag is accumulated over all starts.
    let mut any_invalid = false;
    for start in starts {
        let start = start.to_usize();

        // SAFETY: `start <= offsets[last] < values.len()`, provided the
        // offsets are non-decreasing. NOTE(review): that ordering is assumed
        // to be an invariant of the offset containers; it is not enforced in
        // this function.
        let byte = *unsafe { values.get_unchecked(start) };

        // A byte starts a code point iff it is not of the form 0b10xxxxxx;
        // same bit trick as `str::is_char_boundary`.
        any_invalid |= (byte as i8) < -0x40;
    }

    if any_invalid {
        return Err(Error::oos("Non-valid char boundary detected"));
    }
    Ok(())
}
pub(crate) unsafe fn check_indexes_unchecked<K: DictionaryKey>(
keys: &[K],
len: usize,
) -> Result<()> {
let mut invalid = false;
keys.iter().for_each(|k| {
if k.as_usize() > len {
invalid = true;
}
});
if invalid {
let key = keys.iter().map(|k| k.as_usize()).max().unwrap();
Err(Error::oos(format!("One of the dictionary keys is {key} but it must be < than the length of the dictionary values, which is {len}")))
} else {
Ok(())
}
}
pub fn check_indexes<K>(keys: &[K], len: usize) -> Result<()>
where
K: std::fmt::Debug + Copy + TryInto<usize>,
{
keys.iter().try_for_each(|key| {
let key: usize = (*key)
.try_into()
.map_err(|_| Error::oos(format!("The dictionary key must fit in a `usize`, but {key:?} does not")))?;
if key >= len {
Err(Error::oos(format!("One of the dictionary keys is {key} but it must be < than the length of the dictionary values, which is {len}")))
} else {
Ok(())
}
})
}
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use super::*;
// Strategy yielding arbitrary byte vectors of 1 to 99 bytes (never empty).
pub(crate) fn binary_strategy() -> impl Strategy<Value = Vec<u8>> {
prop::collection::vec(any::<u8>(), 1..100)
}
proptest! {
// Property: `try_check_utf8` must agree with `std::str::from_utf8` applied
// separately to the two strings that the offsets carve out of `values`.
#[test]
#[cfg_attr(miri, ignore)] fn check_utf8_validation(values in binary_strategy()) {
// Try every split point in `0..values.len() - 1`.
for offset in 0..values.len() - 1 {
// Two strings: `values[..offset]` and `values[offset..]`.
let offsets = vec![0, offset as i32, values.len() as i32].try_into().unwrap();
// Reference answer: both halves must be valid UTF-8 on their own.
let mut is_valid = std::str::from_utf8(&values[..offset]).is_ok();
is_valid &= std::str::from_utf8(&values[offset..]).is_ok();
assert_eq!(try_check_utf8::<i32, Offsets<i32>>(&offsets, &values).is_ok(), is_valid)
}
}
}
}