1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
use std::borrow::Borrow;
use std::fmt::{self, Debug};
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};

use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;

use crate::array::Array;
use crate::{
    array::indexable::{AsIndexed, Indexable},
    array::MutableArray,
    datatypes::DataType,
    error::{Error, Result},
};

use super::DictionaryKey;

/// Hasher for pre-hashed values; similar to `hash_hasher` but with native endianness.
///
/// We know that we'll only use it for `u64` values, so we can avoid endian conversion.
///
/// Invariant: hash of a u64 value is always equal to itself.
#[derive(Copy, Clone, Default)]
pub struct PassthroughHasher(u64);

impl Hasher for PassthroughHasher {
    #[inline]
    fn write_u64(&mut self, value: u64) {
        self.0 = value;
    }

    fn write(&mut self, _: &[u8]) {
        unreachable!();
    }

    #[inline]
    fn finish(&self) -> u64 {
        self.0
    }
}

#[derive(Clone)]
pub struct Hashed<K> {
    hash: u64,
    key: K,
}

#[inline]
fn ahash_hash<T: Hash + ?Sized>(value: &T) -> u64 {
    let mut hasher = BuildHasherDefault::<ahash::AHasher>::default().build_hasher();
    value.hash(&mut hasher);
    hasher.finish()
}

impl<K> Hash for Hashed<K> {
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.hash.hash(state)
    }
}

#[derive(Clone)]
pub struct ValueMap<K: DictionaryKey, M: MutableArray> {
    pub values: M,
    pub map: HashMap<Hashed<K>, (), BuildHasherDefault<PassthroughHasher>>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API
}

impl<K: DictionaryKey, M: MutableArray> ValueMap<K, M> {
    pub fn try_empty(values: M) -> Result<Self> {
        if !values.is_empty() {
            return Err(Error::InvalidArgumentError(
                "initializing value map with non-empty values array".into(),
            ));
        }
        Ok(Self {
            values,
            map: HashMap::default(),
        })
    }

    pub fn from_values(values: M) -> Result<Self>
    where
        M: Indexable,
        M::Type: Eq + Hash,
    {
        let mut map = HashMap::<Hashed<K>, _, _>::with_capacity_and_hasher(
            values.len(),
            BuildHasherDefault::<PassthroughHasher>::default(),
        );
        for index in 0..values.len() {
            let key = K::try_from(index).map_err(|_| Error::Overflow)?;
            // safety: we only iterate within bounds
            let value = unsafe { values.value_unchecked_at(index) };
            let hash = ahash_hash(value.borrow());
            match map.raw_entry_mut().from_hash(hash, |item| {
                // safety: invariant of the struct, it's always in bounds since we maintain it
                let stored_value = unsafe { values.value_unchecked_at(item.key.as_usize()) };
                stored_value.borrow() == value.borrow()
            }) {
                RawEntryMut::Occupied(_) => {
                    return Err(Error::InvalidArgumentError(
                        "duplicate value in dictionary values array".into(),
                    ))
                }
                RawEntryMut::Vacant(entry) => {
                    // NB: don't use .insert() here!
                    entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ());
                }
            }
        }
        Ok(Self { values, map })
    }

    pub fn data_type(&self) -> &DataType {
        self.values.data_type()
    }

    pub fn into_values(self) -> M {
        self.values
    }

    pub fn take_into(&mut self) -> Box<dyn Array> {
        let arr = self.values.as_box();
        self.map.clear();
        arr
    }

    #[inline]
    pub fn values(&self) -> &M {
        &self.values
    }

    /// Try to insert a value and return its index (it may or may not get inserted).
    pub fn try_push_valid<V>(
        &mut self,
        value: V,
        mut push: impl FnMut(&mut M, V) -> Result<()>,
    ) -> Result<K>
    where
        M: Indexable,
        V: AsIndexed<M>,
        M::Type: Eq + Hash,
    {
        let hash = ahash_hash(value.as_indexed());
        Ok(
            match self.map.raw_entry_mut().from_hash(hash, |item| {
                // safety: we've already checked (the inverse) when we pushed it, so it should be ok?
                let index = unsafe { item.key.as_usize() };
                // safety: invariant of the struct, it's always in bounds since we maintain it
                let stored_value = unsafe { self.values.value_unchecked_at(index) };
                stored_value.borrow() == value.as_indexed()
            }) {
                RawEntryMut::Occupied(entry) => entry.key().key,
                RawEntryMut::Vacant(entry) => {
                    let index = self.values.len();
                    let key = K::try_from(index).map_err(|_| Error::Overflow)?;
                    entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ()); // NB: don't use .insert() here!
                    push(&mut self.values, value)?;
                    debug_assert_eq!(self.values.len(), index + 1);
                    key
                }
            },
        )
    }

    pub fn shrink_to_fit(&mut self) {
        self.values.shrink_to_fit();
    }
}

impl<K: DictionaryKey, M: MutableArray> Debug for ValueMap<K, M> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.values.fmt(f)
    }
}