-
Notifications
You must be signed in to change notification settings - Fork 99
/
Copy pathidentifier.rs
380 lines (328 loc) · 11.7 KB
/
identifier.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
//! Define the type of an identifier.
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::{
borrow::Borrow,
fmt::{self, Debug},
hash::Hash,
};
use crate::{metrics::increment, position::TermPos, term::string::NickelString};
simple_counter::generate_counter!(GeneratedCounter, usize);
static INTERNER: Lazy<interner::Interner> = Lazy::new(interner::Interner::new);
/// An interned identifier.
//
// Implementation-wise, this is just a wrapper around interner::Symbol that uses a hard-coded,
// static `Interner`.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(into = "String", from = "String")]
pub struct Ident(interner::Symbol);
impl Ident {
pub fn new(s: impl AsRef<str>) -> Self {
Self(INTERNER.intern(s.as_ref()))
}
/// Return the string representation of this identifier.
pub fn label(&self) -> &str {
INTERNER.lookup(self.0)
}
pub fn into_label(self) -> String {
self.label().to_owned()
}
}
impl fmt::Display for Ident {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.label())
}
}
impl fmt::Debug for Ident {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "`{}`", self.label())
}
}
impl From<Ident> for LocIdent {
fn from(ident: Ident) -> Self {
LocIdent {
ident,
pos: TermPos::None,
generated: ident.label().starts_with(GEN_PREFIX),
}
}
}
impl PartialOrd for Ident {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Ident {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.label().cmp(other.label())
}
}
impl From<Ident> for NickelString {
fn from(sym: Ident) -> Self {
sym.to_string().into()
}
}
impl<F> From<F> for Ident
where
String: From<F>,
{
fn from(val: F) -> Self {
Self(INTERNER.intern(String::from(val)))
}
}
#[allow(clippy::from_over_into)]
impl Into<String> for Ident {
fn into(self) -> String {
self.into_label()
}
}
/// An identifier with a location.
///
/// The location is ignored for equality comparison and hashing; it's mainly
/// intended for error messages.
#[derive(Clone, Copy, Debug, Deserialize, Serialize)]
#[serde(into = "String", from = "String")]
pub struct LocIdent {
ident: Ident,
pub pos: TermPos,
generated: bool,
}
impl LocIdent {
pub fn new_with_pos(label: impl AsRef<str>, pos: TermPos) -> Self {
let generated = label.as_ref().starts_with(GEN_PREFIX);
Self {
ident: Ident::new(label),
pos,
generated,
}
}
pub fn new(label: impl AsRef<str>) -> Self {
Self::new_with_pos(label, TermPos::None)
}
/// Create an identifier with the same label as this one, but a specified position.
pub fn with_pos(self, pos: TermPos) -> LocIdent {
LocIdent { pos, ..self }
}
/// Create a new fresh identifier. This identifier is unique and is guaranteed not to collide
/// with any identifier defined before. Generated identifiers start with a special prefix that
/// can't be used by normal, user-defined identifiers.
pub fn fresh() -> Self {
increment!("LocIdent::fresh");
Self::new(format!("{}{}", GEN_PREFIX, GeneratedCounter::next()))
}
/// Return the identifier without its position.
pub fn ident(&self) -> Ident {
self.ident
}
/// Return the string representation of this identifier.
pub fn label(&self) -> &str {
self.ident.label()
}
pub fn into_label(self) -> String {
self.label().to_owned()
}
}
/// Special character used for generating fresh identifiers. It must be syntactically impossible to
/// use to write in a standard Nickel program, to avoid name clashes.
pub const GEN_PREFIX: char = '%';
impl PartialOrd for LocIdent {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for LocIdent {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.label().cmp(other.label())
}
}
impl PartialEq for LocIdent {
fn eq(&self, other: &Self) -> bool {
self.ident == other.ident
}
}
impl Eq for LocIdent {}
impl Hash for LocIdent {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.ident.hash(state)
}
}
impl Borrow<Ident> for LocIdent {
fn borrow(&self) -> &Ident {
&self.ident
}
}
impl fmt::Display for LocIdent {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.label())
}
}
impl<F> From<F> for LocIdent
where
String: From<F>,
{
fn from(val: F) -> Self {
Self::new(String::from(val))
}
}
// False-positive Clippy error: if we apply this suggestion,
// we end up with an implementation of `From<Ident> for String`.
// Then setting `F = Ident` in the implementation above gives
// `From<Ident> for Ident` which is incoherent with the
// blanket implementation of `From<T> for T`.
#[allow(clippy::from_over_into)]
impl Into<String> for LocIdent {
fn into(self) -> String {
self.into_label()
}
}
impl From<LocIdent> for NickelString {
fn from(id: LocIdent) -> Self {
id.to_string().into()
}
}
impl LocIdent {
pub fn is_generated(&self) -> bool {
self.generated
}
}
impl AsRef<str> for LocIdent {
fn as_ref(&self) -> &str {
self.label()
}
}
mod interner {
use std::collections::HashMap;
use std::sync::{Mutex, RwLock};
use typed_arena::Arena;
/// A symbol is a correspondence between an [Ident](super::Ident) and its string representation
/// stored in the [Interner].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Symbol(u32);
/// The interner, which serves a double purpose: it pre-allocates space
/// so that [Ident](super::Ident) labels are created faster
/// and it makes it so that labels are stored only once, saving space.
// NOTE: We set the lifetime parameter of InnerInterner to 'static since
// it's arbitrary, and there's no reason to expose it to users of Interner
pub(crate) struct Interner(RwLock<InnerInterner<'static>>);
impl Interner {
/// Creates an empty [Interner].
pub(crate) fn new() -> Self {
Self(RwLock::new(InnerInterner::new()))
}
/// Stores a string inside the [Interner] if it does not exists, and returns the
/// corresponding [Symbol].
pub(crate) fn intern(&self, string: impl AsRef<str>) -> Symbol {
self.0.write().unwrap().intern(string)
}
/// Looks up for the stored string corresponding to the [Symbol].
///
/// This operation cannot fails since the only way to have a [Symbol] is to have
/// [interned](Interner::intern) the corresponding string first.
pub(crate) fn lookup<'slf>(&'slf self, sym: Symbol) -> &'slf str {
// SAFETY: We are making the returned &str lifetime the same as our struct,
// which is okay here since the InnerInterner uses a typed_arena which prevents
// deallocations, so the reference will be valid while the InnerInterner exists,
// hence while the struct exists.
unsafe { std::mem::transmute::<&'_ str, &'slf str>(self.0.read().unwrap().lookup(sym)) }
}
}
/// The main part of the Interner.
/// 'internal should never be exposed to the outside, as it is an
/// implementation detail and could be set to anything. It should be treated
/// within the implementation of InnerInterner as the lifetime of the object
/// itself, since that's what it is turned into in the `lookup()` method.
struct InnerInterner<'internal> {
/// Preallocates space where strings are stored.
arena: Mutex<Arena<u8>>,
/// Prevents the arena from creating different [Symbols](Symbol) for the same string.
map: HashMap<&'internal str, Symbol>,
/// Allows retrieving a string from a [Symbol].
vec: Vec<&'internal str>,
}
impl<'internal> InnerInterner<'internal> {
/// Creates an empty [InnerInterner].
fn new() -> Self {
Self {
arena: Mutex::new(Arena::new()),
map: HashMap::new(),
vec: Vec::new(),
}
}
/// Stores a string inside the [InnerInterner] if it does not exists, and returns the
/// corresponding [Symbol].
fn intern(&mut self, string: impl AsRef<str>) -> Symbol {
if let Some(sym) = self.map.get(string.as_ref()) {
return *sym;
}
// SAFETY: Here we are transmuting the reference lifetime: &'arena str -> &'self str
// This is okay since the lifetime of the arena is identical to the one of the struct.
// It is also okay to use it from inside the mutex, since typed_arena does not allow
// deallocation, so references are valid until the arena drop, which is tied to the
// struct drop. Since there is no 'self lifetime, we use 'internal instead, which
// at least has 'internal: 'self.
let in_string = unsafe {
std::mem::transmute::<&'_ str, &'internal str>(
self.arena.lock().unwrap().alloc_str(string.as_ref()),
)
};
let sym = Symbol(self.vec.len() as u32);
self.vec.push(in_string);
self.map.insert(in_string, sym);
sym
}
/// Looks up for the stored string corresponding to the [Symbol].
///
/// This operation cannot fails since the only way to have a [Symbol]
/// is to have [interned](InnerInterner::intern) the corresponding string first.
fn lookup(&self, sym: Symbol) -> &str {
// The lifetime implicitly shrinks from 'internal to 'self, which
// prevents 'internal from leaking out.
self.vec[sym.0 as usize]
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_intern_then_lookup() {
let interner = Interner::new();
let test_string = "test_string";
let sym = interner.intern(test_string);
assert_eq!(interner.lookup(sym), test_string);
}
#[test]
fn test_intern_twice_has_same_symbol() {
let interner = Interner::new();
let test_string = "test_string";
let sym1 = interner.intern(test_string);
let sym2 = interner.intern(test_string);
assert_eq!(sym1, sym2);
}
#[test]
fn test_intern_two_different_has_different_symbols() {
let interner = Interner::new();
let sym1 = interner.intern("a");
let sym2 = interner.intern("b");
assert_ne!(sym1, sym2);
}
#[test]
fn test_large_number_of_interns() {
let interner = Interner::new();
for i in 0..10000 {
let i = i.to_string();
let sym = interner.intern(&i);
assert_eq!(i, interner.lookup(sym));
}
assert_eq!(10000, interner.0.read().unwrap().map.len());
assert_eq!(10000, interner.0.read().unwrap().vec.len());
// doing the same a second time should not add anything to the interner
for i in 0..10000 {
let i = i.to_string();
let sym = interner.intern(&i);
assert_eq!(i, interner.lookup(sym));
}
assert_eq!(10000, interner.0.read().unwrap().map.len());
assert_eq!(10000, interner.0.read().unwrap().vec.len());
}
}
}