Skip to content

Commit 5d51084

Browse files
authored
Simplify Rust code (#1070)
According to #1055 we have `mid_size == 0` and `use_inputs_at_offsets == false`. This simplifies the code significantly. The simplification is fine to do because we only support one model and it's embedded. Once we support multiple models (in particular dynamic loading), we'll need feature extraction to work for all possible models (possibly with multiple feature-extraction functions).
1 parent 4a48426 commit 5d51084

File tree

5 files changed

+17
-46
lines changed

5 files changed

+17
-46
lines changed

rust/gen/src/main.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use std::fs::File;
1717
use std::io::Write;
1818
use std::path::Path;
1919

20-
use anyhow::{Context, Result};
20+
use anyhow::{ensure, Context, Result};
2121
use serde::Deserialize;
2222

2323
fn main() -> Result<()> {
@@ -132,9 +132,9 @@ fn generate_model_config(content_types: &[String], model_config: ModelConfig) ->
132132
writeln!(output, "use crate::ContentType;\n")?;
133133
writeln!(output, "pub(crate) const CONFIG: ModelConfig = ModelConfig {{")?;
134134
writeln!(output, " beg_size: {beg_size},")?;
135-
writeln!(output, " mid_size: {mid_size},")?;
135+
ensure!(mid_size == 0, "unsupported mid_size");
136136
writeln!(output, " end_size: {end_size},")?;
137-
writeln!(output, " use_inputs_at_offsets: {use_inputs_at_offsets},")?;
137+
ensure!(!use_inputs_at_offsets, "unsupported use_inputs_at_offsets");
138138
writeln!(output, " min_file_size_for_dl: {min_file_size_for_dl},")?;
139139
writeln!(output, " padding_token: {padding_token},")?;
140140
writeln!(output, " block_size: {block_size},")?;

rust/lib/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
### Minor
1212

13+
- Remove feature extraction logic of older models
1314
- Use the `standard_v3_3` model instead of `standard_v3_2` (see [model changelog])
1415
- Add `OverwriteReason` to document why the inferred content type is overwritten
1516

rust/lib/src/config.rs

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,7 @@ use crate::ContentType;
1919
#[derive(Debug)]
2020
pub(crate) struct ModelConfig {
2121
pub(crate) beg_size: usize,
22-
pub(crate) mid_size: usize,
2322
pub(crate) end_size: usize,
24-
pub(crate) use_inputs_at_offsets: bool,
2523
pub(crate) min_file_size_for_dl: usize,
2624
pub(crate) padding_token: i32,
2725
pub(crate) block_size: usize,
@@ -31,30 +29,18 @@ pub(crate) struct ModelConfig {
3129

3230
pub(crate) struct SplitFeatures<'a> {
3331
pub(crate) beg: &'a mut [i32],
34-
pub(crate) mid: &'a mut [i32],
3532
pub(crate) end: &'a mut [i32],
36-
pub(crate) off: Vec<(usize, &'a mut [i32])>,
3733
}
3834

3935
impl ModelConfig {
4036
pub(crate) fn features_size(&self) -> usize {
41-
let offsets_size = if self.use_inputs_at_offsets { 4 * 8 } else { 0 };
42-
self.beg_size + self.mid_size + self.end_size + offsets_size
37+
self.beg_size + self.end_size
4338
}
4439

4540
pub(crate) fn split_features<'a>(&self, features: &'a mut [i32]) -> SplitFeatures<'a> {
4641
let (beg, features) = features.split_at_mut(self.beg_size);
47-
let (mid, features) = features.split_at_mut(self.mid_size);
48-
let (end, mut features) = features.split_at_mut(self.end_size);
49-
let mut off = Vec::new();
50-
if self.use_inputs_at_offsets {
51-
for offset in [0x8000, 0x8800, 0x9000, 0x9800] {
52-
let (head, tail) = features.split_at_mut(8);
53-
features = tail;
54-
off.push((offset, head));
55-
}
56-
}
42+
let (end, features) = features.split_at_mut(self.end_size);
5743
debug_assert!(features.is_empty());
58-
SplitFeatures { beg, mid, end, off }
44+
SplitFeatures { beg, end }
5945
}
6046
}

rust/lib/src/input.rs

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,6 @@ async fn extract_features_async(
151151
config: &ModelConfig, mut file: impl AsyncInputApi, file_len: usize,
152152
) -> Result<(Vec<u8>, Vec<i32>)> {
153153
debug_assert!(config.beg_size < config.block_size);
154-
debug_assert!(config.mid_size < config.block_size);
155154
debug_assert!(config.end_size < config.block_size);
156155
let buffer_size = std::cmp::min(config.block_size, file_len);
157156
let mut content_beg = vec![0; buffer_size];
@@ -160,31 +159,18 @@ async fn extract_features_async(
160159
let mut end = vec![0; buffer_size];
161160
file.read_at(&mut end, file_len - buffer_size).await?;
162161
let end = strip_suffix(&end);
163-
let mid_len = std::cmp::min(config.mid_size, file_len);
164-
let mid_off = (file_len - mid_len) / 2;
165-
let mut mid = vec![0; mid_len];
166-
file.read_at(&mut mid, mid_off).await?;
167162
let mut features = vec![config.padding_token; config.features_size()];
168163
let split_features = config.split_features(&mut features);
169164
copy_features(split_features.beg, beg, 0);
170-
copy_features(split_features.mid, &mid, 1);
171-
copy_features(split_features.end, end, 2);
172-
for (offset, features) in split_features.off {
173-
let mut buffer = Vec::new();
174-
if offset + features.len() <= file_len {
175-
buffer = vec![0; features.len()];
176-
file.read_at(&mut buffer, offset).await?;
177-
}
178-
copy_features(features, &buffer, 0);
179-
}
165+
copy_features(split_features.end, end, 1);
180166
Ok((content_beg, features))
181167
}
182168

183169
fn copy_features(dst: &mut [i32], src: &[u8], align: usize) {
184170
let len = std::cmp::min(dst.len(), src.len());
185171
let dst_len = dst.len(); // borrowing issue: cannot inline below
186-
let dst = &mut dst[(dst_len - len) * align / 2..][..len];
187-
let src = &src[(src.len() - len) * align / 2..][..len];
172+
let dst = &mut dst[(dst_len - len) * align..][..len];
173+
let src = &src[(src.len() - len) * align..][..len];
188174
for (dst, src) in dst.iter_mut().zip(src.iter()) {
189175
*dst = *src as i32;
190176
}
@@ -272,23 +258,23 @@ mod tests {
272258
GzDecoder::new(File::open(PATH).unwrap()).read_to_string(&mut tests).unwrap();
273259
let tests: Vec<Test> = serde_json::from_str(&tests).unwrap();
274260
for test in tests {
261+
assert_eq!(test.args.mid_size, 0, "unsupported mid_size");
262+
assert!(!test.args.use_inputs_at_offsets, "unsupported use_inputs_at_offsets");
263+
assert!(test.features.mid.is_empty(), "unsupported mid");
264+
assert!(test.features.offset_0x8000_0x8007.is_empty(), "unsupported offset");
265+
assert!(test.features.offset_0x8800_0x8807.is_empty(), "unsupported offset");
266+
assert!(test.features.offset_0x9000_0x9007.is_empty(), "unsupported offset");
267+
assert!(test.features.offset_0x9800_0x9807.is_empty(), "unsupported offset");
275268
let config = ModelConfig {
276269
beg_size: test.args.beg_size,
277-
mid_size: test.args.mid_size,
278270
end_size: test.args.end_size,
279-
use_inputs_at_offsets: test.args.use_inputs_at_offsets,
280271
padding_token: test.args.padding_token,
281272
block_size: test.args.block_size,
282273
..crate::model::CONFIG
283274
};
284275
let mut expected = Vec::new();
285276
expected.extend_from_slice(&test.features.beg);
286-
expected.extend_from_slice(&test.features.mid);
287277
expected.extend_from_slice(&test.features.end);
288-
expected.extend_from_slice(&test.features.offset_0x8000_0x8007);
289-
expected.extend_from_slice(&test.features.offset_0x8800_0x8807);
290-
expected.extend_from_slice(&test.features.offset_0x9000_0x9007);
291-
expected.extend_from_slice(&test.features.offset_0x9800_0x9807);
292278
let content = BASE64.decode(test.content_base64.as_bytes()).unwrap();
293279
let actual = extract_features_async(&config, content.as_slice(), content.len());
294280
let actual = exec(actual).unwrap().1;

rust/lib/src/model.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,7 @@ use crate::ContentType;
2222

2323
pub(crate) const CONFIG: ModelConfig = ModelConfig {
2424
beg_size: 1024,
25-
mid_size: 0,
2625
end_size: 1024,
27-
use_inputs_at_offsets: false,
2826
min_file_size_for_dl: 8,
2927
padding_token: 256,
3028
block_size: 4096,

0 commit comments

Comments
 (0)