Skip to content

Commit 50490b8

Browse files
86xsk and elijah-potter authored
feat: create harper-thesaurus (#2085)
* feat: create `harper-thesaurus`
* chore(core): add `harper-thesaurus` as opt. dep.
  Add `harper-thesaurus` as optional dependency.
* chore(ls): re-export `harper-core/thesaurus`
* refactor(core): make `Suggestion` more flexible
* feat(core): add module `thesaurus_helper`
  Add a module that makes it easier to use the thesaurus from `harper-core`.
* feat(core): provide synonyms in `BoringWords` lint
* feat(thesaurus): sort synonyms by word frequency
* perf(thesaurus): use `HashMap` to avoid O(n)
* feat(core): add `DictWordMetadata::difference`
* feat(thesaurus): sort by `TokenKind` similarity
  Sort by `TokenKind` similarity in addition to sorting by word frequency.
* test(thesaurus): remove unnecessary test
* fix: post-merge fixes, ensuring tests pass
* fix(thesaurus): update function name
* feat(thesaurus): compress thesaurus via zstd
  Use zstd to compress the thesaurus at build time; decompress at runtime. Also rework the thesaurus code to work with these new changes, along the way trying to prioritize simplicity over premature optimization.
* chore(thesaurus): update version number
* chore: fix dead code warnings
* fix(thesaurus): correct condition in build script
* feat(thesaurus): filter out thesaurus entries
  Reduce thesaurus size by filtering out entries for words that don't exist in the curated dictionary.
* docs(thesaurus): minor fixes and adjustments
* chore(thesaurus): move Clippy attribute
* deps(core): enable `thesaurus` by default
* fix: excessive pruning of thesaurus
  Remove code related to pruning thesaurus to only those entries that are defined in the curated dictionary.
* fix(core): improve `DictWordMetadata::difference`
  Generate the `difference` function using a macro, to ensure no relevant functions used for calculating the difference are missed. Also use a larger integer type for the return value to reduce the chance of accidental overflows in the future.
* chore(thesaurus): update version number
* fix: infinite loop in `replace_with_match_case()`
  Create `harper_core::case` to contain functionality related to character casing. Move and reimplement case-copying functionality in `replace_with_match_case` to `harper_core::case::copy_casing`.
* fix(web): add `clang` as a dependency
* fix(web): reduce CI disk usage by cleaning up build dir
* fix(web): do not cache intermediate layers to free up space

---------

Co-authored-by: Elijah Potter <me@elijahpotter.dev>
1 parent 8aa5190 commit 50490b8

20 files changed

Lines changed: 52978 additions & 29 deletions

.github/workflows/build_web.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ jobs:
2222
- uses: redhat-actions/buildah-build@v2
2323
with:
2424
image: web
25-
layers: true
2625
containerfiles: |
2726
Dockerfile
2827
build-args: |

Cargo.lock

Lines changed: 39 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[workspace]
2-
members = ["harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst", "harper-stats", "harper-pos-utils", "harper-brill", "harper-ink", "harper-python", "harper-jjdescription", "harper-asciidoc", "fuzz"]
2+
members = ["harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst", "harper-stats", "harper-pos-utils", "harper-brill", "harper-ink", "harper-python", "harper-jjdescription", "harper-thesaurus", "harper-asciidoc", "fuzz"]
33
resolver = "2"
44

55
[profile.test]

Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ ARG NODE_VERSION=24
55

66
FROM rust:latest AS wasm-build
77
RUN rustup toolchain install
8+
RUN apt-get update -y && apt-get install clang -y
89

910
RUN mkdir -p /usr/build/
1011
WORKDIR /usr/build/
@@ -15,6 +16,7 @@ COPY . .
1516

1617
WORKDIR /usr/build/harper-wasm
1718
RUN wasm-pack build --target web
19+
RUN cargo clean
1820

1921
FROM node:${NODE_VERSION} AS node-build
2022

harper-core/Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ strum_macros = "0.27.2"
3232
strum = "0.27.2"
3333
ammonia = "4.1.2"
3434
harper-brill = { path = "../harper-brill", version = "1.0.0" }
35+
harper-thesaurus = { path = "../harper-thesaurus", version = "1.4.1", optional = true }
3536
bitflags = { version = "2.10.0", features = ["serde"] }
3637
trie-rs = "0.4.2"
3738

@@ -48,5 +49,6 @@ name = "parse_essay"
4849
harness = false
4950

5051
[features]
51-
default = []
52+
default = ["thesaurus"]
5253
concurrent = []
54+
thesaurus = ["dep:harper-thesaurus"]

harper-core/src/case.rs

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
use std::borrow::Borrow;
2+
3+
use smallvec::SmallVec;
4+
5+
use crate::CharString;
6+
7+
/// Apply the casing of `template` to `target`.
8+
///
9+
/// If `template` is shorter than `target`, the casing of the last character of `template` will be reused for
10+
/// the rest of the string.
11+
///
12+
/// If `template` is empty, all characters will be lowercased.
13+
#[must_use]
14+
pub fn copy_casing(
15+
template: impl IntoIterator<Item = impl Borrow<char>>,
16+
target: impl IntoIterator<Item = impl Borrow<char>>,
17+
) -> CharString {
18+
target
19+
.into_iter()
20+
.scan(
21+
(template.into_iter().get_casing(), Case::Lower),
22+
|(template, prev_case), c| {
23+
// Skip non-alphabetic characters in `target` without advancing `template`.
24+
if c.borrow().is_alphabetic()
25+
&& let Some(template_case) = template.next()
26+
{
27+
*prev_case = template_case;
28+
};
29+
Some(prev_case.apply_to(*c.borrow()))
30+
},
31+
)
32+
.flatten()
33+
.collect()
34+
}
35+
36+
/// Represents the casing of a character.
///
/// `Hash` is derived alongside the equality traits so a `Case` can be used as a key in hashed
/// collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Case {
    /// The character is uppercase.
    Upper,
    /// The character is lowercase.
    Lower,
}
42+
43+
impl Case {
44+
/// Apply the casing to a provided character.
45+
///
46+
/// This essentially calls [`char::to_uppercase()`] or [`char::to_lowercase()`] depending on
47+
/// the state of `self`. Similarly to those functions, it returns an iterator of the resulting
48+
/// character(s).
49+
pub fn apply_to(&self, char: char) -> impl Iterator<Item = char> + use<> {
50+
match self {
51+
Self::Upper => char.to_uppercase().collect::<SmallVec<[char; 2]>>(),
52+
Self::Lower => char.to_lowercase().collect::<SmallVec<[char; 2]>>(),
53+
}
54+
.into_iter()
55+
}
56+
}
57+
58+
impl TryFrom<char> for Case {
59+
type Error = ();
60+
61+
/// Try to get the casing from the given character.
62+
///
63+
/// This fails if the character is neither uppercase nor lowercase.
64+
fn try_from(value: char) -> Result<Self, Self::Error> {
65+
if value.is_uppercase() {
66+
Ok(Self::Upper)
67+
} else if value.is_lowercase() {
68+
Ok(Self::Lower)
69+
} else {
70+
Err(())
71+
}
72+
}
73+
}
74+
75+
// TODO: maybe move this functionality to CharStringExt if and when CharStringExt can be
76+
// generalized to work with char iterators.
77+
pub trait CaseIterExt {
78+
fn get_casing(self) -> impl Iterator<Item = Case>;
79+
}
80+
impl<I: IntoIterator<Item = T>, T: Borrow<char>> CaseIterExt for I {
81+
/// Get an iterator of [`Case`] from a collection of characters. Note that this will not
82+
/// include cases for characters that are neither uppercase nor lowercase.
83+
fn get_casing(self) -> impl Iterator<Item = Case> {
84+
self.into_iter()
85+
.filter_map(|char| (*char.borrow()).try_into().ok())
86+
}
87+
}

harper-core/src/dict_word_metadata.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,21 @@ macro_rules! generate_metadata_queries {
7979
)*].iter().map(|b| *b as u8).sum::<u8>() > 1
8080
}
8181

82+
/// How different is this word from another?
83+
pub fn difference(&self, other: &Self) -> u32 {
84+
[
85+
$(
86+
Self::[< is_ $category >],
87+
$(
88+
Self::[< is_ $sub _ $category >],
89+
Self::[< is_non_ $sub _ $category >],
90+
)*
91+
)*
92+
]
93+
.iter()
94+
.fold(0, |acc, func| acc + (func(self) ^ func(other)) as u32)
95+
}
96+
8297
$(
8398
#[doc = concat!("Checks if the word is definitely a ", stringify!($category), ".")]
8499
pub fn [< is_ $category >](&self) -> bool {

harper-core/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#![doc = include_str!("../README.md")]
22
#![allow(dead_code)]
33

4+
pub mod case;
45
mod char_ext;
56
mod char_string;
67
mod currency;
@@ -25,6 +26,7 @@ mod render_markdown;
2526
mod span;
2627
pub mod spell;
2728
mod sync;
29+
mod thesaurus_helper;
2830
mod title_case;
2931
mod token;
3032
mod token_kind;
@@ -35,6 +37,7 @@ pub mod weir;
3537
use render_markdown::render_markdown;
3638
use std::collections::{BTreeMap, VecDeque};
3739

40+
pub use case::{Case, CaseIterExt};
3841
pub use char_string::{CharString, CharStringExt};
3942
pub use currency::Currency;
4043
pub use dict_word_metadata::{

harper-core/src/linting/boring_words.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
use itertools::Itertools;
2+
13
use crate::expr::{Expr, WordExprGroup};
4+
use crate::thesaurus_helper;
25
use crate::{Token, TokenStringExt};
36

47
use super::{ExprLinter, Lint, LintKind};
@@ -37,7 +40,12 @@ impl ExprLinter for BoringWords {
3740
Some(Lint {
3841
span: matched_tokens.span()?,
3942
lint_kind: LintKind::Enhancement,
40-
suggestions: vec![],
43+
suggestions: thesaurus_helper::get_synonym_replacement_suggestions(
44+
&matched_word,
45+
&matched_tokens[0].kind,
46+
)
47+
.take(5)
48+
.collect_vec(),
4149
message: format!(
4250
"“{matched_word}” is a boring word. Try something a little more exotic."
4351
),

harper-core/src/linting/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ pub use lint_kind::LintKind;
231231
pub use map_phrase_linter::MapPhraseLinter;
232232
pub use map_phrase_set_linter::MapPhraseSetLinter;
233233
pub use spell_check::SpellCheck;
234-
pub use suggestion::Suggestion;
234+
pub use suggestion::{Suggestion, SuggestionCollectionExt};
235235

236236
use crate::{Document, LSend, render_markdown};
237237

0 commit comments

Comments
 (0)