Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
88 commits
Select commit Hold shift + click to select a range
008c534
test(core): add failing test
86xsk Jan 20, 2026
a5f4a1b
test(comments): don't expect 'lin' to be marked as a spelling error
86xsk Jan 28, 2026
5face23
test(core): move tests
86xsk Jan 28, 2026
fe37ba6
test(core): don't expect `SpellCheck` to mark capitalization issues
86xsk Jan 29, 2026
6f02acd
deps(core): add `indexmap`
86xsk Jan 29, 2026
03b1f5b
feat(core)!: more explicit handling of case-sensitivity in dictionaries
86xsk Jan 29, 2026
ce666c5
chore: update snapshots
86xsk Jan 29, 2026
4443a4e
Partially revert "fix(core): PR getting flagged as 'misspelled' (#2476)"
86xsk Jan 29, 2026
e982f23
test(core): merge tests and add test
86xsk Jan 29, 2026
5688aee
Merge branch 'master' into fix-dict-casing2
86xsk Jan 29, 2026
0a1e2d4
test(core): move test
86xsk Jan 30, 2026
cf9a90a
fix(core): fix logic in `OrthographicConsistency`
86xsk Jan 30, 2026
007df6e
test(core): add failing test
86xsk Jan 30, 2026
7518350
fix(core): allow all case-variants in `OrthographicConsistency`
86xsk Jan 30, 2026
3c9d54e
test(core): remove Lego -> LEGO test in `OrthographicConsistency`
86xsk Jan 30, 2026
8b426d9
chore: update snapshots
86xsk Jan 30, 2026
3381fba
test(core): add test
86xsk Jan 30, 2026
e11a2d6
test(core): fix incorrect test expectation
86xsk Jan 30, 2026
b23f652
refactor(core): appease Clippy
86xsk Jan 30, 2026
3f068bb
feat(core): support multiple `derived_from`
86xsk Feb 1, 2026
18ba296
perf(core): reduce Vec cloning
86xsk Feb 2, 2026
aeba563
refactor(core): reuse code from similar function
86xsk Feb 2, 2026
f30cfff
Merge branch 'master' into fix-dict-casing2
86xsk Feb 3, 2026
32ce68c
refactor(core): remove dead code
86xsk Feb 3, 2026
5bd11c1
Merge branch 'master' into fix-dict-casing2
86xsk Feb 3, 2026
a9d3f75
Merge branch 'master' into fix-dict-casing2
86xsk Feb 7, 2026
a4709d6
Merge branch 'master' into fix-dict-casing2
86xsk Feb 11, 2026
028a39b
Merge branch 'master' into fix-dict-casing2
86xsk Feb 12, 2026
0a8a93a
fix(core): suggest "need" for "ned"
86xsk Feb 12, 2026
bcfea8f
fix(core): make `SpellCheck` case-sensitive again
86xsk Feb 13, 2026
57c8562
Revert "test(comments): don't expect 'lin' to be marked as a spelling…
86xsk Feb 13, 2026
9313f00
Revert "test(core): don't expect `SpellCheck` to mark capitalization …
86xsk Feb 13, 2026
11842a3
refactor(core): split word ID structs into separate files
86xsk Feb 13, 2026
34462b8
docs(core): fix grammar
86xsk Feb 13, 2026
1a53a39
refactor(core): make `WordIdPair` `pub(crate)`
86xsk Feb 13, 2026
1053d6f
style(core): fix whitespace in `dictionary.rs`
86xsk Feb 13, 2026
a77437d
perf(core): add early exits for URL lexing
86xsk Feb 14, 2026
7f93814
perf(core): early exit in `lex_email_address`
86xsk Feb 14, 2026
e7d9c85
refactor(core): simplify code
86xsk Feb 14, 2026
acde3e5
style(core): reorder imports
86xsk Feb 14, 2026
a5196da
perf(core): cache `WordSet` in `ModalVerb`
86xsk Feb 15, 2026
8c64ce3
refactor(core): replace `ModalVerb::init` function
86xsk Feb 15, 2026
092c5b4
Merge branch 'master' into fix-dict-casing2
86xsk Feb 16, 2026
8a1e0da
Merge branch 'master' into fix-dict-casing2
86xsk Feb 17, 2026
c72cb68
Merge branch 'master' into fix-dict-casing2
86xsk Feb 19, 2026
f6b6fa4
refactor(core): default impls for `Dictionary` str fns
86xsk Feb 17, 2026
9ba8fce
refactor(core)!: return `WordMapEntry` from `Dictionary`
86xsk Feb 17, 2026
db7e0e2
refactor(core): avoid unnecessary cloning
86xsk Feb 17, 2026
44aea1e
refactor(core): rename `get_correct_capitalization_of`
86xsk Feb 17, 2026
d30db60
refactor(core): default impl for `get_correct_capitalizations_of`
86xsk Feb 18, 2026
ce349b2
refactor(core): de-Arc `MutableDictionary::curated`
86xsk Feb 18, 2026
2d4c501
refactor(core): de-Arc `FstDictionary::curated`
86xsk Feb 19, 2026
68449db
refactor: take argument by value instead of mut ref
86xsk Feb 20, 2026
85f04d7
refactor!: remove pointless `Box` in `CollapseIdentifiers::new`
86xsk Feb 20, 2026
63368ef
refactor!: don't refcount/`thread_local!` read-only statics
86xsk Feb 20, 2026
ea05ae2
perf: use `dyn` in place of `impl`
86xsk Feb 20, 2026
823d21e
docs(core): add documentation for `WordMap`
86xsk Feb 20, 2026
602cc10
refactor(core): move curated dictionary init to `word_map`
86xsk Feb 20, 2026
777de5d
refactor(core): remove pointless Arc in `FstDictionary`
86xsk Feb 21, 2026
4144cae
refactor(core): rename `word_map` to `fst_map` in `FstDictionary`
86xsk Feb 21, 2026
1fc8691
style(core): rearrange lines
86xsk Feb 21, 2026
2c5673a
refactor(core): remove unused argument/member
86xsk Feb 21, 2026
fd71cab
refactor(core): impl `Dictionary` for `WordMap`
86xsk Feb 21, 2026
df628c4
refactor: remove redundant `self::` in paths
86xsk Feb 21, 2026
7bde37d
feat(core): add `WordMap::curated`
86xsk Feb 21, 2026
e772f86
refactor(core): `WordMap` instead of `FstDictionary` in `MergeableWords`
86xsk Feb 21, 2026
aba2cd8
perf(core): avoid conversion between string and char array
86xsk Feb 21, 2026
2046a6f
perf(core): specialize `get_word_metadata_combined` for `WordMap`
86xsk Feb 21, 2026
2e75187
feat(core): create `WordMap::is_empty`
86xsk Feb 21, 2026
f75ac16
refactor(core): use `WordMap` in more places
86xsk Feb 21, 2026
76069db
refactor(core): fix inconsistent casing
86xsk Feb 21, 2026
7a8b43b
feat(core): create `Dictionary::get_word_map`
86xsk Feb 21, 2026
3a9c4bb
refactor(core)!: create `CommonDictFuncs`
86xsk Feb 21, 2026
516110f
refactor(core): avoid generics and use `WordMap` in more places
86xsk Feb 22, 2026
0605d95
refactor(core): fix warning by removing pointless borrow
86xsk Feb 22, 2026
5770756
refactor(core): remove `MutableDictionary`; alias as `WordMap`
86xsk Feb 22, 2026
727ebde
feat(core)!: create `WordMap::to_fst`
86xsk Feb 22, 2026
3e14c4a
refactor(core): `impl Extend<WordMapEntry> for WordMap`
86xsk Feb 22, 2026
99cb6d1
refactor(core): move `WordMapEntry` to its own module
86xsk Feb 22, 2026
d969a77
refactor(core): absorb `MutableDictionary` functions into `WordMap`
86xsk Feb 22, 2026
1831ccf
refactor(core): clean up code in `FstDictionary`
86xsk Feb 22, 2026
48df22f
feat(core): add std trait impls for `WordMap`
86xsk Feb 22, 2026
6d7e5ff
perf(core): avoid storing duplicated data in `FstDictionary`
86xsk Feb 22, 2026
9b84c61
perf(core)!: change `FstDictionary::new` to take `WordMap`
86xsk Feb 22, 2026
cb69409
refactor(core): remove unused import
86xsk Feb 22, 2026
0f18056
style(core): run `cargo fmt`
86xsk Feb 23, 2026
207a041
Merge branch 'master' into fix-dict-casing2
86xsk Feb 23, 2026
9493c16
Merge branch 'fix-dict-casing2' into fix-dict-casing2-refactor-dictio…
86xsk Feb 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions harper-asciidoc/tests/asciidoc_tests.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::sync::Arc;

use harper_asciidoc::AsciidocParser;
use harper_core::linting::{LintGroup, Linter};
use harper_core::spell::FstDictionary;
use harper_core::spell::WordMap;
use harper_core::{Dialect, Document};

/// Creates a unit test checking Asciidoc source code parsing.
Expand All @@ -18,7 +20,7 @@ macro_rules! create_test {
);

let parser = AsciidocParser::default();
let dict = FstDictionary::curated();
let dict = Arc::new(WordMap::curated());
let document = Document::new(&source, &parser, &dict);

let mut linter = LintGroup::new_curated(dict, Dialect::American);
Expand Down
16 changes: 7 additions & 9 deletions harper-brill/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,39 +1,37 @@
use std::num::NonZero;
use std::rc::Rc;
use std::sync::{Arc, LazyLock};
use std::sync::LazyLock;

pub use harper_pos_utils::{
BrillChunker, BrillTagger, BurnChunkerCpu, CachedChunker, Chunker, FreqDict, Tagger, UPOS,
};

const BRILL_TAGGER_SOURCE: &str = include_str!("../trained_tagger_model.json");

static BRILL_TAGGER: LazyLock<Arc<BrillTagger<FreqDict>>> =
LazyLock::new(|| Arc::new(uncached_brill_tagger()));
static BRILL_TAGGER: LazyLock<BrillTagger<FreqDict>> = LazyLock::new(uncached_brill_tagger);

/// Deserializes a fresh [`BrillTagger`] from the JSON model embedded in `BRILL_TAGGER_SOURCE`.
///
/// Nothing is cached here; the `BRILL_TAGGER` static uses this function as its lazy initializer.
///
/// # Panics
///
/// Panics if the embedded model cannot be deserialized into a `BrillTagger<FreqDict>`.
fn uncached_brill_tagger() -> BrillTagger<FreqDict> {
    // The model is compiled into the binary, so a parse failure is a build-time data bug.
    serde_json::from_str(BRILL_TAGGER_SOURCE)
        .expect("embedded `trained_tagger_model.json` should deserialize into a `BrillTagger`")
}

/// Get a reference to the shared, lazily-initialized [`BrillTagger`]. There will be only one
/// instance per-process.
pub fn brill_tagger() -> Arc<BrillTagger<FreqDict>> {
(*BRILL_TAGGER).clone()
pub fn brill_tagger() -> &'static BrillTagger<FreqDict> {
&BRILL_TAGGER
}

const BRILL_CHUNKER_SOURCE: &str = include_str!("../trained_chunker_model.json");

static BRILL_CHUNKER: LazyLock<Arc<BrillChunker>> =
LazyLock::new(|| Arc::new(uncached_brill_chunker()));
static BRILL_CHUNKER: LazyLock<BrillChunker> = LazyLock::new(uncached_brill_chunker);

/// Deserializes a fresh [`BrillChunker`] from the JSON model embedded in `BRILL_CHUNKER_SOURCE`.
///
/// Nothing is cached here; the `BRILL_CHUNKER` static uses this function as its lazy initializer.
///
/// # Panics
///
/// Panics if the embedded model cannot be deserialized into a `BrillChunker`.
fn uncached_brill_chunker() -> BrillChunker {
    // The model is compiled into the binary, so a parse failure is a build-time data bug.
    serde_json::from_str(BRILL_CHUNKER_SOURCE)
        .expect("embedded `trained_chunker_model.json` should deserialize into a `BrillChunker`")
}

/// Get a reference to the shared, lazily-initialized [`BrillChunker`]. There will be only one
/// instance per-process.
pub fn brill_chunker() -> Arc<BrillChunker> {
(*BRILL_CHUNKER).clone()
pub fn brill_chunker() -> &'static BrillChunker {
&BRILL_CHUNKER
}

const BURN_CHUNKER_VOCAB: &[u8; 627993] = include_bytes!("../finished_chunker/vocab.json");
Expand Down
5 changes: 4 additions & 1 deletion harper-cli/src/input/single_input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@ pub(crate) trait SingleInputTrait: InputTrait {
dictionary: &dyn Dictionary,
) -> anyhow::Result<(Document, Cow<'_, str>)> {
let text = self.get_content()?;
Ok((Document::new(&text, &parser, &dictionary), text))
Ok((
Document::new(&text, &parser, dictionary.get_word_map()),
text,
))
}

/// The parser that should be used to parse this input.
Expand Down
17 changes: 9 additions & 8 deletions harper-cli/src/lint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@ use hashbrown::HashMap;
use rayon::prelude::*;

use harper_core::{
CharString, Dialect, Document, Token, TokenKind,
linting::{Lint, LintGroup, LintGroupConfig, LintKind},
parsers::MarkdownOptions,
spell::{Dictionary, MergedDictionary, MutableDictionary},
remove_overlaps_map,
spell::{Dictionary, MergedDictionary, MutableDictionary, WordMapEntry},
weirpack::Weirpack,
{Dialect, DictWordMetadata, Document, Token, TokenKind, remove_overlaps_map},
};

use crate::input::{
Expand All @@ -28,9 +29,9 @@ fn load_dict(path: &Path) -> anyhow::Result<MutableDictionary> {
let str = fs::read_to_string(path)?;

let mut dict = MutableDictionary::new();
dict.extend_words(
dict.extend(
str.lines()
.map(|l| (l.chars().collect::<Vec<_>>(), DictWordMetadata::default())),
.map(|l| WordMapEntry::new(l.chars().collect::<CharString>())),
);

Ok(dict)
Expand Down Expand Up @@ -111,7 +112,7 @@ impl InputInfo<'_> {

pub fn lint(
markdown_options: MarkdownOptions,
curated_dictionary: Arc<dyn Dictionary>,
curated_dictionary: &'static dyn Dictionary,
mut inputs: Vec<AnyInput>,
mut lint_options: LintOptions,
user_dict_path: PathBuf,
Expand Down Expand Up @@ -348,11 +349,11 @@ fn lint_one_input(
Err(err) => eprintln!("{}", err),
Ok((doc, source)) => {
// Create the Lint Group from which we will lint this input, using the combined dictionary and the specified dialect
let mut lint_group = LintGroup::new_curated(merged_dictionary.into(), *dialect);
let mut lint_group = LintGroup::new_curated(Arc::new(merged_dictionary), *dialect);

for pack in weirpacks {
let mut pack_group = pack.to_lint_group()?;
lint_group.merge_from(&mut pack_group);
let pack_group = pack.to_lint_group()?;
lint_group.merge_from(pack_group);
}

// Turn specified rules on or off
Expand Down
36 changes: 24 additions & 12 deletions harper-cli/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#![doc = include_str!("../README.md")]

use harper_core::spell::{Dictionary, FstDictionary, MutableDictionary, WordId};
use harper_core::spell::{
CanonicalWordId, CommonDictFuncs, Dictionary, FstDictionary, MutableDictionary, WordMap,
};
use hashbrown::HashMap;
use std::collections::BTreeMap;
use std::fs::File;
use std::io::BufReader;
use std::path::PathBuf;
// use std::sync::Arc;
use std::sync::Arc;
use std::{fs, process};

use anyhow::anyhow;
Expand Down Expand Up @@ -382,7 +384,9 @@ fn main() -> anyhow::Result<()> {
];

for word in words {
let meta = curated_dictionary.get_word_metadata_str(&word);
let meta = curated_dictionary
.get_word_exact_str(&word)
.map(|word| &word.metadata);
let (flags, emojis) = meta.as_ref().map_or_else(
|| (String::new(), String::new()),
|md| {
Expand Down Expand Up @@ -460,7 +464,7 @@ fn main() -> anyhow::Result<()> {

if let Some((dict_word, dict_annot)) = &entry_in_dict {
println!("Old, from the dictionary:");
print_word_derivations(dict_word, dict_annot, &FstDictionary::curated());
print_word_derivations(dict_word, dict_annot, WordMap::curated());
};

if !annot.is_empty() {
Expand All @@ -471,7 +475,7 @@ fn main() -> anyhow::Result<()> {
)?;

println!("New, from you:");
print_word_derivations(&word, &annot, &dict);
print_word_derivations(&word, &annot, dict.get_word_map());
}

Ok(())
Expand All @@ -483,7 +487,7 @@ fn main() -> anyhow::Result<()> {
description: String,
}

let linter = LintGroup::new_curated(curated_dictionary, Dialect::American);
let linter = LintGroup::new_curated(Arc::new(curated_dictionary), Dialect::American);

let default_config: HashMap<String, bool> =
serde_json::from_str(&serde_json::to_string(&linter.config).unwrap()).unwrap();
Expand Down Expand Up @@ -878,7 +882,10 @@ fn main() -> anyhow::Result<()> {
let mut processed_words = HashMap::new();
let mut longest_word = 0;
for word in curated_dictionary.words_iter() {
if let Some(metadata) = curated_dictionary.get_word_metadata(word) {
if let Some(metadata) = curated_dictionary
.get_word_exact(word)
.map(|word| &word.metadata)
{
let orth = metadata.orth_info;
let bits = orth.bits() & case_bitmask.bits();

Expand Down Expand Up @@ -994,14 +1001,19 @@ fn line_to_parts(line: &str) -> (String, String) {
}
}

fn print_word_derivations(word: &str, annot: &str, dictionary: &impl Dictionary) {
fn print_word_derivations(word: &str, annot: &str, dictionary: &WordMap) {
println!("{word}/{annot}");

let id = WordId::from_word_str(word);
let id = CanonicalWordId::from_word_str(word);

let children = dictionary
.words_iter()
.filter(|e| dictionary.get_word_metadata(e).unwrap().derived_from == Some(id));
let children = dictionary.words_iter().filter(|e| {
dictionary
.get_word_exact(e)
.unwrap()
.metadata
.derived_from
.contains(id)
});

println!(" - {word}");

Expand Down
5 changes: 3 additions & 2 deletions harper-comments/tests/language_support.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use std::path::Path;
use std::sync::Arc;

use harper_comments::CommentParser;
use harper_core::linting::{LintGroup, Linter};
use harper_core::parsers::MarkdownOptions;
use harper_core::spell::FstDictionary;
use harper_core::spell::WordMap;
use harper_core::{Dialect, Document};

/// Creates a unit test checking that the linting of a source file in
Expand All @@ -23,7 +24,7 @@ macro_rules! create_test {
);

let parser = CommentParser::new_from_filename(Path::new(filename), MarkdownOptions::default()).unwrap();
let dict = FstDictionary::curated();
let dict = Arc::new(WordMap::curated());
let document = Document::new(&source, &parser, &dict);

let mut linter = LintGroup::new_curated(dict, Dialect::American);
Expand Down
1 change: 1 addition & 0 deletions harper-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ harper-brill = { path = "../harper-brill", version = "1.0.0" }
harper-thesaurus = { path = "../harper-thesaurus", version = "1.4.1", optional = true }
bitflags = { version = "2.11.0", features = ["serde"] }
trie-rs = "0.4.2"
indexmap = "2.12.1"
zip = { version = "8.0.0", default-features = false, features = ["deflate"] }
regex = "1.12.3"

Expand Down
4 changes: 3 additions & 1 deletion harper-core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ If you would prefer to run Harper from inside a JavaScript runtime, [we have a p
Here's what a full end-to-end linting pipeline could look like using `harper-core`.

```rust
use std::sync::Arc;

use harper_core::linting::{LintGroup, Linter};
use harper_core::parsers::PlainEnglish;
use harper_core::spell::FstDictionary;
Expand All @@ -26,7 +28,7 @@ let parser = PlainEnglish;

let document = Document::new_curated(text, &parser);

let dict = FstDictionary::curated();
let dict = Arc::new(FstDictionary::curated());
let mut linter = LintGroup::new_curated(dict, Dialect::American);

let lints = linter.lint(&document);
Expand Down
14 changes: 9 additions & 5 deletions harper-core/benches/parse_essay.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
use std::hint::black_box;
use std::sync::Arc;

use criterion::{Criterion, criterion_group, criterion_main};

use harper_core::linting::{LintGroup, Linter};
use harper_core::spell::FstDictionary;
use harper_core::spell::{Dictionary, FstDictionary};
use harper_core::{Dialect, Document};
use std::hint::black_box;

static ESSAY: &str = include_str!("./essay.md");

Expand All @@ -13,7 +16,7 @@ fn parse_essay(c: &mut Criterion) {
}

fn lint_essay(c: &mut Criterion) {
let dictionary = FstDictionary::curated();
let dictionary = Arc::new(FstDictionary::curated());
let mut lint_set = LintGroup::new_curated(dictionary, Dialect::American);
let document = Document::new_markdown_default_curated(black_box(ESSAY));

Expand All @@ -25,9 +28,10 @@ fn lint_essay(c: &mut Criterion) {
fn lint_essay_uncached(c: &mut Criterion) {
c.bench_function("lint_essay_uncached", |b| {
b.iter(|| {
let dictionary = FstDictionary::curated();
let dictionary = Arc::new(FstDictionary::curated());
let mut lint_set = LintGroup::new_curated(dictionary.clone(), Dialect::American);
let document = Document::new_markdown_default(black_box(ESSAY), &dictionary);
let document =
Document::new_markdown_default(black_box(ESSAY), dictionary.get_word_map());
lint_set.lint(&document)
})
});
Expand Down
1 change: 1 addition & 0 deletions harper-core/dictionary.dict
Original file line number Diff line number Diff line change
Expand Up @@ -8274,6 +8274,7 @@ PowerPoint/ONgV
Powers/NOg
Powhatan/NOg
Poznan/Og
Pr/ # Praseodymium
Prada/g
Prado/Og
Praetorian/Ng
Expand Down
Loading
Loading