Skip to content

Commit c83f65b

Browse files
authored
Allow language models directory to be specified (#9)
This commit allows a language models directory to be specified from which to load language models. By default, language models will be loaded from the `Lingua/LanguageModels` directory, relative to `AppContext.BaseDirectory`.
1 parent bd42b02 commit c83f65b

4 files changed

Lines changed: 170 additions & 38 deletions

File tree

src/Lingua/LanguageDetector.cs

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
namespace Lingua;
99

1010
/// <summary>
11-
/// Detects language of given input text, and computes confidence values for every language considered possible
12-
/// for given input text.
11+
/// Detects language of a given input text and computes confidence values for every language considered possible
12+
/// for the given input text.
1313
/// </summary>
1414
public sealed partial class LanguageDetector
1515
{
@@ -85,6 +85,7 @@ public sealed partial class LanguageDetector
8585
private readonly bool _isLowAccuracyModeEnabled;
8686
private readonly Dictionary<Alphabet, Language> _oneLanguageAlphabets;
8787
private readonly IEnumerable<Language> _languagesWithUniqueCharacters;
88+
private readonly string _languageModelsDirectory;
8889

8990
private static readonly int[] LowAccuracyRange = [3];
9091
private static readonly int[] HighAccuracyRange = [1, 2, 3, 4, 5];
@@ -95,14 +96,16 @@ internal LanguageDetector(
9596
HashSet<Language> languages,
9697
double minimumRelativeDistance = 0,
9798
bool isEveryLanguageModelPreloaded = false,
98-
bool isLowAccuracyModeEnabled = false)
99+
bool isLowAccuracyModeEnabled = false,
100+
string? languageModelsDirectory = null)
99101
{
100102
_languages = languages;
101103
_minimumRelativeDistance = minimumRelativeDistance;
102104
_isLowAccuracyModeEnabled = isLowAccuracyModeEnabled;
103105
_oneLanguageAlphabets = AlphabetExtensions.AllSupportingExactlyOneLanguage()
104106
.Where(a => languages.Contains(a.Value)).ToDictionary();
105107
_languagesWithUniqueCharacters = languages.Where(l => !string.IsNullOrWhiteSpace(l.UniqueCharacters()));
108+
_languageModelsDirectory = languageModelsDirectory ?? Path.Combine(AppContext.BaseDirectory, "Lingua", "LanguageModels");
106109

107110
if (isEveryLanguageModelPreloaded)
108111
PreloadLanguageModels();
@@ -195,13 +198,13 @@ public IDictionary<Language, double> ComputeLanguageConfidenceValues(string text
195198
Parallel.ForEach(ngramSizeRange, i =>
196199
{
197200
var testDataModel = TestLanguageModel.FromText(cleanedUpText, i);
198-
var probabilities = ComputeLanguageProbabilities(testDataModel, filteredLanguages);
201+
var probabilities = ComputeLanguageProbabilities(testDataModel, filteredLanguages, _languageModelsDirectory);
199202
if (i == 1)
200203
{
201204
var unigramFilteredLanguages = _languages.Count > 0
202205
? filteredLanguages.Where(f => _languages.Contains(f)).ToHashSet()
203206
: filteredLanguages;
204-
unigramCounts = CountUnigrams(testDataModel, unigramFilteredLanguages);
207+
unigramCounts = CountUnigrams(testDataModel, unigramFilteredLanguages, _languageModelsDirectory);
205208
}
206209
allProbabilities[i - startValue] = probabilities;
207210
});
@@ -314,14 +317,14 @@ private static Dictionary<Language, double> SumUpProbabilities(
314317
return summedUpProbabilities;
315318
}
316319

317-
private static Dictionary<Language, int> CountUnigrams(TestLanguageModel unigramLanguageModel, HashSet<Language> filteredLanguages)
320+
private static Dictionary<Language, int> CountUnigrams(TestLanguageModel unigramLanguageModel, HashSet<Language> filteredLanguages, string languageModelDirectory)
318321
{
319322
var unigramCounts = new Dictionary<Language, int>();
320323
foreach (var language in filteredLanguages)
321324
{
322325
foreach (var unigram in unigramLanguageModel.Ngrams)
323326
{
324-
var probability = LookupNgramProbability(language, unigram.AsSpan());
327+
var probability = LookupNgramProbability(language, unigram.AsSpan(), languageModelDirectory);
325328
if (probability > 0)
326329
unigramCounts.IncrementCounter(language);
327330
}
@@ -330,27 +333,27 @@ private static Dictionary<Language, int> CountUnigrams(TestLanguageModel unigram
330333
return unigramCounts;
331334
}
332335

333-
internal static Dictionary<Language, double> ComputeLanguageProbabilities(TestLanguageModel testModel, IReadOnlySet<Language> filteredLanguages)
336+
internal static Dictionary<Language, double> ComputeLanguageProbabilities(TestLanguageModel testModel, IReadOnlySet<Language> filteredLanguages, string languageModelDirectory)
334337
{
335338
var probabilities = new Dictionary<Language, double>();
336339
foreach (var language in filteredLanguages)
337340
{
338-
var sum = ComputeSumOfNgramProbabilities(language, testModel.Ngrams);
341+
var sum = ComputeSumOfNgramProbabilities(language, testModel.Ngrams, languageModelDirectory);
339342
if (sum < 0)
340343
probabilities[language] = sum;
341344
}
342345

343346
return probabilities;
344347
}
345348

346-
internal static double ComputeSumOfNgramProbabilities(Language language, HashSet<Ngram> ngrams)
349+
internal static double ComputeSumOfNgramProbabilities(Language language, HashSet<Ngram> ngrams, string languageModelDirectory)
347350
{
348351
var sum = 0d;
349352
foreach (var ngram in ngrams)
350353
{
351354
foreach (var elem in ngram.LowerOrderNGrams())
352355
{
353-
var probability = LookupNgramProbability(language, elem);
356+
var probability = LookupNgramProbability(language, elem, languageModelDirectory);
354357
if (probability > 0)
355358
{
356359
sum += Math.Log(probability);
@@ -362,9 +365,9 @@ internal static double ComputeSumOfNgramProbabilities(Language language, HashSet
362365
return sum;
363366
}
364367

365-
internal static double LookupNgramProbability(Language language, ReadOnlySpan<char> ngram)
368+
internal static double LookupNgramProbability(Language language, ReadOnlySpan<char> ngram, string languageModelDirectory)
366369
{
367-
var model = LoadLanguageModel(language, ngram.Length);
370+
var model = LoadLanguageModel(language, ngram.Length, languageModelDirectory);
368371
#if NET9_0
369372
var lookup = model.GetAlternateLookup<ReadOnlySpan<char>>();
370373
return lookup.TryGetValue(ngram, out var result) ? result : 0;
@@ -374,7 +377,7 @@ internal static double LookupNgramProbability(Language language, ReadOnlySpan<ch
374377
#endif
375378
}
376379

377-
private static FrozenDictionary<string, double> LoadLanguageModel(Language language, int ngramLength)
380+
private static FrozenDictionary<string, double> LoadLanguageModel(Language language, int ngramLength, string languageModelDirectory)
378381
{
379382
var languageModels = ngramLength switch
380383
{
@@ -387,7 +390,7 @@ private static FrozenDictionary<string, double> LoadLanguageModel(Language langu
387390
_ => throw new ArgumentException($"unsupported ngram length detected: ${ngramLength}")
388391
};
389392

390-
return LoadLanguageModels(languageModels, language, ngramLength);
393+
return LoadLanguageModels(languageModels, language, ngramLength, languageModelDirectory);
391394
}
392395

393396
private void PreloadLanguageModels()
@@ -405,33 +408,34 @@ private void PreloadLanguageModels()
405408
switch (ngramLength)
406409
{
407410
case 1:
408-
LoadLanguageModels(UnigramLanguageModels, language, ngramLength);
411+
LoadLanguageModels(UnigramLanguageModels, language, ngramLength, _languageModelsDirectory);
409412
break;
410413
case 2:
411-
LoadLanguageModels(BigramLanguageModels, language, ngramLength);
414+
LoadLanguageModels(BigramLanguageModels, language, ngramLength, _languageModelsDirectory);
412415
break;
413416
case 3:
414-
LoadLanguageModels(TrigramLanguageModels, language, ngramLength);
417+
LoadLanguageModels(TrigramLanguageModels, language, ngramLength, _languageModelsDirectory);
415418
break;
416419
case 4:
417-
LoadLanguageModels(QuadrigramLanguageModels, language, ngramLength);
420+
LoadLanguageModels(QuadrigramLanguageModels, language, ngramLength, _languageModelsDirectory);
418421
break;
419422
case 5:
420-
LoadLanguageModels(FivegramLanguageModels, language, ngramLength);
423+
LoadLanguageModels(FivegramLanguageModels, language, ngramLength, _languageModelsDirectory);
421424
break;
422425
}
423426
});
424427
}
425428

426-
private static FrozenDictionary<string, double> LoadLanguageModels(ConcurrentDictionary<Language, Lazy<FrozenDictionary<string, double>>> languageModels, Language language, int ngramLength) =>
427-
languageModels.GetOrAdd(language, static (l, nl) =>
428-
new Lazy<FrozenDictionary<string, double>>(() => ReadLanguageModel(l, nl)), ngramLength).Value;
429+
private static FrozenDictionary<string, double> LoadLanguageModels(ConcurrentDictionary<Language, Lazy<FrozenDictionary<string, double>>> languageModels, Language language, int ngramLength, string languageModelDirectory) =>
430+
languageModels.GetOrAdd(language, static (l, arg) =>
431+
new Lazy<FrozenDictionary<string, double>>(() =>
432+
ReadLanguageModel(l, arg.ngramLength, arg.languageModelDirectory)), (ngramLength, languageModelDirectory)).Value;
429433

430-
private static FrozenDictionary<string, double> ReadLanguageModel(Language language, int ngramLength)
434+
private static FrozenDictionary<string, double> ReadLanguageModel(Language language, int ngramLength, string languageModelDirectory)
431435
{
432436
var isoCode = language.IsoCode6391().ToString().ToLowerInvariant();
433437
var nGramName = Ngram.GetNameByLength(ngramLength);
434-
var file = Path.Combine("Lingua", "LanguageModels", isoCode, $"{nGramName}s.json.br");
438+
var file = Path.Combine(languageModelDirectory, isoCode, $"{nGramName}s.json.br");
435439

436440
try
437441
{

src/Lingua/LanguageDetectorBuilder.cs

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ public class LanguageDetectorBuilder
99
private double _minimumRelativeDistance;
1010
private bool _isEveryLanguageModelPreloaded;
1111
private bool _isLowAccuracyModeEnabled;
12+
private string? _languageModelsDirectory;
1213

1314
private LanguageDetectorBuilder(HashSet<Language> languages) => _languages = languages;
1415

@@ -96,7 +97,7 @@ public static LanguageDetectorBuilder FromLanguages(params Language[] languages)
9697
/// <summary>
9798
/// Sets the desired value for the minimum relative distance measure.
9899
/// <para />
99-
/// By default, *Lingua* returns the most likely language for a given
100+
/// By default, Lingua returns the most likely language for a given
100101
/// input text. However, there are certain words that are spelled the
101102
/// same in more than one language. The word *prologue*, for instance,
102103
/// is both a valid English and French word. Lingua would output either
@@ -127,7 +128,7 @@ public LanguageDetectorBuilder WithMinimumRelativeDistance(double distance)
127128
/// <summary>
128129
/// Preloads all language models when creating the instance of <see cref="Lingua.LanguageDetector"/>.
129130
/// <para />
130-
/// By default, *Lingua* uses lazy-loading to load only those language models
131+
/// By default, Lingua uses lazy-loading to load only those language models
131132
/// on demand which are considered relevant by the rule-based filter engine.
132133
/// For web services, for instance, it is rather beneficial to preload all language
133134
/// models into memory to avoid unexpected latency while waiting for the
@@ -142,7 +143,7 @@ public LanguageDetectorBuilder WithPreloadedLanguageModels()
142143
/// <summary>
143144
/// Disables the high accuracy mode in order to save memory and increase performance.
144145
/// <para />
145-
/// By default, *Lingua's* high detection accuracy comes at the cost of
146+
/// By default, Lingua's high detection accuracy comes at the cost of
146147
/// loading large language models into memory which might not be feasible
147148
/// for systems running low on resources.
148149
/// <para />
@@ -158,6 +159,39 @@ public LanguageDetectorBuilder WithLowAccuracyMode()
158159
return this;
159160
}
160161

162+
/// <summary>
163+
/// Sets the directory from which to load language models.
164+
/// <para />
165+
/// By default, Lingua loads language models from the directory "Lingua/LanguageModels" relative
166+
/// to <see cref="AppContext.BaseDirectory"/>.
167+
/// <para />
168+
/// Lingua expects language models to be stored in that directory using the following directory structure:
169+
/// <code>
170+
/// &lt;languageModelsDirectory>/&lt;ISO639-1 code>/&lt;Ngram length name>.json.br
171+
/// </code>
172+
/// where
173+
/// <list type="bullet">
174+
/// <item>
175+
/// &lt;languageModelsDirectory> is the directory specified by this method
176+
/// </item>
177+
/// <item>
178+
/// &lt;ISO639-1 code> is a directory named after the lowercase ISO 639-1 code of the language whose models are stored in it
179+
/// </item>
180+
/// <item>
181+
/// &lt;Ngram length name> is the name of the ngram length: unigrams, bigrams, trigrams, quadrigrams, fivegrams
182+
/// </item>
183+
/// </list>
184+
/// </summary>
185+
/// <exception cref="DirectoryNotFoundException">If the specified directory does not exist</exception>
186+
public LanguageDetectorBuilder WithLanguageModelsDirectory(string languageModelsDirectory)
187+
{
188+
if (!Directory.Exists(languageModelsDirectory))
189+
throw new DirectoryNotFoundException($"Directory '{languageModelsDirectory}' does not exist");
190+
191+
_languageModelsDirectory = languageModelsDirectory;
192+
return this;
193+
}
194+
161195
/// <summary>
162196
/// Builds a new instance of <see cref="Lingua.LanguageDetector"/>.
163197
/// </summary>
@@ -166,5 +200,6 @@ public LanguageDetector Build() =>
166200
new(_languages,
167201
_minimumRelativeDistance,
168202
_isEveryLanguageModelPreloaded,
169-
_isLowAccuracyModeEnabled);
203+
_isLowAccuracyModeEnabled,
204+
_languageModelsDirectory);
170205
}

0 commit comments

Comments
 (0)