88namespace Lingua ;
99
1010/// <summary>
11- /// Detects language of given input text, and computes confidence values for every language considered possible
12- /// for given input text.
11+ /// Detects language of a given input text and computes confidence values for every language considered possible
12+ /// for the given input text.
1313/// </summary>
1414public sealed partial class LanguageDetector
1515{
@@ -85,6 +85,7 @@ public sealed partial class LanguageDetector
8585 private readonly bool _isLowAccuracyModeEnabled ;
8686 private readonly Dictionary < Alphabet , Language > _oneLanguageAlphabets ;
8787 private readonly IEnumerable < Language > _languagesWithUniqueCharacters ;
88+ private readonly string _languageModelsDirectory ;
8889
8990 private static readonly int [ ] LowAccuracyRange = [ 3 ] ;
9091 private static readonly int [ ] HighAccuracyRange = [ 1 , 2 , 3 , 4 , 5 ] ;
@@ -95,14 +96,16 @@ internal LanguageDetector(
9596 HashSet < Language > languages ,
9697 double minimumRelativeDistance = 0 ,
9798 bool isEveryLanguageModelPreloaded = false ,
98- bool isLowAccuracyModeEnabled = false )
99+ bool isLowAccuracyModeEnabled = false ,
100+ string ? languageModelsDirectory = null )
99101 {
100102 _languages = languages ;
101103 _minimumRelativeDistance = minimumRelativeDistance ;
102104 _isLowAccuracyModeEnabled = isLowAccuracyModeEnabled ;
103105 _oneLanguageAlphabets = AlphabetExtensions . AllSupportingExactlyOneLanguage ( )
104106 . Where ( a => languages . Contains ( a . Value ) ) . ToDictionary ( ) ;
105107 _languagesWithUniqueCharacters = languages . Where ( l => ! string . IsNullOrWhiteSpace ( l . UniqueCharacters ( ) ) ) ;
108+ _languageModelsDirectory = languageModelsDirectory ?? Path . Combine ( AppContext . BaseDirectory , "Lingua" , "LanguageModels" ) ;
106109
107110 if ( isEveryLanguageModelPreloaded )
108111 PreloadLanguageModels ( ) ;
@@ -195,13 +198,13 @@ public IDictionary<Language, double> ComputeLanguageConfidenceValues(string text
195198 Parallel . ForEach ( ngramSizeRange , i =>
196199 {
197200 var testDataModel = TestLanguageModel . FromText ( cleanedUpText , i ) ;
198- var probabilities = ComputeLanguageProbabilities ( testDataModel , filteredLanguages ) ;
201+ var probabilities = ComputeLanguageProbabilities ( testDataModel , filteredLanguages , _languageModelsDirectory ) ;
199202 if ( i == 1 )
200203 {
201204 var unigramFilteredLanguages = _languages . Count > 0
202205 ? filteredLanguages . Where ( f => _languages . Contains ( f ) ) . ToHashSet ( )
203206 : filteredLanguages ;
204- unigramCounts = CountUnigrams ( testDataModel , unigramFilteredLanguages ) ;
207+ unigramCounts = CountUnigrams ( testDataModel , unigramFilteredLanguages , _languageModelsDirectory ) ;
205208 }
206209 allProbabilities [ i - startValue ] = probabilities ;
207210 } ) ;
@@ -314,14 +317,14 @@ private static Dictionary<Language, double> SumUpProbabilities(
314317 return summedUpProbabilities ;
315318 }
316319
317- private static Dictionary < Language , int > CountUnigrams ( TestLanguageModel unigramLanguageModel , HashSet < Language > filteredLanguages )
320+ private static Dictionary < Language , int > CountUnigrams ( TestLanguageModel unigramLanguageModel , HashSet < Language > filteredLanguages , string languageModelDirectory )
318321 {
319322 var unigramCounts = new Dictionary < Language , int > ( ) ;
320323 foreach ( var language in filteredLanguages )
321324 {
322325 foreach ( var unigram in unigramLanguageModel . Ngrams )
323326 {
324- var probability = LookupNgramProbability ( language , unigram . AsSpan ( ) ) ;
327+ var probability = LookupNgramProbability ( language , unigram . AsSpan ( ) , languageModelDirectory ) ;
325328 if ( probability > 0 )
326329 unigramCounts . IncrementCounter ( language ) ;
327330 }
@@ -330,27 +333,27 @@ private static Dictionary<Language, int> CountUnigrams(TestLanguageModel unigram
330333 return unigramCounts ;
331334 }
332335
/// <summary>
/// Builds a per-language score for the given test model by calling
/// ComputeSumOfNgramProbabilities once for every language in <paramref name="filteredLanguages"/>.
/// </summary>
/// <param name="testModel">Test model whose Ngrams set is scored against each language.</param>
/// <param name="filteredLanguages">Languages to score.</param>
/// <param name="languageModelDirectory">
/// Directory passed through unchanged to ComputeSumOfNgramProbabilities (added by the "+" lines of this change).
/// </param>
/// <returns>A dictionary holding only the languages whose computed sum is strictly negative.</returns>
333- internal static Dictionary < Language , double > ComputeLanguageProbabilities ( TestLanguageModel testModel , IReadOnlySet < Language > filteredLanguages )
336+ internal static Dictionary < Language , double > ComputeLanguageProbabilities ( TestLanguageModel testModel , IReadOnlySet < Language > filteredLanguages , string languageModelDirectory )
334337 {
335338 var probabilities = new Dictionary < Language , double > ( ) ;
336339 foreach ( var language in filteredLanguages )
337340 {
338- var sum = ComputeSumOfNgramProbabilities ( language , testModel . Ngrams ) ;
341+ var sum = ComputeSumOfNgramProbabilities ( language , testModel . Ngrams , languageModelDirectory ) ;
// Languages whose sum is not below zero get no entry at all. The helper starts its sum at 0 and adds
// Math.Log of each positive probability, so a sum of 0 presumably means nothing matched — TODO confirm.
339342 if ( sum < 0 )
340343 probabilities [ language ] = sum ;
341344 }
342345
343346 return probabilities ;
344347 }
345348
346- internal static double ComputeSumOfNgramProbabilities ( Language language , HashSet < Ngram > ngrams )
349+ internal static double ComputeSumOfNgramProbabilities ( Language language , HashSet < Ngram > ngrams , string languageModelDirectory )
347350 {
348351 var sum = 0d ;
349352 foreach ( var ngram in ngrams )
350353 {
351354 foreach ( var elem in ngram . LowerOrderNGrams ( ) )
352355 {
353- var probability = LookupNgramProbability ( language , elem ) ;
356+ var probability = LookupNgramProbability ( language , elem , languageModelDirectory ) ;
354357 if ( probability > 0 )
355358 {
356359 sum += Math . Log ( probability ) ;
@@ -362,9 +365,9 @@ internal static double ComputeSumOfNgramProbabilities(Language language, HashSet
362365 return sum ;
363366 }
364367
365- internal static double LookupNgramProbability ( Language language , ReadOnlySpan < char > ngram )
368+ internal static double LookupNgramProbability ( Language language , ReadOnlySpan < char > ngram , string languageModelDirectory )
366369 {
367- var model = LoadLanguageModel ( language , ngram . Length ) ;
370+ var model = LoadLanguageModel ( language , ngram . Length , languageModelDirectory ) ;
368371#if NET9_0
369372 var lookup = model . GetAlternateLookup < ReadOnlySpan < char > > ( ) ;
370373 return lookup . TryGetValue ( ngram , out var result ) ? result : 0 ;
@@ -374,7 +377,7 @@ internal static double LookupNgramProbability(Language language, ReadOnlySpan<ch
374377#endif
375378 }
376379
377- private static FrozenDictionary < string , double > LoadLanguageModel ( Language language , int ngramLength )
380+ private static FrozenDictionary < string , double > LoadLanguageModel ( Language language , int ngramLength , string languageModelDirectory )
378381 {
379382 var languageModels = ngramLength switch
380383 {
@@ -387,7 +390,7 @@ private static FrozenDictionary<string, double> LoadLanguageModel(Language langu
387390 _ => throw new ArgumentException ( $ "unsupported ngram length detected: ${ ngramLength } ")
388391 } ;
389392
390- return LoadLanguageModels ( languageModels , language , ngramLength ) ;
393+ return LoadLanguageModels ( languageModels , language , ngramLength , languageModelDirectory ) ;
391394 }
392395
393396 private void PreloadLanguageModels ( )
@@ -405,33 +408,34 @@ private void PreloadLanguageModels()
405408 switch ( ngramLength )
406409 {
407410 case 1 :
408- LoadLanguageModels ( UnigramLanguageModels , language , ngramLength ) ;
411+ LoadLanguageModels ( UnigramLanguageModels , language , ngramLength , _languageModelsDirectory ) ;
409412 break ;
410413 case 2 :
411- LoadLanguageModels ( BigramLanguageModels , language , ngramLength ) ;
414+ LoadLanguageModels ( BigramLanguageModels , language , ngramLength , _languageModelsDirectory ) ;
412415 break ;
413416 case 3 :
414- LoadLanguageModels ( TrigramLanguageModels , language , ngramLength ) ;
417+ LoadLanguageModels ( TrigramLanguageModels , language , ngramLength , _languageModelsDirectory ) ;
415418 break ;
416419 case 4 :
417- LoadLanguageModels ( QuadrigramLanguageModels , language , ngramLength ) ;
420+ LoadLanguageModels ( QuadrigramLanguageModels , language , ngramLength , _languageModelsDirectory ) ;
418421 break ;
419422 case 5 :
420- LoadLanguageModels ( FivegramLanguageModels , language , ngramLength ) ;
423+ LoadLanguageModels ( FivegramLanguageModels , language , ngramLength , _languageModelsDirectory ) ;
421424 break ;
422425 }
423426 } ) ;
424427 }
425428
/// <summary>
/// Returns the n-gram model stored for <paramref name="language"/> in <paramref name="languageModels"/>.
/// When the language has no entry yet, GetOrAdd stores a Lazy whose factory calls ReadLanguageModel;
/// reading .Value then yields the model.
/// </summary>
/// <param name="languageModels">Cache of lazily created models, keyed by language.</param>
/// <param name="language">Language whose model is requested.</param>
/// <param name="ngramLength">N-gram length forwarded to ReadLanguageModel.</param>
/// <param name="languageModelDirectory">
/// Directory forwarded to ReadLanguageModel (added by the "+" lines of this change). It is bundled with
/// ngramLength into a tuple and handed to GetOrAdd as the factory argument, which lets the outer lambda stay static.
/// </param>
/// <returns>The model dictionary for the language.</returns>
// NOTE(review): the cache key is the language alone. Once an entry exists, a later call that passes a
// different languageModelDirectory still receives the already cached model and the new directory is ignored.
// Confirm this is intended if detectors configured with different directories can share these dictionaries.
426- private static FrozenDictionary < string , double > LoadLanguageModels ( ConcurrentDictionary < Language , Lazy < FrozenDictionary < string , double > > > languageModels , Language language , int ngramLength ) =>
427- languageModels . GetOrAdd ( language , static ( l , nl ) =>
428- new Lazy < FrozenDictionary < string , double > > ( ( ) => ReadLanguageModel ( l , nl ) ) , ngramLength ) . Value ;
429+ private static FrozenDictionary < string , double > LoadLanguageModels ( ConcurrentDictionary < Language , Lazy < FrozenDictionary < string , double > > > languageModels , Language language , int ngramLength , string languageModelDirectory ) =>
430+ languageModels . GetOrAdd ( language , static ( l , arg ) =>
431+ new Lazy < FrozenDictionary < string , double > > ( ( ) =>
432+ ReadLanguageModel ( l , arg . ngramLength , arg . languageModelDirectory ) ) , ( ngramLength , languageModelDirectory ) ) . Value ;
429433
430- private static FrozenDictionary < string , double > ReadLanguageModel ( Language language , int ngramLength )
434+ private static FrozenDictionary < string , double > ReadLanguageModel ( Language language , int ngramLength , string languageModelDirectory )
431435 {
432436 var isoCode = language . IsoCode6391 ( ) . ToString ( ) . ToLowerInvariant ( ) ;
433437 var nGramName = Ngram . GetNameByLength ( ngramLength ) ;
434- var file = Path . Combine ( "Lingua" , "LanguageModels" , isoCode , $ "{ nGramName } s.json.br") ;
438+ var file = Path . Combine ( languageModelDirectory , isoCode , $ "{ nGramName } s.json.br") ;
435439
436440 try
437441 {
0 commit comments