Skip to content

Commit 10ca424

Browse files
committed
adding encoding detection as well as explicit constructor parameter
1 parent 568acd9 commit 10ca424

6 files changed

Lines changed: 238 additions & 19 deletions

File tree

src/EmbeddedResources.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ public Stream GetStream(string resourceName)
7575
case 1:
7676
return _assembly.GetManifestResourceStream(possibleCandidates[0]);
7777
default:
78-
throw new ArgumentException("Ambiguous name, cannot identify resource", "resName");
78+
throw new ArgumentException("Ambiguous name, cannot identify resource", nameof(resourceName));
7979
}
8080
}
8181
}

src/Parameters/Shared/SharedParameterFile.cs

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
1-
using CsvHelper;
1+
using CsvHelper;
22
using CsvHelper.Configuration;
33
using System;
44
using System.Collections;
55
using System.Collections.Generic;
66
using System.ComponentModel.DataAnnotations;
77
using System.Diagnostics;
8+
using System.Globalization;
89
using System.IO;
910
using System.Linq;
1011
using System.Text;
12+
using System.Text.RegularExpressions;
1113

1214
namespace CodeCave.Revit.Toolkit.Parameters.Shared
1315
{
@@ -37,21 +39,22 @@ public SharedParameterFile(Meta metadata = null, IEnumerable<Group> groups = nul
3739
/// Initializes a new instance of the <see cref="T:CodeCave.Revit.Toolkit.Parameters.Shared.SharedParameterFile" /> class.
3840
/// </summary>
3941
/// <param name="sharedParameterFile">The shared parameter file.</param>
42+
/// <param name="encoding">The encoding to use; when <c>null</c>, it is detected from the file being read.</param>
4043
/// ReSharper disable once SuggestBaseTypeForParameter
4144
/// <inheritdoc />
42-
public SharedParameterFile(FileInfo sharedParameterFile)
43-
: this(File.ReadAllText(sharedParameterFile?.FullName ?? throw new InvalidOperationException()))
44-
{
45-
}
45+
public SharedParameterFile(FileInfo sharedParameterFile, Encoding encoding = null)
46+
: this(sharedParameterFile?.FullName, encoding)
47+
{}
4648

4749
/// <summary>
48-
/// Initializes a new instance of the <see cref="SharedParameterFile"/> class.
50+
/// Initializes a new instance of the <see cref="SharedParameterFile" /> class.
4951
/// </summary>
5052
/// <param name="sharedParameterFile">The shared parameter file.</param>
53+
/// <param name="encoding">The encoding to use; when <c>null</c>, it is detected from the file being read.</param>
5154
/// <exception cref="ArgumentException">sharedParameterFile</exception>
5255
/// <exception cref="InvalidDataException">Failed to parse shared parameter file content," +
53-
/// "because it doesn't contain enough data for being qualified as a valid shared parameter file.</exception>
54-
public SharedParameterFile(string sharedParameterFile)
56+
/// "because it doesn't contain enough data for being qualified as a valid shared parameter file.</exception>
57+
public SharedParameterFile(string sharedParameterFile, Encoding encoding = null)
5558
{
5659
if (string.IsNullOrWhiteSpace(sharedParameterFile))
5760
{
@@ -60,7 +63,8 @@ public SharedParameterFile(string sharedParameterFile)
6063

6164
if (!SectionRegex.IsMatch(sharedParameterFile) && File.Exists(sharedParameterFile))
6265
{
63-
sharedParameterFile = File.ReadAllText(sharedParameterFile);
66+
Encoding = encoding ?? new FileInfo(sharedParameterFile).GetEncoding();
67+
sharedParameterFile = File.ReadAllText(sharedParameterFile, Encoding);
6468
}
6569

6670
var sharedParamsFileLines = SectionRegex
@@ -168,6 +172,14 @@ public SharedParameterFile(string sharedParameterFile)
168172
.ToList();
169173
}
170174

175+
/// <summary>
176+
/// Gets the encoding.
177+
/// </summary>
178+
/// <value>
179+
/// The encoding.
180+
/// </value>
181+
public Encoding Encoding { get; }
182+
171183
/// <summary>
172184
/// Gets or sets the meta-data section of the shared parameter file.
173185
/// </summary>
@@ -182,15 +194,15 @@ public SharedParameterFile(string sharedParameterFile)
182194
/// <value>
183195
/// The groups section of the shared parameter file.
184196
/// </value>
185-
public List<Group> Groups { get; } = new List<Group>();
197+
public List<Group> Groups { get; }
186198

187199
/// <summary>
188200
/// Gets or sets the parameters section of the shared parameter file.
189201
/// </summary>
190202
/// <value>
191203
/// The parameters section of the shared parameter file.
192204
/// </value>
193-
public List<Parameter> Parameters { get; } = new List<Parameter>();
205+
public List<Parameter> Parameters { get; }
194206

195207
/// <summary>
196208
/// Returns a <see cref="String" /> that represents this instance.

src/Parameters/Shared/SharedParameterFile_Data.cs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
using CsvHelper.Configuration;
44
using CsvHelper.TypeConversion;
55
using System.Collections.Generic;
6-
using System.Diagnostics;
76
using System.IO;
87
using System.Linq;
8+
using System.Text;
99
using System.Text.RegularExpressions;
1010

1111
namespace CodeCave.Revit.Toolkit.Parameters.Shared
@@ -38,6 +38,11 @@ static SharedParameterFile()
3838
DetectColumnCountChanges = false,
3939
QuoteNoFields = true
4040
};
41+
42+
#if !NET452
43+
// Allow the usage of ANSI encoding other than the default one
44+
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
45+
#endif
4146
}
4247

4348
/// <summary>
@@ -174,7 +179,7 @@ public class Parameter : IDefinition, IParameter
174179
/// <value>
175180
/// The type of the unit.
176181
/// </value>
177-
public UnitType UnitType { get; set; } = UnitType.UT_Undefined;
182+
public UnitType UnitType { get; internal set; } = UnitType.UT_Undefined;
178183

179184
/// <inheritdoc />
180185
/// <summary>
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
using System.IO;
2+
using System.Linq;
3+
using System.Text.RegularExpressions;
4+
5+
// ReSharper disable once CheckNamespace
6+
namespace System.Text
7+
{
8+
public static class FileInfoExtensions
9+
{
10+
/// <summary>
/// Matches a single character of the Unicode letter category (<c>\p{L}</c>).
/// </summary>
public static readonly Regex UnicodeLetters;

/// <summary>
/// Matches a single accented Latin letter from the fixed set listed in the pattern;
/// <c>IsInAnsiLatin1</c> counts these matches against the total number of letters.
/// </summary>
public static readonly Regex AnsiLatin1Mangled;

/// <summary>
/// Builds the shared, compiled regular expressions used by the encoding checks.
/// </summary>
static FileInfoExtensions()
{
    // Both patterns are created with the same option set.
    const RegexOptions sharedOptions = RegexOptions.Compiled | RegexOptions.Multiline;

    UnicodeLetters = new Regex(@"\p{L}", sharedOptions);
    AnsiLatin1Mangled = new Regex(
        @"[ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿŸ]",
        sharedOptions);
}
21+
22+
/// <summary>
/// Tries to detect the text encoding of a file.
/// </summary>
/// <remarks>
/// Detection order: byte order marks first (UTF-7, UTF-32 BE, UTF-32 LE, UTF-16 BE,
/// UTF-16 LE, UTF-8), then a strict UTF-8 decode of the whole file, and finally the
/// Windows-1252 heuristic implemented by <c>IsInAnsiLatin1</c>.
/// The 4-byte UTF-32 LE mark (FF FE 00 00) is tested before the 2-byte UTF-16 LE mark
/// (FF FE) because the latter is a prefix of the former.
/// </remarks>
/// <param name="file">The file to inspect.</param>
/// <param name="encoding">The detected encoding, or <c>null</c> when detection failed.</param>
/// <returns><c>true</c> if an encoding was detected; otherwise, <c>false</c>.</returns>
/// <exception cref="ArgumentException"><paramref name="file"/> is <c>null</c> or does not exist.</exception>
public static bool TryGetEncoding(this FileInfo file, out Encoding encoding)
{
    if (null == file || !file.Exists)
        throw new ArgumentException($"{nameof(file)} must be a valid path to a file!");

    // The longest byte order mark checked below is 4 bytes (UTF-32).
    var bom = new byte[4];
    var read = 0;
    using (var fs = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
    {
        // Stream.Read may return fewer bytes than requested, so keep reading until
        // the buffer is full or the end of the file is reached.
        int chunk;
        while (read < bom.Length && (chunk = fs.Read(bom, read, bom.Length - read)) > 0)
        {
            read += chunk;
        }
    }

    // Every BOM test also checks how many bytes were really read, so that the
    // zero padding of a short file is never mistaken for part of a mark.
    if (read >= 3 && 0x2B == bom[0] && 0x2F == bom[1] && 0x76 == bom[2])
    {
        encoding = Encoding.UTF7;
    }
    else if (read >= 4 && 0x00 == bom[0] && 0x00 == bom[1] && 0xFE == bom[2] && 0xFF == bom[3])
    {
        encoding = new UTF32Encoding(true, true); // UTF-32 big-endian with BOM
    }
    else if (read >= 4 && 0xFF == bom[0] && 0xFE == bom[1] && 0x00 == bom[2] && 0x00 == bom[3])
    {
        encoding = Encoding.UTF32; // UTF-32 little-endian
    }
    else if (read >= 2 && 0xFE == bom[0] && 0xFF == bom[1])
    {
        encoding = Encoding.GetEncoding(1201); // 1201 unicodeFFFE Unicode (UTF-16BE)
    }
    else if (read >= 2 && 0xFF == bom[0] && 0xFE == bom[1])
    {
        encoding = Encoding.GetEncoding(1200); // 1200 UTF-16 Unicode (UTF-16LE)
    }
    else if (read >= 3 && 0xEF == bom[0] && 0xBB == bom[1] && 0xBF == bom[2])
    {
        encoding = new UTF8Encoding(true); // UTF-8 with BOM
    }
    else if (file.IsInUtf8())
    {
        encoding = new UTF8Encoding(false); // UTF-8 without BOM
    }
    else if (file.IsInAnsiLatin1())
    {
        encoding = Encoding.GetEncoding(1252); // Windows-1252 (ANSI Latin 1)
    }
    else
    {
        encoding = null;
        return false;
    }

    return true;
}
78+
79+
/// <summary>
/// Detects the encoding of a file by delegating to <c>TryGetEncoding</c>.
/// </summary>
/// <param name="file">The file to inspect.</param>
/// <param name="throwIfNotDetected">
/// When <c>true</c>, a failed detection raises an <see cref="InvalidDataException"/>
/// instead of returning <c>null</c>.
/// </param>
/// <returns>
/// The detected encoding, or <c>null</c> when detection failed and
/// <paramref name="throwIfNotDetected"/> is <c>false</c>.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="file"/> is <c>null</c> or does not exist.</exception>
/// <exception cref="InvalidDataException">Detection failed and <paramref name="throwIfNotDetected"/> is <c>true</c>.</exception>
public static Encoding GetEncoding(this FileInfo file, bool throwIfNotDetected = false)
{
    if (file == null || !file.Exists)
    {
        throw new ArgumentException($"{nameof(file)} must be a valid path to a file!");
    }

    // Happy path: detection succeeded and produced an encoding.
    if (file.TryGetEncoding(out var detected) && detected != null)
    {
        return detected;
    }

    // Detection failed; the caller decides whether that is an error.
    if (!throwIfNotDetected)
    {
        return null;
    }

    throw new InvalidDataException(
        $"Unable to detect encoding automatically of the following shared parameter file: {file.FullName}. " +
        "Most likely it's a non-Latin ANSI, e.g. ANSI Cyrillic, Hebrew, Arabic, Greek, Turkish, Vietnamese etc"
    );
}
108+
109+
/// <summary>
/// Heuristically determines whether a file is encoded in Windows-1252 (ANSI Latin 1).
/// </summary>
/// <remarks>
/// The file is decoded as code page 1252, then the share of letters matched by
/// <c>AnsiLatin1Mangled</c> among all letters matched by <c>UnicodeLetters</c> is
/// computed as a percentage. A text without any letters has a rate of 0.
/// NOTE(review): on .NET Core / .NET Standard, code page 1252 is only available after
/// <c>CodePagesEncodingProvider</c> has been registered — confirm callers do so.
/// </remarks>
/// <param name="file">The file to inspect.</param>
/// <param name="mangledCharThreshold">The maximum accepted percentage (0-100) of accented letters.</param>
/// <returns>
/// <c>true</c> if the percentage of accented letters does not exceed
/// <paramref name="mangledCharThreshold"/>; otherwise, <c>false</c>.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="file"/> is <c>null</c> or does not exist.</exception>
public static bool IsInAnsiLatin1(this FileInfo file, double mangledCharThreshold = 60.0)
{
    if (null == file || !file.Exists)
        throw new ArgumentException($"{nameof(file)} must be a valid path to a file!");

    var ansiLatin1Encoding = Encoding.GetEncoding(1252);
    var ansiText = File.ReadAllText(file.FullName, ansiLatin1Encoding);

    var letterCount = UnicodeLetters.Matches(ansiText).Count;
    var mangledCount = AnsiLatin1Mangled.Matches(ansiText).Count;

    // Guard against division by zero for texts without letters, and divide in
    // floating point so the rate is not truncated before the comparison.
    var mangledRate = letterCount == 0
        ? 0.0
        : mangledCount * 100.0 / letterCount;

    return mangledRate <= mangledCharThreshold;
}
131+
132+
/// <summary>
/// Determines whether a file starts with the UTF-8 byte order mark (EF BB BF).
/// </summary>
/// <param name="file">The file to inspect.</param>
/// <returns>
/// <c>true</c> if the first three bytes of the file are EF BB BF; otherwise, <c>false</c>
/// (including when the file is shorter than three bytes).
/// </returns>
/// <exception cref="ArgumentException"><paramref name="file"/> is <c>null</c> or does not exist.</exception>
public static bool HasBomMarker(this FileInfo file)
{
    if (null == file || !file.Exists)
        throw new ArgumentException($"{nameof(file)} must be a valid path to a file!");

    // Only the three BOM bytes are needed.
    var buffer = new byte[3];
    var read = 0;
    using (var fs = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
    {
        // Stream.Read may return fewer bytes than requested, so keep reading until
        // the buffer is full or the end of the file is reached.
        int chunk;
        while (read < buffer.Length && (chunk = fs.Read(buffer, read, buffer.Length - read)) > 0)
        {
            read += chunk;
        }
    }

    return read == buffer.Length
        && 0xEF == buffer[0]
        && 0xBB == buffer[1]
        && 0xBF == buffer[2];
}
154+
155+
/// <summary>
/// Determines whether a byte sequence starts with the UTF-8 byte order mark (EF BB BF).
/// </summary>
/// <param name="bytes">The leading bytes of the content to inspect.</param>
/// <returns>
/// <c>true</c> if the first three bytes are EF BB BF; otherwise, <c>false</c>
/// (including when fewer than three bytes are supplied).
/// </returns>
/// <exception cref="ArgumentException"><paramref name="bytes"/> is <c>null</c> or empty.</exception>
public static bool HasBomMarker(byte[] bytes)
{
    if (bytes == null || bytes.Length == 0)
        throw new ArgumentException($"{nameof(bytes)} must be a non-empty array of bytes!");

    // The length check prevents an IndexOutOfRangeException for one- or two-byte
    // inputs that happen to start with EF / EF BB.
    return bytes.Length >= 3
        && 0xEF == bytes[0]
        && 0xBB == bytes[1]
        && 0xBF == bytes[2];
}
170+
171+
/// <summary>
/// Determines whether the whole content of a file is valid UTF-8.
/// </summary>
/// <remarks>
/// The file is decoded with a strict <see cref="UTF8Encoding"/> that throws on invalid
/// byte sequences. Automatic encoding detection from byte order marks is disabled:
/// otherwise the reader would switch to UTF-16/UTF-32 for files carrying those marks
/// and report them as UTF-8. A leading UTF-8 BOM is itself a valid UTF-8 sequence,
/// so files with and without it are both accepted.
/// </remarks>
/// <param name="file">The file to inspect.</param>
/// <returns>
/// <c>true</c> if the file decodes as UTF-8 without errors; otherwise, <c>false</c>.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="file"/> is <c>null</c> or does not exist.</exception>
public static bool IsInUtf8(this FileInfo file)
{
    if (null == file || !file.Exists)
        throw new ArgumentException($"{nameof(file)} must be a valid path to a file!");

    // encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true
    var strictUtf8 = new UTF8Encoding(false, true);

    try
    {
        using (var fileStream = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
        using (var streamReader = new StreamReader(fileStream, strictUtf8, false))
        {
            // Decoding the whole file is the check; the text itself is not needed.
            streamReader.ReadToEnd();
        }

        return true;
    }
    catch (DecoderFallbackException)
    {
        return false;
    }
}
200+
}
201+
}

src/Revit.Toolkit.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
<PackageReference Include="OpenMcdf-2" Version="2.1.1.5" />
6363
<PackageReference Include="System.ComponentModel.Annotations" Version="4.5.0" />
6464
<PackageReference Include="System.Drawing.Common" Version="4.5.0-preview1-25914-04" Condition="'$(TargetFramework)' == 'netstandard2.0'" />
65+
<PackageReference Include="System.Text.Encoding.CodePages" Version="4.5.0" Condition="'$(TargetFramework)' != 'net452'" />
6566
</ItemGroup>
6667

6768
<ItemGroup>

tests/SharedParameterFileTests.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ public void MetaIsParsedCorrectly()
6060
sharedParamFilePath =>
6161
{
6262
var sharedParamFile = new SharedParameterFile(sharedParamFilePath);
63-
var sharedParamFileText = File.ReadAllLines(sharedParamFilePath);
63+
var sharedParamFileText = File.ReadAllLines(sharedParamFilePath, sharedParamFile.Encoding);
6464
var metaRow = $"META\t{sharedParamFile.Metadata.Version}\t{sharedParamFile.Metadata.MinVersion}";
6565
var containsMeta = sharedParamFileText.Any(line => line.Contains(metaRow));
6666
Assert.True(containsMeta);
@@ -80,7 +80,7 @@ public void GroupsCountIsCorrect()
8080
sharedParamFilePath =>
8181
{
8282
var sharedParamFile = new SharedParameterFile(sharedParamFilePath);
83-
var sharedParamFileText = File.ReadAllText(sharedParamFilePath);
83+
var sharedParamFileText = File.ReadAllText(sharedParamFilePath, sharedParamFile.Encoding);
8484
var paramLineMatches = groupLineRegex.Matches(sharedParamFileText);
8585
Assert.Equal(paramLineMatches.Count, sharedParamFile.Groups?.Count);
8686
}
@@ -99,7 +99,7 @@ public void ParametersCountIsCorrect()
9999
sharedParamFilePath =>
100100
{
101101
var sharedParamFile = new SharedParameterFile(sharedParamFilePath);
102-
var sharedParamFileText = File.ReadAllText(sharedParamFilePath);
102+
var sharedParamFileText = File.ReadAllText(sharedParamFilePath, sharedParamFile.Encoding);
103103
var paramLineMatches = paramLineRegex.Matches(sharedParamFileText);
104104
Assert.Equal(paramLineMatches.Count, sharedParamFile.Parameters?.Count);
105105
}
@@ -121,7 +121,7 @@ public void ParametersAreParsedCorrectly()
121121
sharedParamFilePath =>
122122
{
123123
var sharedParamFile = new SharedParameterFile(sharedParamFilePath);
124-
var sharedParamFileText = File.ReadAllText(sharedParamFilePath);
124+
var sharedParamFileText = File.ReadAllText(sharedParamFilePath, sharedParamFile.Encoding);
125125
var paramLineMatches = paramLineRegex.Matches(sharedParamFileText);
126126
var paramNames = paramLineMatches.Select(m => m.Groups["name"]?.Value.Trim()).ToArray();
127127
var paramGuids = paramLineMatches.Select(m => m.Groups["guid"].Value).Select(g => new Guid(g)).ToArray();
@@ -152,7 +152,7 @@ public void GroupsAreParsedCorrectly()
152152
sharedParamFilePath =>
153153
{
154154
var sharedParamFile = new SharedParameterFile(sharedParamFilePath);
155-
var sharedParamFileText = File.ReadAllText(sharedParamFilePath);
155+
var sharedParamFileText = File.ReadAllText(sharedParamFilePath, sharedParamFile.Encoding);
156156
var groupLineMatches = groupLineRegex.Matches(sharedParamFileText);
157157
var groupIds = groupLineMatches.Select(m => m.Groups["id"].Value).Select(int.Parse).ToArray();
158158
var groupNames = groupLineMatches.Select(m => m.Groups["name"].Value).Select(name => name.TrimEnd('\t', '\r')).ToArray();

0 commit comments

Comments
 (0)