Skip to content

Commit 337f855

Browse files
Check BOM of USDA files and report errors if found
1 parent c41f2c1 commit 337f855

File tree

6 files changed

+73
-7
lines changed

6 files changed

+73
-7
lines changed

pxr/usd/sdf/testenv/testSdfParsing.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ def test_Basic(self):
4747
# This will mean that your new test runs first and you can spot
4848
# failures much quicker.
4949
testFiles = '''
50+
230_bad_layer_with_utf32_BOM.usda
51+
229_bad_layer_with_utf16_BOM.usda
52+
228_bad_layer_with_utf8_BOM.usda
5053
227_arrayEdits.usda
5154
226_version_1.1.usda
5255
225_multiline_with_SplineKnotParamList.usda
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#usda 1.1
2+
3+
def "Prim1"
4+
{
5+
}
Binary file not shown.
Binary file not shown.

pxr/usd/sdf/usdaFileFormat.cpp

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,25 +177,76 @@ SdfUsdaFileFormat::~SdfUsdaFileFormat()
177177
namespace
178178
{
179179

180+
bool _CheckBOM(const char* bufferRead, size_t bufferSize) {
181+
// Check for UTF-8 BOM (EF BB BF)
182+
if (bufferSize >= 3 &&
183+
static_cast<unsigned char>(bufferRead[0]) == 0xEF &&
184+
static_cast<unsigned char>(bufferRead[1]) == 0xBB &&
185+
static_cast<unsigned char>(bufferRead[2]) == 0xBF) {
186+
TF_WARN("Asset starts with UTF-8 BOM which is not supported, "
187+
"please convert the file to UTF-8 without the BOM.");
188+
return false;
189+
}
190+
191+
// Check for UTF-16 BOM markers (FE FF or FF FE)
192+
if (bufferSize >= 2 &&
193+
((static_cast<unsigned char>(bufferRead[0]) == 0xFE &&
194+
static_cast<unsigned char>(bufferRead[1]) == 0xFF) ||
195+
(static_cast<unsigned char>(bufferRead[0]) == 0xFF &&
196+
static_cast<unsigned char>(bufferRead[1]) == 0xFE))) {
197+
TF_WARN("Asset starts with UTF-16 BOM marker which is not supported, "
198+
"please convert the file to UTF-8 without the BOM.");
199+
return false;
200+
}
201+
202+
// Check for UTF-32 BOM markers (00 00 FE FF or FF FE 00 00)
203+
if (bufferSize >= 4 &&
204+
((static_cast<unsigned char>(bufferRead[0]) == 0x00 &&
205+
static_cast<unsigned char>(bufferRead[1]) == 0x00 &&
206+
static_cast<unsigned char>(bufferRead[2]) == 0xFE &&
207+
static_cast<unsigned char>(bufferRead[3]) == 0xFF) ||
208+
(static_cast<unsigned char>(bufferRead[0]) == 0xFF &&
209+
static_cast<unsigned char>(bufferRead[1]) == 0xFE &&
210+
static_cast<unsigned char>(bufferRead[2]) == 0x00 &&
211+
static_cast<unsigned char>(bufferRead[3]) == 0x00))) {
212+
TF_WARN("Asset starts with UTF-32 BOM marker which is not supported, "
213+
"please convert the file to UTF-8 without the BOM.");
214+
return false;
215+
}
216+
217+
return true;
218+
}
219+
180220
bool
181221
_CanReadImpl(const std::shared_ptr<ArAsset>& asset,
182-
const std::string& cookie)
222+
const std::string& cookie,
223+
bool bomCheckWarning = true)
183224
{
184225
TfErrorMark mark;
185226

227+
constexpr size_t BOM_CHECK_SIZE = 4;
186228
constexpr size_t COOKIE_BUFFER_SIZE = 512;
187229
char local[COOKIE_BUFFER_SIZE];
188230
std::unique_ptr<char []> remote;
189231
char *buf = local;
190232
size_t cookieLength = cookie.length();
191-
if (cookieLength > COOKIE_BUFFER_SIZE - 1) {
192-
remote.reset(new char[cookieLength + 1]);
233+
if (BOM_CHECK_SIZE + cookieLength > COOKIE_BUFFER_SIZE - 1) {
234+
remote = std::make_unique<char[]>(cookieLength + BOM_CHECK_SIZE + 1);
193235
buf = remote.get();
194236
}
195-
if (asset->Read(buf, cookieLength, /* offset = */ 0) != cookieLength) {
237+
// Maximum 4 bytes are needed to check for BOM markers.
238+
size_t bytesRead = asset->Read(buf, BOM_CHECK_SIZE + cookieLength, /* offset = */ 0);
239+
// At least the cookie length is needed to check for the cookie.
240+
if (bytesRead < cookieLength) {
241+
return false;
242+
}
243+
244+
// Check bom markers if requested
245+
if (bomCheckWarning && !_CheckBOM(buf, bytesRead)) {
196246
return false;
197247
}
198248

249+
// It doesn't have BOM markers, so we can check the cookie.
199250
buf[cookieLength] = '\0';
200251

201252
// Don't allow errors to escape this function, since this function is
@@ -224,15 +275,15 @@ SdfUsdaFileFormat::CanRead(const string& filePath) const
224275

225276
std::shared_ptr<ArAsset> asset = ArGetResolver().OpenAsset(
226277
ArResolvedPath(filePath));
227-
return asset && _CanReadImpl(asset, GetFileCookie());
278+
return asset && _CanReadImpl(asset, GetFileCookie(), false);
228279
}
229280

230281
bool
231282
SdfUsdaFileFormat::_CanReadFromAsset(
232283
const std::string& resolvedPath,
233284
const std::shared_ptr<ArAsset>& asset) const
234285
{
235-
return _CanReadImpl(asset, GetFileCookie());
286+
return _CanReadImpl(asset, GetFileCookie(), false);
236287
}
237288

238289
bool
@@ -261,7 +312,7 @@ SdfUsdaFileFormat::_ReadFromAsset(
261312
{
262313
// Quick check to see if the file has the magic cookie before spinning up
263314
// the parser.
264-
if (!_CanReadImpl(asset, GetFileCookie())) {
315+
if (!_CanReadImpl(asset, GetFileCookie(), true)) {
265316
TF_RUNTIME_ERROR("<%s> is not a valid %s layer",
266317
resolvedPath.c_str(),
267318
GetFormatId().GetText());

pxr/usd/usd/docs/utf8Overview.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ character and string. Users should think of UTF-8 strings as bytes representing
1818
"code points" in the Unicode code charts. A single code point may be represented
1919
by 1, 2, 3, or 4 byte sequences.
2020

21+
### Byte Order Mark (BOM)
22+
23+
The USD parser does not support Byte Order Marks (BOMs) at the beginning of USDA
24+
files. Files containing a BOM will be treated as invalid layers. To fix this,
25+
open your USDA file in a text editor and re-save it as UTF-8 without a BOM (most
26+
modern text editors default to UTF-8 without BOM).
27+
2128
### Replacement Code Point {#Usd_UTF_8_Encoding_Replacement}
2229

2330
Not every 1, 2, 3, or 4 byte sequence represents a valid UTF-8 code point. When

0 commit comments

Comments
 (0)