Skip to content
Open
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 93 additions & 33 deletions sherpa-onnx/csrc/file-utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,30 @@
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/stat.h>
#include <unistd.h>
#include <limits.h>
#include <stdlib.h>
#endif

#include "sherpa-onnx/csrc/macros.h"

namespace sherpa_onnx {
std::wstring ToWideString(const std::string &s);
std::string ToString(const std::wstring &s);
Comment on lines 23 to +26
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ToWideString/ToString are forward-declared here but defined in text-utils.h/.cc. Prefer including the header instead of duplicating declarations so the dependency is explicit and signatures stay in sync.

Suggested change
namespace sherpa_onnx {
std::wstring ToWideString(const std::string &s);
std::string ToString(const std::wstring &s);
#include "sherpa-onnx/csrc/text-utils.h"
namespace sherpa_onnx {

Copilot uses AI. Check for mistakes.

bool FileExists(const std::string &filename) {
return std::ifstream(filename).good();
try {
#ifdef _WIN32
DWORD attributes = GetFileAttributesW(ToWideString(filename).c_str());

return attributes != INVALID_FILE_ATTRIBUTES && !(attributes & FILE_ATTRIBUTE_DIRECTORY);
#else
struct stat file_stat;
return stat(filename.c_str(), &file_stat) == 0 && S_ISREG(file_stat.st_mode);
#endif
} catch (const std::exception&) {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you describe which function in this code can throw?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

谢谢,确实触发概率太低,我将去掉

return false;
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file now catches std::exception, but it does not include <exception>. Add the direct standard header to avoid relying on transitive includes.

Copilot uses AI. Check for mistakes.
}
}

void AssertFileExists(const std::string &filename) {
Expand All @@ -33,20 +47,61 @@ void AssertFileExists(const std::string &filename) {
}

std::vector<char> ReadFile(const std::string &filename) {
std::ifstream file(filename, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
if (filename.empty()) {
return {};
}
try {
#ifdef _WIN32
HANDLE hFile = CreateFileW(
ToWideString(filename).c_str(),
GENERIC_READ,
FILE_SHARE_READ,
nullptr,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL,
nullptr
);

if (hFile == INVALID_HANDLE_VALUE) {
return {};
}

std::unique_ptr<void, decltype(&CloseHandle)> file_guard(
hFile, CloseHandle);

LARGE_INTEGER file_size;
if (!GetFileSizeEx(hFile, &file_size) || file_size.QuadPart > SIZE_MAX) {
return {};
}

std::vector<char> buffer(static_cast<size_t>(file_size.QuadPart));

DWORD bytes_read = 0;
if (!::ReadFile(hFile, buffer.data(), static_cast<DWORD>(buffer.size()), &bytes_read, nullptr) ||
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The condition is too complicated. Please split it across multiple lines of code to simplify it.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

好滴

bytes_read != buffer.size()) {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Examine the file around lines 73-81
nl -ba sherpa-onnx/csrc/file-utils.cc | sed -n '65,90p'

Repository: k2-fsa/sherpa-onnx

Length of output: 102


🏁 Script executed:

#!/bin/bash
# Examine the file around lines 73-81 using cat
cat -n sherpa-onnx/csrc/file-utils.cc | sed -n '65,90p'

Repository: k2-fsa/sherpa-onnx

Length of output: 939


🏁 Script executed:

#!/bin/bash
# Look at the start of the file for includes and to understand context
head -n 64 sherpa-onnx/csrc/file-utils.cc

Repository: k2-fsa/sherpa-onnx

Length of output: 1509


🏁 Script executed:

#!/bin/bash
# Check the actual definitions of these constants in Windows
python3 << 'EOF'
# On 64-bit systems:
SIZE_MAX_64 = (1 << 64) - 1  # 2^64 - 1
MAXDWORD = (1 << 32) - 1     # 2^32 - 1 = 4,294,967,295

print(f"SIZE_MAX on 64-bit: {SIZE_MAX_64:,} bytes ({SIZE_MAX_64 / (1024**4):.1f} TB)")
print(f"MAXDWORD (ReadFile limit): {MAXDWORD:,} bytes ({MAXDWORD / (1024**3):.2f} GB)")
print(f"\nIf a file is 5GB:")
print(f"  - Passes SIZE_MAX check: YES")
print(f"  - Allocated buffer size: 5GB")
print(f"  - ReadFile call truncates to: {MAXDWORD:,} bytes")
print(f"  - bytes_read: {MAXDWORD:,} bytes")
print(f"  - buffer.size(): 5,368,709,120 bytes")
print(f"  - bytes_read != buffer.size(): TRUE (causes return {})")
print(f"\nResult: File silently fails to read, appearing as read failure")
EOF

Repository: k2-fsa/sherpa-onnx

Length of output: 281


🏁 Script executed:

#!/bin/bash
# Check the exact definitions of these constants in Windows
python3 << 'EOF'
# On 64-bit systems:
SIZE_MAX_64 = (1 << 64) - 1  # 2^64 - 1
MAXDWORD = (1 << 32) - 1     # 2^32 - 1 = 4,294,967,295

print(f"SIZE_MAX on 64-bit: {SIZE_MAX_64:,} bytes ({SIZE_MAX_64 / (1024**4):.1f} TB)")
print(f"MAXDWORD (ReadFile limit): {MAXDWORD:,} bytes ({MAXDWORD / (1024**3):.2f} GB)")
print(f"\nIf a file is 5GB:")
print(f"  - Passes SIZE_MAX check: YES")
print(f"  - Allocated buffer size: 5GB")
print(f"  - ReadFile call truncates to: {MAXDWORD:,} bytes")
print(f"  - bytes_read: {MAXDWORD:,} bytes")
print(f"  - buffer.size(): 5,368,709,120 bytes")
print(f"  - Comparison bytes_read != buffer.size(): TRUE")
print(f"\nResult: File silently fails to read, appearing as read failure")
EOF

Repository: k2-fsa/sherpa-onnx

Length of output: 508


Guard against ReadFile DWORD byte-count limit for large files.

Files larger than 4 GB (MAXDWORD) will silently fail to read. At line 80, ReadFile can read at most ~4.3 GB per call, but the code allocates buffers up to SIZE_MAX (~16 exabytes on 64-bit systems) without checking this limit. For files between 4 GB and SIZE_MAX, the cast static_cast<DWORD>(buffer.size()) truncates silently, bytes_read will be less than buffer.size(), and the check at line 81 will reject the file.

Add a guard to reject files exceeding MAXDWORD before allocation:

Proposed fix
-    if (!GetFileSizeEx(hFile, &file_size) || file_size.QuadPart > SIZE_MAX) {
+    if (!GetFileSizeEx(hFile, &file_size) || file_size.QuadPart < 0 ||
+        file_size.QuadPart > static_cast<LONGLONG>(SIZE_MAX) ||
+        file_size.QuadPart > static_cast<LONGLONG>(MAXDWORD)) {
       return {};
     }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@sherpa-onnx/csrc/file-utils.cc` around lines 73 - 81, The code currently
allocates buffer based on file_size.QuadPart and calls ReadFile with a DWORD
length which will overflow for files > MAXDWORD; add a guard after GetFileSizeEx
to check if file_size.QuadPart > MAXDWORD and return {} (or handle error) before
allocating the std::vector and before casting to DWORD, ensuring ReadFile is
only called with a safe static_cast<DWORD>(buffer.size()) and avoiding silent
truncation; reference GetFileSizeEx, file_size, buffer, ReadFile and MAXDWORD
when making this change.

return {};
}

Comment on lines +76 to +87
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the Windows path, this function casts buffer.size() to DWORD for the ::ReadFile() call. For files larger than 4 GiB the cast truncates the requested read size, so fewer bytes are read than the buffer holds and the function fails even though the size passed the SIZE_MAX check. Consider explicitly rejecting sizes greater than MAXDWORD, or reading the file in chunks in a loop.

Suggested change
DWORD bytes_read = 0;
if (!::ReadFile(hFile, buffer.data(), static_cast<DWORD>(buffer.size()), &bytes_read, nullptr) ||
bytes_read != buffer.size()) {
return {};
}
size_t total_read = 0;
while (total_read < buffer.size()) {
size_t remaining = buffer.size() - total_read;
DWORD to_read = remaining > static_cast<size_t>(MAXDWORD)
? MAXDWORD
: static_cast<DWORD>(remaining);
DWORD bytes_read = 0;
if (!::ReadFile(hFile, buffer.data() + total_read, to_read, &bytes_read, nullptr)) {
return {};
}
if (bytes_read == 0) {
// Unexpected end of file
return {};
}
total_read += bytes_read;
}

Copilot uses AI. Check for mistakes.
return buffer;
#else
std::ifstream file(filename, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
return {};
}

std::streamsize size = file.tellg();
file.seekg(0, std::ios::beg);
std::streamsize size = file.tellg();
file.seekg(0, std::ios::beg);

std::vector<char> buffer(size);
if (!file.read(buffer.data(), size)) {
std::vector<char> buffer(size);
if (!file.read(buffer.data(), size)) {
return {};
Comment on lines +95 to +103
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the non-Windows ReadFile path, tellg() can return -1 on failure; constructing std::vector<char> buffer(size) would then attempt a huge allocation due to signed-to-unsigned conversion. Check that size >= 0 (and that the stream is in a good state) before allocating.

Copilot uses AI. Check for mistakes.
}

return buffer;
#endif
} catch (const std::exception&) {
return {};
}

return buffer;
}

#if __ANDROID_API__ >= 9
Expand Down Expand Up @@ -119,33 +174,38 @@ std::string ResolveAbsolutePath(const std::string &path) {
return path;
}

try {
#ifdef _WIN32
// Check if path is already absolute (drive letter or UNC path)
if ((path.size() > 1 && path[1] == ':') ||
(path.size() > 1 && path[0] == '\\' && path[1] == '\\')) {
return path;
}

char buffer[MAX_PATH];
if (GetFullPathNameA(path.c_str(), MAX_PATH, buffer, nullptr)) {
return std::string(buffer);
}

return path; // fallback on failure

std::wstring wide_path = ToWideString(path);
DWORD required_size = GetFullPathNameW(wide_path.c_str(), 0, nullptr, nullptr);
if (required_size == 0) {
return path;
}

std::vector<wchar_t> buffer(required_size);
DWORD actual_size = GetFullPathNameW(
wide_path.c_str(),
required_size,
buffer.data(),
nullptr
);

if (actual_size == 0 || actual_size >= required_size) {
return path;
}

std::wstring resolved_wide(buffer.data(), actual_size);
return ToString(resolved_wide);
#else
// POSIX: absolute paths start with '/'
if (path[0] == '/') {
char resolved_path[PATH_MAX];
if (realpath(path.c_str(), resolved_path) == nullptr) {
return path;
}
return std::string(resolved_path);
#endif
Comment on lines +178 to +211
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description mentions refactoring ResolveAbsolutePath to use std::filesystem and lexically_normal(), but the implementation still uses GetFullPathNameW/realpath and does not do lexical normalization. Either update the PR description to match the actual approach, or implement the described std::filesystem-based normalization.

Copilot uses AI. Check for mistakes.
} catch (const std::exception&) {
return path;
}

char buffer[PATH_MAX];
if (realpath(path.c_str(), buffer)) {
return std::string(buffer);
}

return path; // fallback on failure
#endif
}

} // namespace sherpa_onnx