Skip to content

Commit 7b7be97

Browse files
authored
fix: preserve unicode characters in uploaded filenames (#2737)
* fix
* add comments and switch to NFC
* doc: add copilot suggested docstring
* copilot: fix issue
* fix: backslash error by using raw docstring
1 parent e65e96f commit 7b7be97

File tree

4 files changed

+199
-4
lines changed

4 files changed

+199
-4
lines changed

taipy/gui/gui.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232

3333
import markdown as md_lib
3434
import tzlocal
35-
from werkzeug.utils import secure_filename
3635

3736
import __main__ # noqa: F401
3837
from taipy.common import _module_exists
@@ -89,6 +88,7 @@
8988
_LocalsContext,
9089
_MapDict,
9190
_patch_value,
91+
_secure_filename_unicode,
9292
_setscopeattr,
9393
_setscopeattr_drill,
9494
_TaipyBase,
@@ -1122,7 +1122,7 @@ def _upload_files(self):
11221122
upload_path = Path(upload_path).resolve()
11231123
os.makedirs(upload_path, exist_ok=True)
11241124
# Save file into upload_path directory
1125-
file_path = _get_non_existent_file_path(upload_path, secure_filename(file.filename))
1125+
file_path = _get_non_existent_file_path(upload_path, _secure_filename_unicode(file.filename))
11261126
self._server.save_uploaded_file(file, os.path.join(upload_path, (file_path.name + suffix)))
11271127
else:
11281128
_warn(f"upload files: Path {path} points outside of upload root.")

taipy/gui/utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from .datatype import _get_data_type
3030
from .date import _date_to_string, _string_to_date
3131
from .expr_var_name import _get_expr_var_name
32-
from .filename import _get_non_existent_file_path
32+
from .filename import _get_non_existent_file_path, _secure_filename_unicode
3333
from .filter_locals import _filter_locals
3434
from .get_imported_var import _get_imported_var
3535
from .get_module_name import _get_module_name_from_frame, _get_module_name_from_imported_var

taipy/gui/utils/filename.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,36 @@
99
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
1010
# specific language governing permissions and limitations under the License.
1111

12+
import os
13+
import re
14+
import unicodedata
1215
from pathlib import Path
1316

17+
_WINDOWS_DEVICE_FILES = {
18+
"CON",
19+
"PRN",
20+
"AUX",
21+
"NUL",
22+
"COM1",
23+
"COM2",
24+
"COM3",
25+
"COM4",
26+
"COM5",
27+
"COM6",
28+
"COM7",
29+
"COM8",
30+
"COM9",
31+
"LPT1",
32+
"LPT2",
33+
"LPT3",
34+
"LPT4",
35+
"LPT5",
36+
"LPT6",
37+
"LPT7",
38+
"LPT8",
39+
"LPT9",
40+
}
41+
1442

1543
def _get_non_existent_file_path(dir_path: Path, file_name: str) -> Path:
1644
if not file_name:
@@ -23,3 +51,50 @@ def _get_non_existent_file_path(dir_path: Path, file_name: str) -> Path:
2351
file_path = dir_path / f"{file_stem}.{index}{file_suffix}"
2452
index += 1
2553
return file_path
54+
55+
56+
def _secure_filename_unicode(filename: str) -> str:
57+
r"""
58+
Sanitizes a filename for safe filesystem use while preserving Unicode characters.
59+
60+
This function removes or replaces characters that are invalid or unsafe for filenames,
61+
normalizes Unicode characters, replaces path separators with spaces, and ensures
62+
compatibility with Windows device filenames.
63+
64+
Parameters:
65+
filename (str): The original filename to sanitize.
66+
67+
Returns:
68+
str: The sanitized filename, safe for use on most filesystems. Returns an empty string
69+
if the input is invalid or results in an empty filename after sanitization.
70+
71+
Important:
72+
- Preserves Unicode characters using NFC normalization.
73+
- Removes invalid characters (e.g., < > : " / \ | ? * and control characters).
74+
- Replaces path separators with spaces.
75+
- Converts whitespace to underscores and collapses multiple underscores.
76+
- Strips leading/trailing dots and underscores.
77+
- On Windows, prepends an underscore if the filename matches a reserved device name.
78+
"""
79+
filename = unicodedata.normalize("NFC", filename)
80+
81+
for sep in os.sep, os.path.altsep:
82+
if sep:
83+
filename = filename.replace(sep, " ")
84+
85+
filename = re.sub(r'[<>:"/\\|?*\x00-\x1f\x7f]', "", filename)
86+
87+
# Replace all whitespace (including newlines, tabs) with spaces
88+
filename = re.sub(r"\s+", " ", filename)
89+
90+
# Convert spaces to underscores and clean up multiple underscores
91+
filename = re.sub(r"_+", "_", filename.replace(" ", "_"))
92+
93+
# Remove leading/trailing dots and underscores
94+
filename = filename.strip("._")
95+
96+
# Windows device file check
97+
if os.name == "nt" and filename and filename.split(".")[0].upper() in _WINDOWS_DEVICE_FILES:
98+
filename = f"_{filename}"
99+
100+
return filename

tests/gui/utils/test_filename.py

Lines changed: 121 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,14 @@
99
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
1010
# specific language governing permissions and limitations under the License.
1111

12+
import os
1213
import pathlib
1314
import tempfile
1415

16+
import pytest
17+
1518
from taipy.gui import Gui
16-
from taipy.gui.utils import _get_non_existent_file_path
19+
from taipy.gui.utils import _get_non_existent_file_path, _secure_filename_unicode
1720

1821

1922
def test_empty_file_name(gui: Gui, helpers):
@@ -39,3 +42,120 @@ def test_existent_file(gui: Gui, helpers):
3942
assert file_path.exists()
4043
file_path = _get_non_existent_file_path(pathlib.Path(tempfile.gettempdir()), "")
4144
assert file_path.name == f"{file_stem}.{index + 2}{file_suffix}"
45+
46+
47+
@pytest.mark.parametrize(
    "input_filename,expected_output",
    [
        ("normal_file.txt", "normal_file.txt"),
        ("file with spaces.txt", "file_with_spaces.txt"),
        ("file.with.dots.txt", "file.with.dots.txt"),
    ],
)
def test_secure_filename_unicode_valid_cases(gui: Gui, helpers, input_filename, expected_output):
    """Test ordinary filenames: they are kept as is, except that spaces become underscores"""
    assert _secure_filename_unicode(input_filename) == expected_output
58+
59+
60+
@pytest.mark.parametrize(
    "input_filename,expected_output",
    [
        ('file<>:"/\\|?*.txt', "file_.txt"),
        ("file<s>.txt", "files.txt"),
        ("file:name.txt", "filename.txt"),
        ('file"name".txt', "filename.txt"),
        ("file|name.txt", "filename.txt"),
        ("file\x00\x1f\x7f.txt", "file.txt"),
    ],
)
def test_secure_filename_unicode_special_chars(gui: Gui, helpers, input_filename, expected_output):
    """Check that forbidden punctuation and control characters are stripped from the name"""
    sanitized = _secure_filename_unicode(input_filename)
    assert sanitized == expected_output
74+
75+
76+
@pytest.mark.parametrize(
    "input_filename,expected_output",
    [
        ("café.txt", "café.txt"),
        ("naïve.txt", "naïve.txt"),
        ("résumé.txt", "résumé.txt"),
        ("测试文件.txt", "测试文件.txt"),
        ("файл.txt", "файл.txt"),
        ("café", "café"),
        ("poup\u00e9e", "poupée"),
        ("cafe\u0301", "café"),
    ],
)
def test_secure_filename_unicode_preservation(gui: Gui, helpers, input_filename, expected_output):
    """Check that non-ASCII characters are kept and that the result is normalized"""
    sanitized = _secure_filename_unicode(input_filename)
    assert sanitized == expected_output
92+
93+
94+
@pytest.mark.parametrize(
    "input_filename,expected_output",
    [
        (" file .txt ", "file_.txt"),
        ("file with spaces.txt", "file_with_spaces.txt"),
        ("file\nwith\ttabs.txt", "filewithtabs.txt"),
        ("..file.txt", "file.txt"),
        ("file..txt", "file..txt"),
        ("__file__.txt", "file_.txt"),
        (".file_.txt", "file_.txt"),
    ],
)
def test_secure_filename_unicode_whitespace_cleanup(gui: Gui, helpers, input_filename, expected_output):
    """Check whitespace conversion and the trimming of leading/trailing dots and underscores"""
    sanitized = _secure_filename_unicode(input_filename)
    assert sanitized == expected_output
109+
110+
111+
@pytest.mark.parametrize(
    "input_filename,expected_output",
    [
        ("", ""),
        (" ", ""),
        ("...", ""),
        ("___", ""),
        ('<>:"/\\|?*', ""),
    ],
)
def test_secure_filename_unicode_empty_cases(gui: Gui, helpers, input_filename, expected_output):
    """Check inputs for which nothing is left once the name has been sanitized"""
    sanitized = _secure_filename_unicode(input_filename)
    assert sanitized == expected_output
124+
125+
126+
@pytest.mark.parametrize(
    "input_filename,expected_output",
    [
        ("My Document (Final Version).pdf", "My_Document_(Final_Version).pdf"),
        ("Report 2024: Q1 Results.xlsx", "Report_2024_Q1_Results.xlsx"),
        ('User\'s File: "Important".docx', "User's_File_Important.docx"),
        ("café & naïve résumé.pdf", "café_&_naïve_résumé.pdf"),
    ],
)
def test_secure_filename_unicode_real_world_cases(gui: Gui, helpers, input_filename, expected_output):
    """Check plausible user-supplied names that combine several sanitization rules"""
    sanitized = _secure_filename_unicode(input_filename)
    assert sanitized == expected_output
138+
139+
140+
@pytest.mark.parametrize(
    "input_filename,expected_output_windows,expected_output_non_windows",
    [
        ("CON.txt", "_CON.txt", "CON.txt"),
        ("PRN.txt", "_PRN.txt", "PRN.txt"),
        ("AUX.txt", "_AUX.txt", "AUX.txt"),
        ("NUL.txt", "_NUL.txt", "NUL.txt"),
        ("COM1.txt", "_COM1.txt", "COM1.txt"),
        ("LPT1.txt", "_LPT1.txt", "LPT1.txt"),
        ("con.txt", "_con.txt", "con.txt"),
        ("Con.Txt", "_Con.Txt", "Con.Txt"),
    ],
)
def test_secure_filename_unicode_windows_device_files(
    gui: Gui, helpers, input_filename, expected_output_windows, expected_output_non_windows
):
    """Check reserved device names: prefixed with an underscore on Windows, untouched elsewhere"""
    # The expectation depends on the platform the test suite runs on.
    expected = expected_output_windows if os.name == "nt" else expected_output_non_windows
    assert _secure_filename_unicode(input_filename) == expected

0 commit comments

Comments
 (0)