Skip to content

Commit 8cf087b

Browse files
Fix UnicodeDecodeError When Reading Job Log Files (#45)
1 parent 5e4cf16 commit 8cf087b

3 files changed

Lines changed: 47 additions & 8 deletions

File tree

src/swiss_ai_model_launch/launchers/firecrest_launcher.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
from swiss_ai_model_launch.launchers.launch_args import LaunchArgs
1010
from swiss_ai_model_launch.launchers.launch_request import LaunchRequest
1111
from swiss_ai_model_launch.launchers.launcher import JobStatus, Launcher
12-
from swiss_ai_model_launch.launchers.utils import create_salt, render_job_script
12+
from swiss_ai_model_launch.launchers.utils import (
13+
create_salt,
14+
decode_log,
15+
render_job_script,
16+
)
1317

1418
_REMOTE_MODEL_REGISTRY = Path("/capstor/store/cscs/swissai/infra01/hf_models/models/")
1519

@@ -211,8 +215,8 @@ async def get_job_logs(self, job_id: int) -> tuple[str, str]:
211215
account=self.account,
212216
blocking=True,
213217
)
214-
with open(target_dir_path / "log.out") as out_f:
215-
out_log = out_f.read()
218+
with open(target_dir_path / "log.out", "rb") as out_f:
219+
out_log = decode_log(out_f.read())
216220
except FileNotFoundError:
217221
out_log = ""
218222

@@ -224,8 +228,8 @@ async def get_job_logs(self, job_id: int) -> tuple[str, str]:
224228
account=self.account,
225229
blocking=True,
226230
)
227-
with open(target_dir_path / "log.err") as err_f:
228-
err_log = err_f.read()
231+
with open(target_dir_path / "log.err", "rb") as err_f:
232+
err_log = decode_log(err_f.read())
229233
except FileNotFoundError:
230234
err_log = ""
231235

src/swiss_ai_model_launch/launchers/slurm_launcher.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
from swiss_ai_model_launch.launchers.launch_args import LaunchArgs
99
from swiss_ai_model_launch.launchers.launch_request import LaunchRequest
1010
from swiss_ai_model_launch.launchers.launcher import JobStatus, Launcher
11-
from swiss_ai_model_launch.launchers.utils import create_salt, render_job_script
11+
from swiss_ai_model_launch.launchers.utils import (
12+
create_salt,
13+
decode_log,
14+
render_job_script,
15+
)
1216

1317
_REMOTE_MODEL_REGISTRY = Path("/capstor/store/cscs/swissai/infra01/hf_models/models/")
1418

@@ -198,12 +202,12 @@ async def get_job_logs(self, job_id: int) -> tuple[str, str]:
198202
log_dir = self._get_working_dir() / "logs" / str(job_id)
199203

200204
try:
201-
out_log = (log_dir / "log.out").read_text()
205+
out_log = decode_log((log_dir / "log.out").read_bytes())
202206
except FileNotFoundError:
203207
out_log = ""
204208

205209
try:
206-
err_log = (log_dir / "log.err").read_text()
210+
err_log = decode_log((log_dir / "log.err").read_bytes())
207211
except FileNotFoundError:
208212
err_log = ""
209213

src/swiss_ai_model_launch/launchers/utils.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,34 @@ def create_salt(length: int) -> str:
1616
def render_job_script(launch_args: LaunchArgs) -> str:
    """Render the job-script template with the fields of ``launch_args``.

    The template text is read from ``_TEMPLATE_PATH`` and rendered with the
    entries of ``launch_args.model_dump()`` passed as keyword arguments.

    Returns:
        The rendered template, coerced to ``str``.
    """
    template_source = _TEMPLATE_PATH.read_text()
    job_template = Template(template_source)
    template_fields = launch_args.model_dump()
    rendered = job_template.render(**template_fields)
    return str(rendered)
19+
20+
21+
def decode_log(data: bytes) -> str:
    """Decode log bytes to text, tolerating a truncated UTF-8 sequence at the tail.

    A log file may be read while its writer is mid-flush, so the final
    multi-byte UTF-8 character can be cut short. Such a truncated (but so far
    valid) trailing sequence is dropped; every other malformed byte, anywhere
    in the input, is decoded as U+FFFD (the replacement character). This
    function therefore never raises ``UnicodeDecodeError``.

    Args:
        data: Raw bytes read from a log file.

    Returns:
        The decoded text, without any partially written final character.
    """
    # Function-scope import keeps this block self-contained.
    import codecs

    # An incremental decoder called with the default ``final=False`` treats the
    # input as one chunk of a longer stream: it keeps a trailing incomplete
    # sequence buffered instead of reporting it as an error. Discarding the
    # decoder afterwards discards exactly that partial character. Tail bytes
    # that can never become valid (e.g. a lone 0xC0 or 0xF5) are not held back
    # and are surfaced as U+FFFD rather than being silently lost.
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    return decoder.decode(data)

0 commit comments

Comments
 (0)