Skip to content

Commit abc998f

Browse files
Encoding: fix b/268011976
1 parent 3614e37 commit abc998f

File tree

12 files changed

+1013
-780
lines changed

12 files changed

+1013
-780
lines changed

client/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ More advanced macro replacement schemes are possible. Please see the
298298
an example.
299299

300300
### Extraction of Heredoc SQL Statements from KSH Inputs
301+
301302
By default, during the preprocessing phase, any input paths ending in
302303
extension `.ksh` will be scanned for heredoc SQL statements. For example,
303304
given the following file named `foo.ksh`:
@@ -312,6 +313,7 @@ echo Trying another select.
312313
```
313314

314315
will be preprocessed to:
316+
315317
```sql
316318
SELECT 123, 'foo', 456 from bar;
317319
```

client/bqms_run/encoding.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copyright 2022 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Encoding utilities."""
15+
16+
import logging
17+
18+
import icu
19+
20+
logger = logging.getLogger(__name__)
21+
22+
23+
class EncodingDetector:
    """
    An encoding detector backed by ICU's charset detection.

    Detection is statistical, so every step falls back to UTF-8 when ICU
    cannot produce a usable answer.
    """

    # Encoding assumed whenever detection yields nothing usable.
    _FALLBACK_ENCODING = "utf-8"

    def detect(self, data: bytes) -> str:
        """
        Detect the encoding of the provided bytes, return the encoding name.

        :param data: the raw bytes to inspect.
        :return: the charset name reported by ICU, or ``"utf-8"`` when ICU
            reports no match or a non-string name.
        """
        match = icu.CharsetDetector(data).detect()
        # ICU may report no match at all (e.g. for empty input); guard the
        # attribute access instead of failing with AttributeError.
        if match is None:
            return self._FALLBACK_ENCODING
        encoding = match.getName()
        if not isinstance(encoding, str):
            return self._FALLBACK_ENCODING
        return encoding

    def decode(self, data: bytes) -> str:
        """
        Detect the encoding of the provided bytes, then decode them to string.

        :param data: the raw bytes to decode.
        :return: the decoded text.
        :raises UnicodeDecodeError: if the bytes can be decoded neither with
            the detected encoding nor with UTF-8.
        """
        encoding = self.detect(data)
        logger.debug("Detected encoding: %s", encoding)
        try:
            return data.decode(encoding)
        except (LookupError, UnicodeDecodeError):
            # LookupError: ICU can report charset names that Python has no
            # codec for. UnicodeDecodeError: the statistical guess was wrong.
            # In both cases retry with UTF-8 rather than failing outright.
            logger.debug(
                "Could not decode as %s, falling back to %s",
                encoding,
                self._FALLBACK_ENCODING,
            )
            return data.decode(self._FALLBACK_ENCODING)

client/bqms_run/hooks.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
)
2525
from bqms_run.paths import Path
2626

27-
2827
logger = logging.getLogger(__name__)
2928

3029

client/bqms_run/ksh.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from enum import Enum
1919
from typing import List, Optional
2020

21-
2221
logger = logging.getLogger(__name__)
2322

2423

@@ -34,10 +33,7 @@ class ShellFragment:
3433
"""
3534

3635
def __init__(
37-
self, text: str,
38-
line: int,
39-
fragment_type: ShellFragmentType,
40-
quoted: bool
36+
self, text: str, line: int, fragment_type: ShellFragmentType, quoted: bool
4137
) -> None:
4238
super().__init__()
4339
self.text = text
@@ -91,15 +87,19 @@ def __init__(
9187
:param command_replace_to: the suffix of the heredoc command
9288
"""
9389
super().__init__()
94-
self.pattern = self._check(re.compile(
95-
KshExtractor.to_pattern_for_command(command)))
90+
self.pattern = self._check(
91+
re.compile(KshExtractor.to_pattern_for_command(command))
92+
)
9693
self.command_replace_from = command_replace_from
9794
self.command_replace_to = command_replace_to
9895

9996
@staticmethod
10097
def to_pattern_for_command(command: str) -> str:
101-
return KshExtractor.PATTERN_DEFAULT_PREFIX + command \
102-
+ KshExtractor.PATTERN_DEFAULT_SUFFIX
98+
return (
99+
KshExtractor.PATTERN_DEFAULT_PREFIX
100+
+ command
101+
+ KshExtractor.PATTERN_DEFAULT_SUFFIX
102+
)
103103

104104
@staticmethod
105105
def _check(pattern: re.Pattern) -> re.Pattern: # type: ignore[type-arg]
@@ -126,9 +126,10 @@ def filter_heredoc_sql_texts(fragments: List[ShellFragment]) -> List[str]:
126126
return list(
127127
map(
128128
lambda f: f.get_text(),
129-
filter(lambda f:
130-
f.get_fragment_type() == ShellFragmentType.HEREDOC,
131-
fragments),
129+
filter(
130+
lambda f: f.get_fragment_type() == ShellFragmentType.HEREDOC,
131+
fragments,
132+
),
132133
)
133134
)
134135

client/bqms_run/main.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from marshmallow import ValidationError
3030

3131
from bqms_run import workflow
32+
from bqms_run.encoding import EncodingDetector
3233
from bqms_run.gcp.bqms.object_name_mapping import ObjectNameMappingListSchema
3334
from bqms_run.gcp.bqms.request import build as build_bqms_request
3435
from bqms_run.gcp.bqms.source_env import SourceEnvSchema
@@ -78,8 +79,8 @@ def _parse_paths() -> Paths:
7879

7980
def _read_config(config_path: Path) -> Dict[str, object]:
8081
logger.debug("Parsing config: %s.", config_path.as_uri())
81-
with config_path.open(mode="r", encoding="utf-8") as config_file:
82-
config_text = config_file.read()
82+
with config_path.open(mode="rb") as config_file:
83+
config_text = EncodingDetector().decode(config_file.read())
8384
config: Dict[str, object] = yaml.load(config_text, Loader=yaml.SafeLoader)
8485
return config
8586

@@ -147,10 +148,8 @@ def _parse_object_name_mapping(
147148
object_name_mapping_path: Path,
148149
) -> ObjectNameMappingList:
149150
logger.debug("Parsing object name mapping: %s.", object_name_mapping_path.as_uri())
150-
with object_name_mapping_path.open(
151-
mode="r", encoding="utf-8"
152-
) as macro_mapping_file:
153-
object_name_mapping_text = macro_mapping_file.read()
151+
with object_name_mapping_path.open(mode="rb") as macro_mapping_file:
152+
object_name_mapping_text = EncodingDetector().decode(macro_mapping_file.read())
154153
object_name_mapping = json.loads(object_name_mapping_text)
155154
try:
156155
object_name_mapping_list = ObjectNameMappingListSchema.from_mapping(
@@ -172,8 +171,8 @@ def _parse_object_name_mapping(
172171

173172
def _parse_macro_mapping(macro_mapping_path: Path) -> MacroExpanderRouter:
174173
logger.debug("Parsing macro mapping: %s.", macro_mapping_path.as_uri())
175-
with macro_mapping_path.open(mode="r", encoding="utf-8") as macro_mapping_file:
176-
macro_mapping_text = macro_mapping_file.read()
174+
with macro_mapping_path.open(mode="rb") as macro_mapping_file:
175+
macro_mapping_text = EncodingDetector().decode(macro_mapping_file.read())
177176
macro_mapping = yaml.load(macro_mapping_text, Loader=yaml.SafeLoader)
178177
try:
179178
validated_macro_mapping = MacroMapping.from_mapping(macro_mapping)

client/bqms_run/workflow.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from google.cloud.bigquery_migration_v2 import CreateMigrationWorkflowRequest
2121
from typing_extensions import ParamSpec
2222

23+
from bqms_run.encoding import EncodingDetector
2324
from bqms_run.gcp.bqms.request import execute as execute_bqms_request
2425
from bqms_run.macros import MacroExpanderRouter
2526
from bqms_run.paths import Path, Paths, iterdirfiles
@@ -124,8 +125,8 @@ def _preprocess(source_file_path: Path) -> None:
124125
target_file.write(source_bytes)
125126
return
126127

127-
with source_file_path.open(mode="r", encoding="utf-8") as source_file:
128-
source_text = source_file.read()
128+
with source_file_path.open(mode="rb") as source_file:
129+
source_text = EncodingDetector().decode(data=source_file.read())
129130

130131
preprocessed_text = preprocess_hook(relative_file_path, source_text)
131132
macro_expanded_text = (
@@ -147,8 +148,8 @@ def _postprocess(source_file_path: Path) -> None:
147148
target_file_path = paths.postprocessed_path / relative_file_path
148149
target_file_path.parent.mkdir(parents=True, exist_ok=True)
149150

150-
with source_file_path.open(mode="r", encoding="utf-8") as source_file:
151-
source_text = source_file.read()
151+
with source_file_path.open(mode="rb") as source_file:
152+
source_text = EncodingDetector().decode(data=source_file.read())
152153

153154
macro_unexpanded_text = (
154155
macro_expander_router.un_expand(relative_file_path, source_text)

0 commit comments

Comments
 (0)