Skip to content

Commit 7e1467e

Browse files
authored
handle Python concatenated strings in Rust dependency inference parser (#22050)
As reported in #20324, the Rust-based Python import parser does not handle certain valid source patterns including concatenated strings. Fix the Rust-based parser to handle concatenated strings correctly. Strings are parsed out of `Node`s via the new `extract_string` helper. The `insert_import` function is refactored to avoid double parsing of strings. Added some tests as well for concatenated strings.
1 parent 8a81c64 commit 7e1467e

File tree

3 files changed

+125
-43
lines changed

3 files changed

+125
-43
lines changed

docs/notes/2.26.x.md

+2
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,8 @@ The Python backend uses the PyPI `packaging` distribution to parse Python versio
112112

113113
The Python Build Standalone backend (`pants.backend.python.providers.experimental.python_build_standalone`) has release metadata current through PBS release `20250212`.
114114

115+
The Rust-based dependency inference logic now handles concatenated string literals properly and so imports such as `__import__("foo" ".bar")` will be properly handled.
116+
115117
#### Shell
116118

117119
The `experiemental_test_shell_command` target type is no longer experimental and is stabilized as the `test_shell_command` target type. As part of stablilizing it, `test_shell_command` learned several new features:

src/rust/engine/dep_inference/src/python/mod.rs

+86-36
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// Copyright 2023 Pants project contributors (see CONTRIBUTORS.md).
22
// Licensed under the Apache License, Version 2.0 (see LICENSE).
3+
use std::borrow::Cow;
34
use std::path::PathBuf;
45

56
use fnv::{FnvHashMap as HashMap, FnvHashSet as HashSet};
@@ -76,6 +77,12 @@ struct ImportCollector<'a> {
7677
weaken_imports: bool,
7778
}
7879

80+
/// Input to `insert_import` to specify the base of an import statement.
81+
enum BaseNode<'a> {
82+
Node(tree_sitter::Node<'a>),
83+
StringNode(String, tree_sitter::Node<'a>),
84+
}
85+
7986
impl ImportCollector<'_> {
8087
pub fn new(code: &'_ str) -> ImportCollector<'_> {
8188
ImportCollector {
@@ -102,11 +109,46 @@ impl ImportCollector<'_> {
102109
code::at_range(self.code, range)
103110
}
104111

105-
fn string_at(&self, range: tree_sitter::Range) -> &str {
106-
// https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
107-
self.code_at(range)
108-
.trim_start_matches(|c| "rRuUfFbB".contains(c))
109-
.trim_matches(|c| "'\"".contains(c))
112+
/// Extract an optional "import name" string from a string-related `tree_sitter::Node`. The string is
113+
/// expected to analyze statically as an import name and so `extract_string` ignores any strings
114+
/// containing escape sequences as well as any use of f-strings with interpolations.
115+
fn extract_string(&self, node: tree_sitter::Node) -> Option<String> {
116+
match node.kind_id() {
117+
KindID::STRING => {
118+
if node.child_count() != 3 {
119+
// String literals are expected to have exactly three children consisting of `string_start`, `string_content`,
120+
// and `string_end`. If there are more children, then it means that `interpolation` nodes are present, which
121+
// means that the string is an f-string and not an import name that may be analyzed statically.
122+
return None;
123+
}
124+
125+
let content_node = node.child(1)?;
126+
if content_node.kind_id() != KindID::STRING_CONTENT
127+
|| content_node.child_count() > 0
128+
{
129+
// The `string_content` node is expected to have no children if the string is a simple string literal.
130+
// If there are children, then it means that the string contains escape sequences and is not an import name
131+
// (either an `escape_sequence` or `escape_interpolation` node).
132+
return None;
133+
}
134+
135+
let content = code::at_range(self.code, content_node.range());
136+
Some(content.to_string())
137+
}
138+
KindID::CONCATENATED_STRING => {
139+
let substrings: Vec<Option<String>> = node
140+
.children(&mut node.walk())
141+
.filter(|n| n.kind_id() != KindID::COMMENT)
142+
.map(|n| self.extract_string(n))
143+
.collect();
144+
if substrings.iter().any(|s| s.is_none()) {
145+
return None;
146+
}
147+
let substrings: Vec<String> = substrings.into_iter().flatten().collect();
148+
Some(substrings.join(""))
149+
}
150+
_ => None,
151+
}
110152
}
111153

112154
fn is_pragma_ignored_at_row(&self, node: tree_sitter::Node, end_row: usize) -> bool {
@@ -170,36 +212,34 @@ impl ImportCollector<'_> {
170212
/// from $base import * # (the * node is passed as `specific` too)
171213
/// from $base import $specific
172214
/// ```
173-
fn insert_import(
174-
&mut self,
175-
base: tree_sitter::Node,
176-
specific: Option<tree_sitter::Node>,
177-
is_string: bool,
178-
) {
215+
fn insert_import(&mut self, base: BaseNode, specific: Option<tree_sitter::Node>) {
179216
// the specifically-imported item takes precedence over the base name for ignoring and lines
180217
// etc.
181-
let most_specific = specific.unwrap_or(base);
218+
let most_specific = match base {
219+
BaseNode::Node(n) => specific.unwrap_or(n),
220+
BaseNode::StringNode(_, n) => specific.unwrap_or(n),
221+
};
182222

183223
if self.is_pragma_ignored(most_specific) {
184224
return;
185225
}
186226

187-
let base = ImportCollector::unnest_alias(base);
227+
let base_ref = match base {
228+
BaseNode::Node(node) => {
229+
let node = Self::unnest_alias(node);
230+
Cow::Borrowed(self.code_at(node.range()))
231+
}
232+
BaseNode::StringNode(s, _) => Cow::Owned(s),
233+
};
234+
188235
// * and errors are the same as not having an specific import
189236
let specific = specific
190237
.map(ImportCollector::unnest_alias)
191238
.filter(|n| !matches!(n.kind_id(), KindID::WILDCARD_IMPORT | KindID::ERROR));
192239

193-
let base_range = base.range();
194-
let base_ref = if is_string {
195-
self.string_at(base_range)
196-
} else {
197-
self.code_at(base_range)
198-
};
199-
200240
let full_name = match specific {
201241
Some(specific) => {
202-
let specific_ref = self.code_at(specific.range());
242+
let specific_ref = Cow::Borrowed(self.code_at(specific.range()));
203243
// `from ... import a` => `...a` should concat base_ref and specific_ref directly, but `from
204244
// x import a` => `x.a` needs to insert a . between them
205245
let joiner = if base_ref.ends_with('.') { "" } else { "." };
@@ -215,13 +255,24 @@ impl ImportCollector<'_> {
215255
.and_modify(|v| *v = (v.0, v.1 && self.weaken_imports))
216256
.or_insert(((line0 as u64) + 1, self.weaken_imports));
217257
}
258+
259+
fn handle_string_candidate(&mut self, node: tree_sitter::Node) {
260+
if let Some(text) = self.extract_string(node) {
261+
if !text.contains(|c: char| c.is_ascii_whitespace() || c == '\\')
262+
&& !self.is_pragma_ignored_recursive(node)
263+
{
264+
self.string_candidates
265+
.insert(text, (node.range().start_point.row + 1) as u64);
266+
}
267+
}
268+
}
218269
}
219270

220271
// NB: https://tree-sitter.github.io/tree-sitter/playground is very helpful
221272
impl Visitor for ImportCollector<'_> {
222273
fn visit_import_statement(&mut self, node: tree_sitter::Node) -> ChildBehavior {
223274
if !self.is_pragma_ignored(node) {
224-
self.insert_import(node.named_child(0).unwrap(), None, false);
275+
self.insert_import(BaseNode::Node(node.named_child(0).unwrap()), None);
225276
}
226277
ChildBehavior::Ignore
227278
}
@@ -236,7 +287,7 @@ impl Visitor for ImportCollector<'_> {
236287

237288
let mut any_inserted = false;
238289
for child in node.children_by_field_name("name", &mut node.walk()) {
239-
self.insert_import(module_name, Some(child), false);
290+
self.insert_import(BaseNode::Node(module_name), Some(child));
240291
any_inserted = true;
241292
}
242293

@@ -246,7 +297,7 @@ impl Visitor for ImportCollector<'_> {
246297
// manually.)
247298
for child in node.children(&mut node.walk()) {
248299
if child.kind_id() == KindID::WILDCARD_IMPORT {
249-
self.insert_import(module_name, Some(child), false);
300+
self.insert_import(BaseNode::Node(module_name), Some(child));
250301
any_inserted = true
251302
}
252303
}
@@ -257,7 +308,7 @@ impl Visitor for ImportCollector<'_> {
257308
// understood the syntax tree! We're working on a definite import statement, so silently
258309
// doing nothing with it is likely to be wrong. Let's insert the import node itself and let
259310
// that be surfaced as an dep-inference failure.
260-
self.insert_import(node, None, false)
311+
self.insert_import(BaseNode::Node(node), None)
261312
}
262313
}
263314
ChildBehavior::Ignore
@@ -340,25 +391,24 @@ impl Visitor for ImportCollector<'_> {
340391

341392
let args = node.named_child(1).unwrap();
342393
if let Some(arg) = args.named_child(0) {
343-
if arg.kind_id() == KindID::STRING {
344-
// NB: Call nodes are children of expression nodes. The comment is a sibling of the expression.
394+
if let Some(content) = self.extract_string(arg) {
345395
if !self.is_pragma_ignored(node.parent().unwrap()) {
346-
self.insert_import(arg, None, true);
396+
self.insert_import(BaseNode::StringNode(content, arg), None);
347397
}
348398
}
349399
}
400+
401+
// Do not descend below the `__import__` call statement.
402+
ChildBehavior::Ignore
403+
}
404+
405+
fn visit_concatenated_string(&mut self, node: tree_sitter::Node) -> ChildBehavior {
406+
self.handle_string_candidate(node);
350407
ChildBehavior::Ignore
351408
}
352409

353410
fn visit_string(&mut self, node: tree_sitter::Node) -> ChildBehavior {
354-
let range = node.range();
355-
let text: &str = self.string_at(range);
356-
if !text.contains(|c: char| c.is_ascii_whitespace() || c == '\\')
357-
&& !self.is_pragma_ignored_recursive(node)
358-
{
359-
self.string_candidates
360-
.insert(text.to_string(), (range.start_point.row + 1) as u64);
361-
}
411+
self.handle_string_candidate(node);
362412
ChildBehavior::Ignore
363413
}
364414
}

src/rust/engine/dep_inference/src/python/tests.rs

+37-7
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,24 @@ fn dunder_import() {
279279
) # pants: no-infer-dep",
280280
&[],
281281
);
282+
283+
// Concatenated strings are accepted.
284+
assert_imports(
285+
"
286+
__import__(
287+
'foo' # Intervening comment will be ignored.
288+
\".bar\"
289+
\"\"\".xyzzy\"\"\"
290+
'''.funky'''
291+
)
292+
",
293+
&["foo.bar.xyzzy.funky"],
294+
);
295+
296+
// f-strings with interpolations are ignored
297+
assert_imports("__import__(f'foo.{name}')", &[]);
298+
assert_imports("__import__(f'foo.{{name}}')", &[]);
299+
assert_imports("__import__(f'foo.bar\n')", &[]);
282300
}
283301

284302
fn assert_imports_strong_weak(code: &str, strong: &[&str], weak: &[&str]) {
@@ -678,6 +696,10 @@ fn string_candidates() {
678696
assert_strings("'''\na'''", &[]);
679697
assert_strings("'''a\n'''", &[]);
680698

699+
// Concatenated strings
700+
assert_strings("'foo' \".bar\"", &["foo.bar"]);
701+
assert_strings("'''foo''' \"\"\".bar\"\"\"", &["foo.bar"]);
702+
681703
// Technically the value of the string doesn't contain whitespace, but the parser isn't that
682704
// sophisticated yet.
683705
assert_strings("'''\\\na'''", &[]);
@@ -707,6 +729,8 @@ fn string_candidates() {
707729
",
708730
&[],
709731
);
732+
assert_strings("'foo' \".bar\" # pants: no-infer-dep", &[]);
733+
assert_strings("'''foo''' \"\"\".bar\"\"\" # pants: no-infer-dep", &[]);
710734
}
711735

712736
#[test]
@@ -722,6 +746,11 @@ fn python2() {
722746
__import__(b'treat.as.a.regular.import.not.a.string.import')
723747
__import__(u'{}'.format('interpolation'))
724748
749+
__import__(
750+
'foo' # Intervening comment will be ignored.
751+
'.bar'
752+
)
753+
725754
importlib.import_module(b'dep.from.bytes')
726755
importlib.import_module(u'dep.from.str')
727756
importlib.import_module(u'dep.from.str_狗')
@@ -737,15 +766,16 @@ fn python2() {
737766
("project.demo.Demo", (5, false)),
738767
("pkg_resources", (7, false)),
739768
("treat.as.a.regular.import.not.a.string.import", (8, false)),
740-
("weak1", (17, true)),
741-
("strong1", (18, false)),
742-
("strong2", (19, false)),
743-
("strong3", (20, false)),
769+
("weak1", (22, true)),
770+
("strong1", (23, false)),
771+
("strong2", (24, false)),
772+
("strong3", (25, false)),
773+
("foo.bar", (12, false)),
744774
]),
745775
HashMap::from([
746-
("dep.from.bytes", 11),
747-
("dep.from.str", 12),
748-
("dep.from.str_狗", 13),
776+
("dep.from.bytes", 16),
777+
("dep.from.str", 17),
778+
("dep.from.str_狗", 18),
749779
]),
750780
);
751781
}

0 commit comments

Comments
 (0)