Skip to content

Commit 038c537

Browse files
yoffCopilot
andcommitted
Python: decouple CFG uniqueness from dataflow layer
Separate concerns between CFG facade (Cfg.qll) and the dataflow layer: - Cfg.qll now exposes the full shared-CFG node set. ControlFlowNode includes all TBeforeNode/TAstNode/TAfterNode variants. toAst is a pure many-to-one translation with no uniqueness filtering. - Dataflow uniqueness is enforced via injects() guards in isExpressionNode and all TNode/TDataFlowCall constructors, ensuring exactly one dataflow node per AST expression. - Guard logic (outcomeOfGuard) restricts to injects representatives to avoid duplicate guard matches. - SSA import-star writes guarded by injects to prevent duplicate uncertain definitions. - Removes isCanonicalAstNodeRepresentative (dead after decoupling). - Fixes pre-existing IfExp dataflow regression: the TBeforeNode representative correctly participates in expressionFlowStep. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 709566e commit 038c537

27 files changed

Lines changed: 445 additions & 619 deletions

File tree

python/ql/lib/semmle/python/controlflow/internal/Cfg.qll

Lines changed: 31 additions & 171 deletions
Original file line numberDiff line numberDiff line change
@@ -42,136 +42,27 @@ module CfgSigImpl implements BB::CfgSig<Py::Location> {
4242
/**
4343
* Gets the Python AST node corresponding to CFG node `n`, if any.
4444
*
45-
* Entry/exit/synthetic CFG nodes have no Python AST node, so this is
46-
* partial.
45+
* Multiple CFG nodes may map to the same AST node (e.g. `TBeforeNode(Call)`
46+
* and `TAstNode(Call)` both map to `Py::Call`). This is a pure translation;
47+
* uniqueness constraints are enforced at the dataflow layer where needed.
4748
*/
4849
private Py::AstNode toAst(CfgImpl::ControlFlowNode n) {
4950
result = CfgImpl::astNodeToPyNode(n.getAstNode())
5051
}
5152

5253
/**
53-
* Holds if `n` is a CFG node representing the canonical position for an
54-
* AST node from the dataflow library's perspective.
54+
* A control flow node.
5555
*
56-
* For most expressions this is the "after"-evaluation point (post-order
57-
* representative). For statements it is the post-order node when one
58-
* exists. We additionally include the synthetic entry/exit nodes for the
59-
* benefit of API consumers that ask "is this the entry node of a scope?".
56+
* This is the full set of CFG nodes from the shared library — it includes
57+
* before-nodes, in-order/post-order nodes, after-value-split nodes, and
58+
* entry/exit nodes. This enables full control-flow-level reasoning and
59+
* compatibility with the shared control-flow reachability library.
6060
*
61-
* In conditional contexts the after-position of a boolean expression
62-
* splits into separate `isAfterTrue` and `isAfterFalse` nodes; both are
63-
* canonical, so a single AST expression may correspond to more than one
64-
* `ControlFlowNode`.
65-
*/
66-
private predicate isCanonical(CfgImpl::ControlFlowNode n) {
67-
n.isAfter(_)
68-
or
69-
n instanceof CfgImpl::ControlFlow::EntryNode
70-
or
71-
n instanceof CfgImpl::ControlFlow::ExitNode
72-
or
73-
// Annotated exit nodes (normal + abnormal) — needed so that dataflow
74-
// consumers can ask "is this the normal-exit of a scope?" and also
75-
// so that scope-exit synthetic uses in SsaImpl can attach here.
76-
n instanceof CfgImpl::ControlFlow::AnnotatedExitNode
77-
}
78-
79-
/**
80-
* Holds if `n` is genuinely the `TAfterValueNode` variant for a boolean-true
81-
* outcome of its AST node.
82-
*
83-
* The shared CFG's `isAfterValue` predicate has a kind-mismatch fallback
84-
* (see `ControlFlowGraph.qll`'s `isAfterValue` lines 870-892): when asking
85-
* `isAfterValue(_, BooleanSuccessor true)` on, say, an emptiness-empty
86-
* variant, it falsely returns `true` because the kinds differ and Python
87-
* does not provide `successorValueImplies`. The same fallback also makes
88-
* `TAfterNode` (the unsplit case) satisfy `isAfterValue(_, t)` for *every*
89-
* `t`.
90-
*
91-
* The combination `isAfterTrue ∧ ¬isAfterFalse` excludes both: a genuine
92-
* boolean-true variant satisfies `isAfterTrue` directly (newtype branch 3)
93-
* but not `isAfterFalse` (the dual variant, same kind, no fallback). A
94-
* non-boolean variant satisfies both via the cross-kind fallback. A
95-
* `TAfterNode` satisfies both via branch 1. So `isAfterTrue ∧ ¬isAfterFalse`
96-
* picks exactly the genuine boolean-true variant.
97-
*/
98-
private predicate isGenuineAfterTrue(ControlFlowNode n) {
99-
n.isAfterTrue(_) and not n.isAfterFalse(_)
100-
}
101-
102-
/**
103-
* Holds if `n` is genuinely the `TAfterValueNode` variant for the "empty"
104-
* outcome of its AST node (e.g. `for x in xs: ...` when `xs` is empty).
105-
*
106-
* See `isGenuineAfterTrue` for why we cannot just use a single
107-
* `isAfterValue` check.
108-
*/
109-
private predicate isGenuineAfterEmpty(ControlFlowNode n) {
110-
exists(EmptinessSuccessor empty |
111-
empty.getValue() = true and n.isAfterValue(n.getAstNode(), empty)
112-
) and
113-
not exists(EmptinessSuccessor nonEmpty |
114-
nonEmpty.getValue() = false and n.isAfterValue(n.getAstNode(), nonEmpty)
115-
)
116-
}
117-
118-
/**
119-
* Holds if `n` is genuinely the `TAfterValueNode` variant for the "matched"
120-
* outcome of its AST node (e.g. a `match` case-pattern that matched).
121-
*
122-
* See `isGenuineAfterTrue` for why we cannot just use a single
123-
* `isAfterValue` check.
124-
*/
125-
private predicate isGenuineAfterMatched(ControlFlowNode n) {
126-
exists(MatchingSuccessor matched |
127-
matched.getValue() = true and n.isAfterValue(n.getAstNode(), matched)
128-
) and
129-
not exists(MatchingSuccessor unmatched |
130-
unmatched.getValue() = false and n.isAfterValue(n.getAstNode(), unmatched)
131-
)
132-
}
133-
134-
/**
135-
* Holds if `n` is the canonical representative of its corresponding AST node
136-
* for dataflow purposes.
137-
*
138-
* The shared CFG associates a single AST node with multiple `ControlFlowNode`s
139-
* when the AST appears in a conditional context (boolean conditions split into
140-
* `afterTrue`/`afterFalse`; for-loop iters split into `[empty]`/`[non-empty]`;
141-
* `match`-case patterns split into `[matched]`/`[unmatched]`). These splits
142-
* matter for control-flow analysis, but for dataflow purposes — where we
143-
* ask "what is the value of this expression?" — a single representative
144-
* suffices and is required to avoid double-counting calls, arguments, store
145-
* steps, etc.
146-
*
147-
* The pick is structural: when an AST has a single `ControlFlowNode` (the
148-
* normal `TAfterNode` or `TBeforeNode`-leaf case), that node is canonical.
149-
* When an AST has a conditional split, the "positive" outcome variant
150-
* (true / empty / matched) is canonical. The three split kinds are mutually
151-
* exclusive per AST, so exactly one variant is selected.
152-
*/
153-
predicate isCanonicalAstNodeRepresentative(ControlFlowNode n) {
154-
// Non-split AST: the unique variant is canonical.
155-
not exists(ControlFlowNode other | other.getNode() = n.getNode() and other != n)
156-
or
157-
// Split AST: pick the "positive" outcome of the split.
158-
isGenuineAfterTrue(n)
159-
or
160-
isGenuineAfterEmpty(n)
161-
or
162-
isGenuineAfterMatched(n)
163-
}
164-
165-
/**
166-
* A control flow node. Control flow nodes have a many-to-one relation
167-
* with syntactic nodes, although most syntactic nodes have only one
168-
* corresponding control flow node.
169-
*
170-
* Edges between control flow nodes include exceptional as well as normal
171-
* control flow.
61+
* AST-level semantics (`getNode()`, `isLoad()`, typed wrappers, etc.)
62+
* are available only on the `injects` (canonical) node for each AST node.
63+
* Non-injects nodes are purely positional CFG nodes with no AST mapping.
17264
*/
17365
class ControlFlowNode extends CfgImpl::ControlFlowNode {
174-
ControlFlowNode() { isCanonical(this) }
17566

17667
/** Gets the syntactic element corresponding to this flow node, if any. */
17768
Py::AstNode getNode() { result = toAst(this) }
@@ -180,36 +71,23 @@ class ControlFlowNode extends CfgImpl::ControlFlowNode {
18071
ControlFlowNode getAPredecessor() { this = result.getASuccessor() }
18172

18273
/** Gets a successor of this flow node. */
183-
pragma[inline]
184-
ControlFlowNode getASuccessor() { result = nextCanonical(this) }
74+
ControlFlowNode getASuccessor() { result = super.getASuccessor() }
18575

18676
/** Gets a successor for this node if the relevant condition is True. */
18777
ControlFlowNode getATrueSuccessor() {
188-
super.isAfterTrue(_) and
189-
exists(CfgImpl::ControlFlowNode other | other.isAfterFalse(super.getAstNode())) and
190-
result = nextCanonical(this)
78+
result = super.getASuccessor(any(BooleanSuccessor t | t.getValue() = true))
19179
}
19280

19381
/** Gets a successor for this node if the relevant condition is False. */
19482
ControlFlowNode getAFalseSuccessor() {
195-
super.isAfterFalse(_) and
196-
exists(CfgImpl::ControlFlowNode other | other.isAfterTrue(super.getAstNode())) and
197-
result = nextCanonical(this)
83+
result = super.getASuccessor(any(BooleanSuccessor t | t.getValue() = false))
19884
}
19985

20086
/** Gets a successor for this node if an exception is raised. */
201-
ControlFlowNode getAnExceptionalSuccessor() {
202-
exists(CfgImpl::ControlFlowNode mid |
203-
mid = super.getAnExceptionSuccessor() and
204-
result = nextCanonicalFrom(mid)
205-
)
206-
}
87+
ControlFlowNode getAnExceptionalSuccessor() { result = super.getAnExceptionSuccessor() }
20788

20889
/** Gets a successor for this node if no exception is raised. */
209-
ControlFlowNode getANormalSuccessor() {
210-
result = this.getASuccessor() and
211-
not result = this.getAnExceptionalSuccessor()
212-
}
90+
ControlFlowNode getANormalSuccessor() { result = super.getANormalSuccessor() }
21391

21492
/** Gets the basic block containing this flow node. */
21593
BasicBlock getBasicBlock() { result = super.getBasicBlock() }
@@ -353,9 +231,6 @@ class ControlFlowNode extends CfgImpl::ControlFlowNode {
353231

354232
/** Holds if this flow node strictly reaches `other`. */
355233
predicate strictlyReaches(ControlFlowNode other) { this.getASuccessor+() = other }
356-
357-
/** Internal: raw successor predicate that does NOT skip non-canonical nodes. */
358-
CfgImpl::ControlFlowNode getASuccessorRaw() { result = super.getASuccessor() }
359234
}
360235

361236
/**
@@ -377,53 +252,33 @@ private predicate augstore(ControlFlowNode load, ControlFlowNode store) {
377252
load = store
378253
}
379254

380-
/**
381-
* Gets the nearest canonical CFG node reachable from `n` via one or more
382-
* raw CFG edges (skipping non-canonical intermediaries).
383-
*/
384-
private CfgImpl::ControlFlowNode nextCanonicalFrom(CfgImpl::ControlFlowNode n) {
385-
result = n.getASuccessor() and isCanonical(result)
386-
or
387-
exists(CfgImpl::ControlFlowNode mid |
388-
mid = n.getASuccessor() and
389-
not isCanonical(mid) and
390-
result = nextCanonicalFrom(mid)
391-
)
392-
}
393-
394-
/** Gets the nearest canonical CFG successor of canonical node `n`. */
395-
private ControlFlowNode nextCanonical(ControlFlowNode n) { result = nextCanonicalFrom(n) }
396-
397255
/**
398256
* A basic block — a maximal-length sequence of control flow nodes such
399257
* that no node except the first has a predecessor outside the sequence,
400258
* and no node except the last has a successor outside the sequence.
401259
*/
402260
class BasicBlock extends CfgImpl::BasicBlock {
403-
/** Gets the `n`th node in this basic block, restricted to canonical nodes. */
404-
ControlFlowNode getNode(int n) {
405-
result = rank[n + 1](ControlFlowNode node, int i | super.getNode(i) = node | node order by i)
406-
}
261+
/** Gets the `n`th node in this basic block. */
262+
ControlFlowNode getNode(int n) { result = super.getNode(n) }
407263

408264
/** Gets a node in this basic block. */
409-
ControlFlowNode getANode() { result = this.getNode(_) }
265+
ControlFlowNode getANode() { result = super.getNode(_) }
410266

411-
/** Gets the first canonical node in this basic block. */
267+
/** Gets the first node in this basic block. */
412268
ControlFlowNode firstNode() { result = this.getNode(0) }
413269

414-
/** Gets the last canonical node in this basic block. */
415-
ControlFlowNode getLastNode() { result = this.getNode(max(int n | exists(this.getNode(n)))) }
270+
/** Gets the last node in this basic block. */
271+
ControlFlowNode getLastNode() { result = super.getLastNode() }
416272

417273
/** Holds if this basic block contains `node`. */
418274
predicate contains(ControlFlowNode node) { node = this.getANode() }
419275

420276
// Inherited from the shared library's `BasicBlock`:
421277
// getASuccessor(), getASuccessor(SuccessorType), getAPredecessor(),
422-
// getNode(int) (raw, includes non-canonical), getANode() (raw),
423278
// strictlyDominates(), dominates(), getImmediateDominator(),
424279
// length(), inLoop().
425-
// We expose canonical-only positional access via `getNode(int)` below
426-
// (shadows the shared-lib version) and additional Python-style helpers.
280+
// We shadow `getNode(int)` etc. to return `ControlFlowNode` (this
281+
// facade's type) and add Python-style helpers below.
427282
/** Gets a true successor to this basic block. */
428283
BasicBlock getATrueSuccessor() {
429284
result = super.getASuccessor(any(BooleanSuccessor t | t.getValue() = true))
@@ -810,8 +665,13 @@ class IfExprNode extends ControlFlowNode {
810665

811666
override Py::IfExp getNode() { result = super.getNode() }
812667

813-
/** Gets the flow node for one of the operands of an if-expression. */
814-
ControlFlowNode getAnOperand() { result = this.getAPredecessor() }
668+
/** Gets the flow node for one of the value operands (true-branch or false-branch). */
669+
ControlFlowNode getAnOperand() {
670+
exists(Py::IfExp ie |
671+
ie = toAst(this) and
672+
(toAst(result) = ie.getBody() or toAst(result) = ie.getOrelse())
673+
)
674+
}
815675
}
816676

817677
/** A control flow node corresponding to an assignment expression (walrus `:=`). */

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1448,12 +1448,11 @@ private predicate sameEnclosingCallable(Node node1, Node node2) {
14481448
// =============================================================================
14491449
newtype TDataFlowCall =
14501450
TNormalCall(Cfg::CallNode call, Function target, CallType type) {
1451-
resolveCall(call, target, type) and
1452-
Cfg::isCanonicalAstNodeRepresentative(call)
1451+
call.injects(_) and resolveCall(call, target, type)
14531452
} or
14541453
/** A call to the generated function inside a comprehension */
14551454
TComprehensionCall(Comp c) or
1456-
TPotentialLibraryCall(Cfg::CallNode call) { Cfg::isCanonicalAstNodeRepresentative(call) } or
1455+
TPotentialLibraryCall(Cfg::CallNode call) { call.injects(_) } or
14571456
/** A synthesized call inside a summarized callable */
14581457
TSummaryCall(
14591458
FlowSummaryImpl::Public::SummarizedCallable c, FlowSummaryImpl::Private::SummaryNode receiver

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,20 @@ predicate isArgumentNode(ArgumentNode arg, DataFlowCall c, ArgumentPosition pos)
4545
//--------
4646
overlay[local]
4747
predicate isExpressionNode(Cfg::ControlFlowNode node) {
48-
node.getNode() instanceof Expr
49-
or
50-
// `Cfg::ForNode` wraps a `For` statement's iter position, but
51-
// overrides `.getNode()` to return the `Py::For` statement (for
52-
// legacy parity). The underlying AST is still an `Expr` (the iter
53-
// expression); we want a dataflow node here so that for-loop
54-
// content reads (`for y in l`) have a source expression node to
55-
// read content from.
56-
node instanceof Cfg::ForNode
48+
// Restrict to the `injects` representative so the dataflow layer creates
49+
// exactly one `TCfgNode` per AST expression.
50+
node.injects(_) and
51+
(
52+
node.getNode() instanceof Expr
53+
or
54+
// `Cfg::ForNode` wraps a `For` statement's iter position, but
55+
// overrides `.getNode()` to return the `Py::For` statement (for
56+
// legacy parity). The underlying AST is still an `Expr` (the iter
57+
// expression); we want a dataflow node here so that for-loop
58+
// content reads (`for y in l`) have a source expression node to
59+
// read content from.
60+
node instanceof Cfg::ForNode
61+
)
5762
}
5863

5964
// =============================================================================
@@ -418,22 +423,19 @@ module LocalFlow {
418423
// (e.g. boolean-outcome `[true]`/`[false]` or emptiness `[empty]`/`[non-empty]`
419424
// splits of the same AST node), but dataflow values are insensitive to
420425
// those splits — there is at most one `CfgNode` per AST. Project both
421-
// ends through `Cfg::isCanonicalAstNodeRepresentative` so all variants
422-
// contribute their use-use edges to the canonical pair.
426+
// ends through `.getNode()` so all variants contribute their use-use
427+
// edges to the canonical pair.
423428
exists(Cfg::NameNode fromVariant, Cfg::NameNode toVariant |
424429
SsaImpl::AdjacentUses::adjacentUseUse(fromVariant, toVariant) and
425430
fromVariant.getNode() = nodeFrom.getNode() and
426-
toVariant.getNode() = nodeTo.getNode() and
427-
Cfg::isCanonicalAstNodeRepresentative(nodeFrom) and
428-
Cfg::isCanonicalAstNodeRepresentative(nodeTo)
431+
toVariant.getNode() = nodeTo.getNode()
429432
)
430433
}
431434

432435
predicate defToFirstUse(SsaImpl::EssaVariable var, Cfg::NameNode nodeTo) {
433436
exists(Cfg::NameNode toVariant |
434437
SsaImpl::AdjacentUses::firstUse(var.getDefinition(), toVariant) and
435-
toVariant.getNode() = nodeTo.getNode() and
436-
Cfg::isCanonicalAstNodeRepresentative(nodeTo)
438+
toVariant.getNode() = nodeTo.getNode()
437439
)
438440
}
439441

@@ -449,8 +451,7 @@ module LocalFlow {
449451
nodeFrom.(ScopeEntryDefinitionNode).getDefinition() = def
450452
|
451453
SsaImpl::AdjacentUses::firstUse(def, toVariant) and
452-
toVariant.getNode() = nodeTo.(CfgNode).getNode().getNode() and
453-
Cfg::isCanonicalAstNodeRepresentative(nodeTo.(CfgNode).getNode())
454+
toVariant.getNode() = nodeTo.(CfgNode).getNode().getNode()
454455
)
455456
or
456457
// Next use after use

0 commit comments

Comments
 (0)