Skip to content

Commit df0eae9

Browse files
committed
feat(schema,neo4j): first-class external_symbols; fix dropped call edges (#44)
Closes #44

Adopt the model codeanalyzer-typescript uses: external call targets are now first-class in the IR instead of being re-derived ad hoc during Neo4j projection.

- schema: add PyExternalSymbol{name, module} and PyApplication.external_symbols, keyed by signature (mirrors TSExternalSymbol).
- core: _compute_external_symbols() classifies every call-graph endpoint not declared in the symbol table as an external (name/module from the signature), so analysis.json carries external info that was previously a bare target string.
- neo4j: :PyExternal gains a `module` property (SCHEMA_VERSION 1.0.0 -> 1.1.0, additive). project()'s _call_endpoint classifies authoritatively from external_symbols rather than a "present in the graph" heuristic, so an imported module name (a :PyPackage) can no longer shadow a call target and silently drop the PY_CALLS edge.
- rows: track node identity by (merge_label, value) so deferred PY_EXTENDS / PY_RESOLVES_TO edges can't be shadowed either.

Fixes the ~3.7% of call edges (e.g. targets os/re/json) that were dropped from the emitted graph. Adds a regression test and exercises external_symbols in the sample app; regenerates schema.neo4j.json.
1 parent 0850160 commit df0eae9

9 files changed

Lines changed: 144 additions & 21 deletions

File tree

codeanalyzer/core.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,13 @@
88

99
import ray
1010
from codeanalyzer.utils import logger
11-
from codeanalyzer.schema import PyApplication, PyModule, model_dump_json, model_validate_json
11+
from codeanalyzer.schema import (
12+
PyApplication,
13+
PyExternalSymbol,
14+
PyModule,
15+
model_dump_json,
16+
model_validate_json,
17+
)
1218
from codeanalyzer.schema.py_schema import PyCallEdge
1319
from codeanalyzer.semantic_analysis.call_graph import (
1420
jedi_call_graph_edges,
@@ -379,6 +385,43 @@ def __exit__(self, *args, **kwargs) -> None:
379385
logger.info(f"Clearing cache directory: {self.cache_dir}")
380386
shutil.rmtree(self.cache_dir)
381387

388+
@staticmethod
389+
def _compute_external_symbols(symbol_table, call_graph):
390+
"""Build the external-symbol map: every call-graph endpoint whose signature
391+
is not a declared class/callable in the symbol table is an external (an
392+
imported library or builtin member). ``name``/``module`` are derived from
393+
the signature (best effort: split on the last dot)."""
394+
declared = set()
395+
396+
def walk_callable(c):
397+
declared.add(c.signature)
398+
for ic in (c.inner_callables or {}).values():
399+
walk_callable(ic)
400+
for cl in (c.inner_classes or {}).values():
401+
walk_class(cl)
402+
403+
def walk_class(cl):
404+
declared.add(cl.signature)
405+
for m in (cl.methods or {}).values():
406+
walk_callable(m)
407+
for ic in (cl.inner_classes or {}).values():
408+
walk_class(ic)
409+
410+
for mod in symbol_table.values():
411+
for c in (mod.functions or {}).values():
412+
walk_callable(c)
413+
for cl in (mod.classes or {}).values():
414+
walk_class(cl)
415+
416+
externals: Dict[str, PyExternalSymbol] = {}
417+
for edge in call_graph:
418+
for sig in (edge.source, edge.target):
419+
if sig in declared or sig in externals:
420+
continue
421+
module, name = sig.rsplit(".", 1) if "." in sig else (sig, sig)
422+
externals[sig] = PyExternalSymbol(name=name, module=module)
423+
return externals
424+
382425
def analyze(self) -> PyApplication:
383426
"""Analyze the project and return a PyApplication with symbol table.
384427
@@ -418,8 +461,19 @@ def analyze(self) -> PyApplication:
418461
jedi_edges = jedi_call_graph_edges(symbol_table)
419462
call_graph = merge_edges(jedi_edges, codeql_edges)
420463

464+
# Classify call-graph endpoints that are not declared in the symbol table
465+
# (imported library / builtin members) once, so the JSON and Neo4j backends
466+
# share one authoritative external-symbol set.
467+
external_symbols = self._compute_external_symbols(symbol_table, call_graph)
468+
421469
# Recreate pyapplication
422-
app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build()
470+
app = (
471+
PyApplication.builder()
472+
.symbol_table(symbol_table)
473+
.call_graph(call_graph)
474+
.external_symbols(external_symbols)
475+
.build()
476+
)
423477

424478
# Save to cache
425479
self._save_analysis_cache(app, cache_file)

codeanalyzer/neo4j/catalog.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434

3535
from codeanalyzer.neo4j.schema import CONSTRAINTS, INDEXES
3636

37-
SCHEMA_VERSION = "1.0.0"
37+
SCHEMA_VERSION = "1.1.0"
3838

3939
# PropType ∈ {"string", "integer", "float", "boolean", "string[]", "integer[]"}.
4040

@@ -119,7 +119,7 @@ class RelType:
119119
"PyExternal",
120120
"PySymbol",
121121
"signature",
122-
{"signature": "string", "name": "string"},
122+
{"signature": "string", "name": "string", "module": "string"},
123123
),
124124
NodeLabel("PyPackage", "PyPackage", "name", {"name": "string"}),
125125
NodeLabel(

codeanalyzer/neo4j/project.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,12 @@ def project(app: PyApplication, app_name: str) -> GraphRows:
6060
b.edge("PY_HAS_MODULE", app_ref, mod_ref)
6161
_project_module_body(b, file_key, mod_ref, mod)
6262

63-
# The aggregated :PY_CALLS twin. Endpoints not present in the symbol table become
64-
# :PyExternal ghost nodes (the analyzer already preserves them as ghost nodes).
63+
# The aggregated :PY_CALLS twin. Endpoints listed in app.external_symbols become
64+
# :PyExternal ghost nodes; the rest are declared :PySymbol nodes already emitted.
65+
externals = app.external_symbols or {}
6566
for e in app.call_graph:
66-
src = _call_endpoint(b, e.source)
67-
tgt = _call_endpoint(b, e.target)
67+
src = _call_endpoint(b, e.source, externals)
68+
tgt = _call_endpoint(b, e.target, externals)
6869
b.edge("PY_CALLS", src, tgt, _call_edge_props(e.weight, list(e.provenance or [])))
6970

7071
return b.finish()
@@ -74,13 +75,20 @@ def _sym(signature: str) -> NodeRef:
7475
return NodeRef("PySymbol", "signature", signature)
7576

7677

77-
def _call_endpoint(b: RowBuilder, signature: str) -> NodeRef:
78-
"""A call-graph endpoint: a known callable already emitted, or a phantom
79-
:PyExternal symbol materialized on demand for a ghost target."""
80-
if b.has_key(signature):
78+
def _call_endpoint(b: RowBuilder, signature: str, externals: dict) -> NodeRef:
79+
"""A call-graph endpoint: a declared callable already emitted, or an external
80+
symbol (imported library / builtin member) materialized as a :PyExternal ghost.
81+
82+
Classification is authoritative -- it comes from ``app.external_symbols``, not a
83+
"present in the graph" heuristic -- so an imported module name (which exists only
84+
as a :PyPackage) can never shadow the call target. A small fallback still
85+
materializes an external for any endpoint that is neither declared nor listed."""
86+
ext = externals.get(signature)
87+
if ext is None and b.has_key("PySymbol", signature):
8188
return _sym(signature)
82-
name = signature.rsplit(".", 1)[-1] if "." in signature else signature
83-
return b.node(["PySymbol", "PyExternal"], "signature", signature, {"name": name})
89+
name = ext.name if ext is not None else (signature.rsplit(".", 1)[-1] if "." in signature else signature)
90+
module = ext.module if ext is not None else None
91+
return b.node(["PySymbol", "PyExternal"], "signature", signature, prune({"name": name, "module": module}))
8492

8593

8694
# ----------------------------------------------------------------------------------------------

codeanalyzer/neo4j/rows.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,11 @@ def __init__(self) -> None:
8383
self._nodes: Dict[str, NodeRow] = {} # key: f"{labels[0]} {value}"
8484
self._edges: List[EdgeRow] = []
8585
self._deferred: List[EdgeRow] = [] # edges gated against node existence at finish()
86-
self._keys: set = set() # every node value seen, for resolved-gating
86+
# (merge_label, value) of every node seen, for resolved-gating. Keyed by
87+
# label too so a :PyPackage name can't shadow a :PySymbol signature (and
88+
# vice versa) — otherwise a call to an imported module name like ``os``
89+
# resolves to a :PySymbol node that was never created and the edge is lost.
90+
self._keys: set = set()
8791

8892
def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> NodeRef:
8993
"""Upsert a node. Re-seeing the same ``(labels[0], value)`` merges props
@@ -98,7 +102,7 @@ def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> No
98102
existing.labels.append(label)
99103
else:
100104
self._nodes[node_id] = NodeRow(list(labels), key_prop, value, dict(props))
101-
self._keys.add(value)
105+
self._keys.add((labels[0], value))
102106
return NodeRef(labels[0], key_prop, value)
103107

104108
def edge(self, type_: str, from_ref: NodeRef, to_ref: NodeRef, props: Optional[Props] = None) -> None:
@@ -121,12 +125,13 @@ def edge_to_symbol(
121125
)
122126
)
123127

124-
def has_key(self, value: str) -> bool:
125-
return value in self._keys
128+
def has_key(self, label: str, value: str) -> bool:
129+
"""Whether a node with this ``(merge_label, value)`` identity was emitted."""
130+
return (label, value) in self._keys
126131

127132
def finish(self) -> GraphRows:
128133
for e in self._deferred:
129-
if e.to_ref.value in self._keys:
134+
if (e.to_ref.label, e.to_ref.value) in self._keys:
130135
self._edges.append(e)
131136
nodes = sorted(self._nodes.values(), key=lambda n: f"{n.labels[0]} {n.value}")
132137
edges = sorted(self._edges, key=lambda e: f"{e.type} {e.from_ref.value} {e.to_ref.value}")

codeanalyzer/schema/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
PyClass,
99
PyClassAttribute,
1010
PyComment,
11+
PyExternalSymbol,
1112
PyImport,
1213
PyModule,
1314
PyVariableDeclaration,
1415
)
1516

1617
__all__ = [
1718
"PyApplication",
19+
"PyExternalSymbol",
1820
"PyImport",
1921
"PyComment",
2022
"PyModule",

codeanalyzer/schema/py_schema.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,10 +358,25 @@ class PyCallEdge(BaseModel):
358358
provenance: List[Literal["jedi", "codeql", "joern"]] = []
359359

360360

361+
@builder
362+
@msgpk
363+
class PyExternalSymbol(BaseModel):
364+
"""A call-graph target outside the analyzed project -- an imported library or
365+
builtin member. Mirrors codeanalyzer-typescript's ``TSExternalSymbol`` and is
366+
keyed in ``PyApplication.external_symbols`` by its call-graph signature."""
367+
368+
name: str # the member/short name, e.g. "get" for "requests.get"
369+
module: Optional[str] = None # best-effort owning module, e.g. "requests"
370+
371+
361372
@builder
362373
@msgpk
363374
class PyApplication(BaseModel):
364375
"""Represents a Python application."""
365376

366377
symbol_table: Dict[str, PyModule]
367378
call_graph: List[PyCallEdge] = []
379+
# Call-graph endpoints not declared in the symbol table (imported library /
380+
# builtin members), keyed by signature. Populated by the analyzer so every
381+
# backend (JSON and Neo4j) shares one authoritative external-symbol set.
382+
external_symbols: Dict[str, PyExternalSymbol] = {}

schema.neo4j.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"schema_version": "1.0.0",
2+
"schema_version": "1.1.0",
33
"generator": "codeanalyzer-python",
44
"marker_labels": [],
55
"node_labels": [
@@ -67,7 +67,8 @@
6767
"key": "signature",
6868
"properties": {
6969
"signature": "string",
70-
"name": "string"
70+
"name": "string",
71+
"module": "string"
7172
}
7273
},
7374
{

test/sample_graph_app.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
PyClass,
1515
PyClassAttribute,
1616
PyComment,
17+
PyExternalSymbol,
1718
PyImport,
1819
PyModule,
1920
PyVariableDeclaration,
@@ -149,4 +150,7 @@ def make_sample_app() -> PyApplication:
149150
return PyApplication(
150151
symbol_table={"src/service.py": service_mod, "src/util.py": util_mod},
151152
call_graph=call_graph,
153+
# The ghost edge's target (requests.get) is a library member, recorded as a
154+
# first-class external symbol so the projection emits a :PyExternal for it.
155+
external_symbols={"requests.get": PyExternalSymbol(name="get", module="requests")},
152156
)

test/test_neo4j_schema.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from codeanalyzer.neo4j import NODE_LABELS, REL_TYPES, build_schema_document, project
1313
from codeanalyzer.neo4j.catalog import MARKER_LABELS
1414
from codeanalyzer.neo4j.cypher import render_cypher
15+
from codeanalyzer.schema import PyApplication, PyCallable, PyImport, PyModule
16+
from codeanalyzer.schema.py_schema import PyCallEdge
1517

1618
from sample_graph_app import make_sample_app
1719

@@ -87,6 +89,38 @@ def test_render_cypher_is_deterministic_and_self_contained():
8789
assert "MERGE (n:PySymbol {signature: row.k})" in a
8890

8991

92+
def test_call_edge_to_imported_module_name_is_not_dropped():
93+
"""Regression for #44: a call whose target is a bare module name that is also
94+
imported (e.g. ``os``) must not be dropped. The import creates a :PyPackage
95+
named ``os``; that must not shadow the call target's :PySymbol signature."""
96+
caller = PyCallable(
97+
name="caller", path="m.py", signature="m.caller", return_type="None",
98+
code="def caller():\n os.getcwd()", start_line=1, end_line=2,
99+
code_start_line=1, cyclomatic_complexity=1,
100+
)
101+
mod = PyModule(
102+
file_path="m.py", module_name="m",
103+
imports=[PyImport(module="os", name="getcwd")],
104+
functions={"caller": caller},
105+
content_hash="h", last_modified=1.0, file_size=10,
106+
)
107+
app = PyApplication(
108+
symbol_table={"m.py": mod},
109+
call_graph=[PyCallEdge(source="m.caller", target="os", weight=1, provenance=["jedi"])],
110+
)
111+
rows = project(app, "app")
112+
113+
calls_to_os = [e for e in rows.edges if e.type == "PY_CALLS" and e.to_ref.value == "os"]
114+
assert len(calls_to_os) == 1, "PY_CALLS edge to imported module name 'os' was dropped"
115+
116+
# 'os' is materialized as a :PyExternal symbol (the call target) ...
117+
assert any(n.value == "os" and "PyExternal" in n.labels for n in rows.nodes), \
118+
":PyExternal ghost for the call target 'os' is missing"
119+
# ... distinct from the :PyPackage 'os' created by the import.
120+
assert any(n.value == "os" and "PyPackage" in n.labels for n in rows.nodes), \
121+
":PyPackage for the import 'os' is missing"
122+
123+
90124
def test_checked_in_schema_matches_catalog():
91125
"""Run `canpy --emit schema > schema.neo4j.json` if this fails."""
92126
on_disk_path = Path(__file__).resolve().parents[1] / "schema.neo4j.json"

0 commit comments

Comments
 (0)