feat(schema,neo4j): first-class external_symbols; fix dropped call edges (#44)

rahlk · rahlk · commit df0eae935269 · 2026-06-22T17:02:48.000-04:00
Closes #44

Adopt the model codeanalyzer-typescript uses: external call targets are now first-class in the IR instead of being re-derived ad hoc during Neo4j projection.

- schema: add PyExternalSymbol{name, module} and PyApplication.external_symbols, keyed by signature (mirrors TSExternalSymbol).
- core: _compute_external_symbols() classifies every call-graph endpoint not declared in the symbol table as an external (name/module from the signature), so analysis.json carries external info that was previously a bare target string.
- neo4j: :PyExternal gains a `module` property (SCHEMA_VERSION 1.0.0 -> 1.1.0, additive). project()'s _call_endpoint classifies authoritatively from external_symbols rather than a "present in the graph" heuristic, so an imported module name (a :PyPackage) can no longer shadow a call target and silently drop the PY_CALLS edge.
- rows: track node identity by (merge_label, value) so deferred PY_EXTENDS / PY_RESOLVES_TO edges can't be shadowed either.

Fixes the ~3.7% of call edges (e.g. targets os/re/json) that were dropped from the emitted graph. Adds a regression test and exercises external_symbols in the sample app; regenerates schema.neo4j.json.
diff --git a/codeanalyzer/core.py b/codeanalyzer/core.py
@@ -8,7 +8,13 @@
 
 import ray
 from codeanalyzer.utils import logger
-from codeanalyzer.schema import PyApplication, PyModule, model_dump_json, model_validate_json
+from codeanalyzer.schema import (
+    PyApplication,
+    PyExternalSymbol,
+    PyModule,
+    model_dump_json,
+    model_validate_json,
+)
 from codeanalyzer.schema.py_schema import PyCallEdge
 from codeanalyzer.semantic_analysis.call_graph import (
     jedi_call_graph_edges,
@@ -379,6 +385,43 @@ def __exit__(self, *args, **kwargs) -> None:
             logger.info(f"Clearing cache directory: {self.cache_dir}")
             shutil.rmtree(self.cache_dir)
 
+    @staticmethod
+    def _compute_external_symbols(symbol_table, call_graph):
+        """Build the external-symbol map: every call-graph endpoint whose signature
+        is not a declared class/callable in the symbol table is an external (an
+        imported library or builtin member). ``name``/``module`` are derived from
+        the signature (best effort: split on the last dot)."""
+        declared = set()
+
+        def walk_callable(c):
+            declared.add(c.signature)
+            for ic in (c.inner_callables or {}).values():
+                walk_callable(ic)
+            for cl in (c.inner_classes or {}).values():
+                walk_class(cl)
+
+        def walk_class(cl):
+            declared.add(cl.signature)
+            for m in (cl.methods or {}).values():
+                walk_callable(m)
+            for ic in (cl.inner_classes or {}).values():
+                walk_class(ic)
+
+        for mod in symbol_table.values():
+            for c in (mod.functions or {}).values():
+                walk_callable(c)
+            for cl in (mod.classes or {}).values():
+                walk_class(cl)
+
+        externals: Dict[str, PyExternalSymbol] = {}
+        for edge in call_graph:
+            for sig in (edge.source, edge.target):
+                if sig in declared or sig in externals:
+                    continue
+                module, name = sig.rsplit(".", 1) if "." in sig else (sig, sig)
+                externals[sig] = PyExternalSymbol(name=name, module=module)
+        return externals
+
     def analyze(self) -> PyApplication:
         """Analyze the project and return a PyApplication with symbol table.
         
@@ -418,8 +461,19 @@ def analyze(self) -> PyApplication:
         jedi_edges = jedi_call_graph_edges(symbol_table)
         call_graph = merge_edges(jedi_edges, codeql_edges)
 
+        # Classify call-graph endpoints that are not declared in the symbol table
+        # (imported library / builtin members) once, so the JSON and Neo4j backends
+        # share one authoritative external-symbol set.
+        external_symbols = self._compute_external_symbols(symbol_table, call_graph)
+
         # Recreate pyapplication
-        app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build()
+        app = (
+            PyApplication.builder()
+            .symbol_table(symbol_table)
+            .call_graph(call_graph)
+            .external_symbols(external_symbols)
+            .build()
+        )
         
         # Save to cache
         self._save_analysis_cache(app, cache_file)
diff --git a/codeanalyzer/neo4j/catalog.py b/codeanalyzer/neo4j/catalog.py
@@ -34,7 +34,7 @@
 
 from codeanalyzer.neo4j.schema import CONSTRAINTS, INDEXES
 
-SCHEMA_VERSION = "1.0.0"
+SCHEMA_VERSION = "1.1.0"
 
 # PropType ∈ {"string", "integer", "float", "boolean", "string[]", "integer[]"}.
 
@@ -119,7 +119,7 @@ class RelType:
         "PyExternal",
         "PySymbol",
         "signature",
-        {"signature": "string", "name": "string"},
+        {"signature": "string", "name": "string", "module": "string"},
     ),
     NodeLabel("PyPackage", "PyPackage", "name", {"name": "string"}),
     NodeLabel(
diff --git a/codeanalyzer/neo4j/project.py b/codeanalyzer/neo4j/project.py
@@ -60,11 +60,12 @@ def project(app: PyApplication, app_name: str) -> GraphRows:
         b.edge("PY_HAS_MODULE", app_ref, mod_ref)
         _project_module_body(b, file_key, mod_ref, mod)
 
-    # The aggregated :PY_CALLS twin. Endpoints not present in the symbol table become
-    # :PyExternal ghost nodes (the analyzer already preserves them as ghost nodes).
+    # The aggregated :PY_CALLS twin. Endpoints listed in app.external_symbols become
+    # :PyExternal ghost nodes; the rest are declared :PySymbol nodes already emitted.
+    externals = app.external_symbols or {}
     for e in app.call_graph:
-        src = _call_endpoint(b, e.source)
-        tgt = _call_endpoint(b, e.target)
+        src = _call_endpoint(b, e.source, externals)
+        tgt = _call_endpoint(b, e.target, externals)
         b.edge("PY_CALLS", src, tgt, _call_edge_props(e.weight, list(e.provenance or [])))
 
     return b.finish()
@@ -74,13 +75,20 @@ def _sym(signature: str) -> NodeRef:
     return NodeRef("PySymbol", "signature", signature)
 
 
-def _call_endpoint(b: RowBuilder, signature: str) -> NodeRef:
-    """A call-graph endpoint: a known callable already emitted, or a phantom
-    :PyExternal symbol materialized on demand for a ghost target."""
-    if b.has_key(signature):
+def _call_endpoint(b: RowBuilder, signature: str, externals: dict) -> NodeRef:
+    """A call-graph endpoint: a declared callable already emitted, or an external
+    symbol (imported library / builtin member) materialized as a :PyExternal ghost.
+
+    Classification is authoritative -- it comes from ``app.external_symbols``, not a
+    "present in the graph" heuristic -- so an imported module name (which exists only
+    as a :PyPackage) can never shadow the call target. A small fallback still
+    materializes an external for any endpoint that is neither declared nor listed."""
+    ext = externals.get(signature)
+    if ext is None and b.has_key("PySymbol", signature):
         return _sym(signature)
-    name = signature.rsplit(".", 1)[-1] if "." in signature else signature
-    return b.node(["PySymbol", "PyExternal"], "signature", signature, {"name": name})
+    name = ext.name if ext is not None else (signature.rsplit(".", 1)[-1] if "." in signature else signature)
+    module = ext.module if ext is not None else None
+    return b.node(["PySymbol", "PyExternal"], "signature", signature, prune({"name": name, "module": module}))
 
 
 # ----------------------------------------------------------------------------------------------
diff --git a/codeanalyzer/neo4j/rows.py b/codeanalyzer/neo4j/rows.py
@@ -83,7 +83,11 @@ def __init__(self) -> None:
         self._nodes: Dict[str, NodeRow] = {}  # key: f"{labels[0]} {value}"
         self._edges: List[EdgeRow] = []
         self._deferred: List[EdgeRow] = []  # edges gated against node existence at finish()
-        self._keys: set = set()  # every node value seen, for resolved-gating
+        # (merge_label, value) of every node seen, for resolved-gating. Keyed by
+        # label too so a :PyPackage name can't shadow a :PySymbol signature (and
+        # vice versa) — otherwise a call to an imported module name like ``os``
+        # resolves to a :PySymbol node that was never created and the edge is lost.
+        self._keys: set = set()
 
     def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> NodeRef:
         """Upsert a node. Re-seeing the same ``(labels[0], value)`` merges props
@@ -98,7 +102,7 @@ def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> No
                     existing.labels.append(label)
         else:
             self._nodes[node_id] = NodeRow(list(labels), key_prop, value, dict(props))
-        self._keys.add(value)
+        self._keys.add((labels[0], value))
         return NodeRef(labels[0], key_prop, value)
 
     def edge(self, type_: str, from_ref: NodeRef, to_ref: NodeRef, props: Optional[Props] = None) -> None:
@@ -121,12 +125,13 @@ def edge_to_symbol(
             )
         )
 
-    def has_key(self, value: str) -> bool:
-        return value in self._keys
+    def has_key(self, label: str, value: str) -> bool:
+        """Whether a node with this ``(merge_label, value)`` identity was emitted."""
+        return (label, value) in self._keys
 
     def finish(self) -> GraphRows:
         for e in self._deferred:
-            if e.to_ref.value in self._keys:
+            if (e.to_ref.label, e.to_ref.value) in self._keys:
                 self._edges.append(e)
         nodes = sorted(self._nodes.values(), key=lambda n: f"{n.labels[0]} {n.value}")
         edges = sorted(self._edges, key=lambda e: f"{e.type} {e.from_ref.value} {e.to_ref.value}")
diff --git a/codeanalyzer/schema/__init__.py b/codeanalyzer/schema/__init__.py
@@ -8,13 +8,15 @@
     PyClass,
     PyClassAttribute,
     PyComment,
+    PyExternalSymbol,
     PyImport,
     PyModule,
     PyVariableDeclaration,
 )
 
 __all__ = [
     "PyApplication",
+    "PyExternalSymbol",
     "PyImport",
     "PyComment",
     "PyModule",
diff --git a/codeanalyzer/schema/py_schema.py b/codeanalyzer/schema/py_schema.py
@@ -358,10 +358,25 @@ class PyCallEdge(BaseModel):
     provenance: List[Literal["jedi", "codeql", "joern"]] = []
 
 
+@builder
+@msgpk
+class PyExternalSymbol(BaseModel):
+    """A call-graph target outside the analyzed project -- an imported library or
+    builtin member. Mirrors codeanalyzer-typescript's ``TSExternalSymbol`` and is
+    keyed in ``PyApplication.external_symbols`` by its call-graph signature."""
+
+    name: str  # the member/short name, e.g. "get" for "requests.get"
+    module: Optional[str] = None  # best-effort owning module, e.g. "requests"
+
+
 @builder
 @msgpk
 class PyApplication(BaseModel):
     """Represents a Python application."""
 
     symbol_table: Dict[str, PyModule]
     call_graph: List[PyCallEdge] = []
+    # Call-graph endpoints not declared in the symbol table (imported library /
+    # builtin members), keyed by signature. Populated by the analyzer so every
+    # backend (JSON and Neo4j) shares one authoritative external-symbol set.
+    external_symbols: Dict[str, PyExternalSymbol] = {}
diff --git a/schema.neo4j.json b/schema.neo4j.json
@@ -1,5 +1,5 @@
 {
-  "schema_version": "1.0.0",
+  "schema_version": "1.1.0",
   "generator": "codeanalyzer-python",
   "marker_labels": [],
   "node_labels": [
@@ -67,7 +67,8 @@
       "key": "signature",
       "properties": {
         "signature": "string",
-        "name": "string"
+        "name": "string",
+        "module": "string"
       }
     },
     {
diff --git a/test/sample_graph_app.py b/test/sample_graph_app.py
@@ -14,6 +14,7 @@
     PyClass,
     PyClassAttribute,
     PyComment,
+    PyExternalSymbol,
     PyImport,
     PyModule,
     PyVariableDeclaration,
@@ -149,4 +150,7 @@ def make_sample_app() -> PyApplication:
     return PyApplication(
         symbol_table={"src/service.py": service_mod, "src/util.py": util_mod},
         call_graph=call_graph,
+        # The ghost edge's target (requests.get) is a library member, recorded as a
+        # first-class external symbol so the projection emits a :PyExternal for it.
+        external_symbols={"requests.get": PyExternalSymbol(name="get", module="requests")},
     )
diff --git a/test/test_neo4j_schema.py b/test/test_neo4j_schema.py
@@ -12,6 +12,8 @@
 from codeanalyzer.neo4j import NODE_LABELS, REL_TYPES, build_schema_document, project
 from codeanalyzer.neo4j.catalog import MARKER_LABELS
 from codeanalyzer.neo4j.cypher import render_cypher
+from codeanalyzer.schema import PyApplication, PyCallable, PyImport, PyModule
+from codeanalyzer.schema.py_schema import PyCallEdge
 
 from sample_graph_app import make_sample_app
 
@@ -87,6 +89,38 @@ def test_render_cypher_is_deterministic_and_self_contained():
     assert "MERGE (n:PySymbol {signature: row.k})" in a
 
 
+def test_call_edge_to_imported_module_name_is_not_dropped():
+    """Regression for #44: a call whose target is a bare module name that is also
+    imported (e.g. ``os``) must not be dropped. The import creates a :PyPackage
+    named ``os``; that must not shadow the call target's :PySymbol signature."""
+    caller = PyCallable(
+        name="caller", path="m.py", signature="m.caller", return_type="None",
+        code="def caller():\n    os.getcwd()", start_line=1, end_line=2,
+        code_start_line=1, cyclomatic_complexity=1,
+    )
+    mod = PyModule(
+        file_path="m.py", module_name="m",
+        imports=[PyImport(module="os", name="getcwd")],
+        functions={"caller": caller},
+        content_hash="h", last_modified=1.0, file_size=10,
+    )
+    app = PyApplication(
+        symbol_table={"m.py": mod},
+        call_graph=[PyCallEdge(source="m.caller", target="os", weight=1, provenance=["jedi"])],
+    )
+    rows = project(app, "app")
+
+    calls_to_os = [e for e in rows.edges if e.type == "PY_CALLS" and e.to_ref.value == "os"]
+    assert len(calls_to_os) == 1, "PY_CALLS edge to imported module name 'os' was dropped"
+
+    # 'os' is materialized as a :PyExternal symbol (the call target) ...
+    assert any(n.value == "os" and "PyExternal" in n.labels for n in rows.nodes), \
+        ":PyExternal ghost for the call target 'os' is missing"
+    # ... distinct from the :PyPackage 'os' created by the import.
+    assert any(n.value == "os" and "PyPackage" in n.labels for n in rows.nodes), \
+        ":PyPackage for the import 'os' is missing"
+
+
 def test_checked_in_schema_matches_catalog():
     """Run `canpy --emit schema > schema.neo4j.json` if this fails."""
     on_disk_path = Path(__file__).resolve().parents[1] / "schema.neo4j.json"

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,5 +1,5 @@` |
| `1` | `1` | `{` |
| `2` |  | `- "schema_version": "1.0.0",` |
|  | `2` | `+ "schema_version": "1.1.0",` |
| `3` | `3` | `"generator": "codeanalyzer-python",` |
| `4` | `4` | `"marker_labels": [],` |
| `5` | `5` | `"node_labels": [` |
|  |  | `@@ -67,7 +67,8 @@` |
| `67` | `67` | `"key": "signature",` |
| `68` | `68` | `"properties": {` |
| `69` | `69` | `"signature": "string",` |
| `70` |  | `- "name": "string"` |
|  | `70` | `+ "name": "string",` |
|  | `71` | `+ "module": "string"` |
| `71` | `72` | `}` |
| `72` | `73` | `},` |
| `73` | `74` | `{` |