Skip to content

Commit 41cc449

Browse files
authored
Merge pull request #48 from codellm-devkit/fix/issues-44-45-46-47
Analysis venv (uv + Jedi wiring), external_symbols, app-scoped prune, --no-venv (#44 #45 #46 #47)
2 parents aa60bd7 + 63cf46f commit 41cc449

16 files changed

Lines changed: 271 additions & 39 deletions

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,11 @@ $ canpy --help
185185
│ [default: lazy] │
186186
│ --skip-tests --include-tests Skip test files in analysis. │
187187
│ [default: skip-tests] │
188+
│ --no-venv --venv Skip virtualenv creation and │
189+
│ dependency installation; resolve │
190+
│ imports against the ambient Python │
191+
│ environment instead. │
192+
│ [default: venv] │
188193
│ --file-name PATH Analyze only the specified file │
189194
│ (relative to input directory). │
190195
│ --cache-dir -c PATH Directory to store analysis cache. │

codeanalyzer/__main__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,14 @@ def main(
104104
help="Skip test files in analysis.",
105105
),
106106
] = True,
107+
no_venv: Annotated[
108+
bool,
109+
typer.Option(
110+
"--no-venv/--venv",
111+
help="Skip virtualenv creation and dependency installation; resolve "
112+
"imports against the ambient Python environment instead.",
113+
),
114+
] = False,
107115
file_name: Annotated[
108116
Optional[Path],
109117
typer.Option(
@@ -144,6 +152,7 @@ def main(
144152
using_ray=using_ray,
145153
rebuild_analysis=rebuild_analysis,
146154
skip_tests=skip_tests,
155+
no_venv=no_venv,
147156
file_name=file_name,
148157
cache_dir=cache_dir,
149158
clear_cache=clear_cache,

codeanalyzer/core.py

Lines changed: 100 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,13 @@
88

99
import ray
1010
from codeanalyzer.utils import logger
11-
from codeanalyzer.schema import PyApplication, PyModule, model_dump_json, model_validate_json
11+
from codeanalyzer.schema import (
12+
PyApplication,
13+
PyExternalSymbol,
14+
PyModule,
15+
model_dump_json,
16+
model_validate_json,
17+
)
1218
from codeanalyzer.schema.py_schema import PyCallEdge
1319
from codeanalyzer.semantic_analysis.call_graph import (
1420
jedi_call_graph_edges,
@@ -60,6 +66,7 @@ def __init__(self, options: AnalysisOptions) -> None:
6066
self.skip_tests = options.skip_tests
6167
self.using_codeql = options.using_codeql
6268
self.rebuild_analysis = options.rebuild_analysis
69+
self.no_venv = options.no_venv
6370
self.cache_dir = (
6471
options.cache_dir.resolve() if options.cache_dir is not None else self.project_dir
6572
) / ".codeanalyzer"
@@ -226,13 +233,41 @@ def _get_base_interpreter() -> Path:
226233
f"a working Python interpreter that can create virtual environments."
227234
)
228235

236+
@staticmethod
def _uv_bin() -> Optional[str]:
    """Locate a ``uv`` executable, or return ``None`` when there is none.

    Lookup order:

    1. the binary bundled with the ``uv`` PyPI package, found through
       ``uv.find_uv_bin()``;
    2. a ``uv`` executable on ``PATH`` (``shutil.which``).

    Returns:
        The executable path as a string, or ``None`` if neither lookup finds
        one; ``_install_into_venv`` then uses the venv's own pip instead.
    """
    try:
        # Imported lazily so a missing or broken ``uv`` package only downgrades
        # us to the PATH lookup instead of failing when this module is imported.
        from uv import find_uv_bin as locate_bundled_uv

        bundled_path = locate_bundled_uv()
        return str(bundled_path)
    except Exception:
        # Deliberately broad: any failure of the bundled lookup is non-fatal.
        path_lookup = shutil.which("uv")
        return path_lookup
247+
248+
def _install_into_venv(self, venv_python: Path, args: List[str]) -> None:
    """Run a package installation that targets the analysis virtualenv.

    ``uv pip install --python <venv_python>`` is used when ``_uv_bin()`` finds
    a uv executable (chosen for speed: parallel downloads and a shared global
    cache); otherwise the venv interpreter's own ``-m pip install`` is used.

    Args:
        venv_python: Python interpreter of the target virtualenv.
        args: Extra installer arguments appended verbatim after ``install``
            (for example ``["-r", "requirements.txt"]`` or ``["-e", path]``).

    The command is executed through ``self._cmd_exec_helper`` with
    ``cwd=self.project_dir`` and ``check=True``.
    """
    uv_executable = self._uv_bin()
    interpreter = str(venv_python)
    # uv selects the environment via --python; pip is run as a module of the
    # venv interpreter itself, so no environment flag is needed there.
    command_prefix = (
        [uv_executable, "pip", "install", "--python", interpreter]
        if uv_executable
        else [interpreter, "-m", "pip", "install"]
    )
    self._cmd_exec_helper([*command_prefix, *args], cwd=self.project_dir, check=True)
258+
229259
def __enter__(self) -> "Codeanalyzer":
230260
# If no virtualenv is provided, try to create one using requirements.txt or pyproject.toml
231261
venv_path = self.cache_dir / self.project_dir.name / "virtualenv"
232262
# Ensure the cache directory exists for this project
233263
venv_path.parent.mkdir(parents=True, exist_ok=True)
264+
if self.no_venv:
265+
logger.info(
266+
"--no-venv: using the ambient Python environment "
267+
"(skipping virtualenv creation and dependency installation)"
268+
)
234269
# Create the virtual environment if it does not exist
235-
if not venv_path.exists() or self.rebuild_analysis:
270+
if not self.no_venv and (not venv_path.exists() or self.rebuild_analysis):
236271
logger.info(f"(Re-)creating virtual environment at {venv_path}")
237272
self._cmd_exec_helper(
238273
[str(self._get_base_interpreter()), "-m", "venv", str(venv_path)],
@@ -249,24 +284,19 @@ def __enter__(self) -> "Codeanalyzer":
249284
("test-requirements.txt", ["-r"]),
250285
]
251286

252-
for dep_file, pip_args in dependency_files:
287+
for dep_file, _ in dependency_files:
253288
if (self.project_dir / dep_file).exists():
254289
logger.info(f"Installing dependencies from {dep_file}")
255-
self._cmd_exec_helper(
256-
[str(venv_python), "-m", "pip", "install", "-U"] + pip_args + [str(self.project_dir / dep_file)],
257-
cwd=self.project_dir,
258-
check=True,
290+
self._install_into_venv(
291+
venv_python,
292+
["--upgrade", "-r", str(self.project_dir / dep_file)],
259293
)
260294

261295
# Handle Pipenv files
262296
if (self.project_dir / "Pipfile").exists():
263297
logger.info("Installing dependencies from Pipfile")
264298
# Note: This would require pipenv to be installed
265-
self._cmd_exec_helper(
266-
[str(venv_python), "-m", "pip", "install", "pipenv"],
267-
cwd=self.project_dir,
268-
check=True,
269-
)
299+
self._install_into_venv(venv_python, ["pipenv"])
270300
self._cmd_exec_helper(
271301
["pipenv", "install", "--dev"],
272302
cwd=self.project_dir,
@@ -289,14 +319,18 @@ def __enter__(self) -> "Codeanalyzer":
289319

290320
if any((self.project_dir / file).exists() for file in package_definition_files):
291321
logger.info("Installing project in editable mode")
292-
self._cmd_exec_helper(
293-
[str(venv_python), "-m", "pip", "install", "-e", str(self.project_dir)],
294-
cwd=self.project_dir,
295-
check=True,
296-
)
322+
self._install_into_venv(venv_python, ["-e", str(self.project_dir)])
297323
else:
298324
logger.warning("No package definition files found, skipping editable installation")
299325

326+
# Point Jedi at the analysis venv so it resolves the project's third-party
327+
# imports. This runs on both a fresh build and a lazy reuse of an existing
328+
# venv -- previously self.virtualenv stayed None, so the install above was
329+
# never actually used by the symbol-table builder. With --no-venv we leave
330+
# it None so Jedi resolves against the ambient interpreter instead.
331+
if not self.no_venv and venv_path.exists():
332+
self.virtualenv = venv_path
333+
300334
if self.using_codeql:
301335
logger.info(f"(Re-)initializing CodeQL analysis for {self.project_dir}")
302336

@@ -358,6 +392,43 @@ def __exit__(self, *args, **kwargs) -> None:
358392
logger.info(f"Clearing cache directory: {self.cache_dir}")
359393
shutil.rmtree(self.cache_dir)
360394

395+
@staticmethod
396+
def _compute_external_symbols(symbol_table, call_graph):
397+
"""Build the external-symbol map: every call-graph endpoint whose signature
398+
is not a declared class/callable in the symbol table is an external (an
399+
imported library or builtin member). ``name``/``module`` are derived from
400+
the signature (best effort: split on the last dot)."""
401+
declared = set()
402+
403+
def walk_callable(c):
404+
declared.add(c.signature)
405+
for ic in (c.inner_callables or {}).values():
406+
walk_callable(ic)
407+
for cl in (c.inner_classes or {}).values():
408+
walk_class(cl)
409+
410+
def walk_class(cl):
411+
declared.add(cl.signature)
412+
for m in (cl.methods or {}).values():
413+
walk_callable(m)
414+
for ic in (cl.inner_classes or {}).values():
415+
walk_class(ic)
416+
417+
for mod in symbol_table.values():
418+
for c in (mod.functions or {}).values():
419+
walk_callable(c)
420+
for cl in (mod.classes or {}).values():
421+
walk_class(cl)
422+
423+
externals: Dict[str, PyExternalSymbol] = {}
424+
for edge in call_graph:
425+
for sig in (edge.source, edge.target):
426+
if sig in declared or sig in externals:
427+
continue
428+
module, name = sig.rsplit(".", 1) if "." in sig else (sig, sig)
429+
externals[sig] = PyExternalSymbol(name=name, module=module)
430+
return externals
431+
361432
def analyze(self) -> PyApplication:
362433
"""Analyze the project and return a PyApplication with symbol table.
363434
@@ -397,8 +468,19 @@ def analyze(self) -> PyApplication:
397468
jedi_edges = jedi_call_graph_edges(symbol_table)
398469
call_graph = merge_edges(jedi_edges, codeql_edges)
399470

471+
# Classify call-graph endpoints that are not declared in the symbol table
472+
# (imported library / builtin members) once, so the JSON and Neo4j backends
473+
# share one authoritative external-symbol set.
474+
external_symbols = self._compute_external_symbols(symbol_table, call_graph)
475+
400476
# Recreate pyapplication
401-
app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build()
477+
app = (
478+
PyApplication.builder()
479+
.symbol_table(symbol_table)
480+
.call_graph(call_graph)
481+
.external_symbols(external_symbols)
482+
.build()
483+
)
402484

403485
# Save to cache
404486
self._save_analysis_cache(app, cache_file)

codeanalyzer/neo4j/bolt.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,13 @@ def session():
7777
for stmt in [*CONSTRAINTS, *INDEXES]:
7878
s.run(stmt)
7979

80+
# The application anchor (a shared node) — used to scope the orphan prune
81+
# so it never touches modules belonging to a different :PyApplication.
82+
app_name = next(
83+
(n.value for n in rows.nodes if n.labels and n.labels[0] == "PyApplication"),
84+
None,
85+
)
86+
8087
# Partition nodes by owning module; shared nodes have no _module.
8188
by_module: Dict[str, List[NodeRow]] = {}
8289
shared: List[NodeRow] = []
@@ -135,13 +142,17 @@ def _purge(tx, module=m, node_keys=keys):
135142
_upsert_edges(session, neo4j, edges)
136143

137144
# 6. orphan prune — only safe on a full run (a targeted run can't tell deleted from untargeted).
138-
if full_run:
145+
# Scope to THIS application's anchor so a full run for application B never
146+
# deletes application A's modules from a shared database.
147+
if full_run and app_name is not None:
139148
present = list(by_module.keys())
140149
with session() as s:
141150
res = s.run(
142-
"MATCH (m:PyModule) WHERE NOT m.file_key IN $present "
151+
"MATCH (:PyApplication {name: $app})-[:PY_HAS_MODULE]->(m:PyModule) "
152+
"WHERE NOT m.file_key IN $present "
143153
f"OPTIONAL MATCH (m)-{DESCENDANTS}->(x) DETACH DELETE x, m "
144154
"RETURN count(m) AS pruned",
155+
app=app_name,
145156
present=present,
146157
)
147158
pruned = res.single()

codeanalyzer/neo4j/catalog.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434

3535
from codeanalyzer.neo4j.schema import CONSTRAINTS, INDEXES
3636

37-
SCHEMA_VERSION = "1.0.0"
37+
SCHEMA_VERSION = "1.1.0"
3838

3939
# PropType ∈ {"string", "integer", "float", "boolean", "string[]", "integer[]"}.
4040

@@ -119,7 +119,7 @@ class RelType:
119119
"PyExternal",
120120
"PySymbol",
121121
"signature",
122-
{"signature": "string", "name": "string"},
122+
{"signature": "string", "name": "string", "module": "string"},
123123
),
124124
NodeLabel("PyPackage", "PyPackage", "name", {"name": "string"}),
125125
NodeLabel(

codeanalyzer/neo4j/project.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,12 @@ def project(app: PyApplication, app_name: str) -> GraphRows:
6060
b.edge("PY_HAS_MODULE", app_ref, mod_ref)
6161
_project_module_body(b, file_key, mod_ref, mod)
6262

63-
# The aggregated :PY_CALLS twin. Endpoints not present in the symbol table become
64-
# :PyExternal ghost nodes (the analyzer already preserves them as ghost nodes).
63+
# The aggregated :PY_CALLS twin. Endpoints listed in app.external_symbols become
64+
# :PyExternal ghost nodes; the rest are declared :PySymbol nodes already emitted.
65+
externals = app.external_symbols or {}
6566
for e in app.call_graph:
66-
src = _call_endpoint(b, e.source)
67-
tgt = _call_endpoint(b, e.target)
67+
src = _call_endpoint(b, e.source, externals)
68+
tgt = _call_endpoint(b, e.target, externals)
6869
b.edge("PY_CALLS", src, tgt, _call_edge_props(e.weight, list(e.provenance or [])))
6970

7071
return b.finish()
@@ -74,13 +75,20 @@ def _sym(signature: str) -> NodeRef:
7475
return NodeRef("PySymbol", "signature", signature)
7576

7677

77-
def _call_endpoint(b: RowBuilder, signature: str) -> NodeRef:
78-
"""A call-graph endpoint: a known callable already emitted, or a phantom
79-
:PyExternal symbol materialized on demand for a ghost target."""
80-
if b.has_key(signature):
78+
def _call_endpoint(b: RowBuilder, signature: str, externals: dict) -> NodeRef:
79+
"""A call-graph endpoint: a declared callable already emitted, or an external
80+
symbol (imported library / builtin member) materialized as a :PyExternal ghost.
81+
82+
Classification is authoritative -- it comes from ``app.external_symbols``, not a
83+
"present in the graph" heuristic -- so an imported module name (which exists only
84+
as a :PyPackage) can never shadow the call target. A small fallback still
85+
materializes an external for any endpoint that is neither declared nor listed."""
86+
ext = externals.get(signature)
87+
if ext is None and b.has_key("PySymbol", signature):
8188
return _sym(signature)
82-
name = signature.rsplit(".", 1)[-1] if "." in signature else signature
83-
return b.node(["PySymbol", "PyExternal"], "signature", signature, {"name": name})
89+
name = ext.name if ext is not None else (signature.rsplit(".", 1)[-1] if "." in signature else signature)
90+
module = ext.module if ext is not None else None
91+
return b.node(["PySymbol", "PyExternal"], "signature", signature, prune({"name": name, "module": module}))
8492

8593

8694
# ----------------------------------------------------------------------------------------------

codeanalyzer/neo4j/rows.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,11 @@ def __init__(self) -> None:
8383
self._nodes: Dict[str, NodeRow] = {} # key: f"{labels[0]} {value}"
8484
self._edges: List[EdgeRow] = []
8585
self._deferred: List[EdgeRow] = [] # edges gated against node existence at finish()
86-
self._keys: set = set() # every node value seen, for resolved-gating
86+
# (merge_label, value) of every node seen, for resolved-gating. Keyed by
87+
# label too so a :PyPackage name can't shadow a :PySymbol signature (and
88+
# vice versa) — otherwise a call to an imported module name like ``os``
89+
# resolves to a :PySymbol node that was never created and the edge is lost.
90+
self._keys: set = set()
8791

8892
def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> NodeRef:
8993
"""Upsert a node. Re-seeing the same ``(labels[0], value)`` merges props
@@ -98,7 +102,7 @@ def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> No
98102
existing.labels.append(label)
99103
else:
100104
self._nodes[node_id] = NodeRow(list(labels), key_prop, value, dict(props))
101-
self._keys.add(value)
105+
self._keys.add((labels[0], value))
102106
return NodeRef(labels[0], key_prop, value)
103107

104108
def edge(self, type_: str, from_ref: NodeRef, to_ref: NodeRef, props: Optional[Props] = None) -> None:
@@ -121,12 +125,13 @@ def edge_to_symbol(
121125
)
122126
)
123127

124-
def has_key(self, value: str) -> bool:
125-
return value in self._keys
128+
def has_key(self, label: str, value: str) -> bool:
    """Report whether a node with identity ``(label, value)`` has been emitted.

    Lookup is by the pair, not by ``value`` alone, so the same value recorded
    under a different label does not count as a match.
    """
    identity = (label, value)
    return identity in self._keys
126131

127132
def finish(self) -> GraphRows:
128133
for e in self._deferred:
129-
if e.to_ref.value in self._keys:
134+
if (e.to_ref.label, e.to_ref.value) in self._keys:
130135
self._edges.append(e)
131136
nodes = sorted(self._nodes.values(), key=lambda n: f"{n.labels[0]} {n.value}")
132137
edges = sorted(self._edges, key=lambda e: f"{e.type} {e.from_ref.value} {e.to_ref.value}")

codeanalyzer/options/options.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class AnalysisOptions:
3838
using_ray: bool = False
3939
rebuild_analysis: bool = False
4040
skip_tests: bool = True
41+
no_venv: bool = False
4142
file_name: Optional[Path] = None
4243
cache_dir: Optional[Path] = None
4344
clear_cache: bool = False

codeanalyzer/schema/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
PyClass,
99
PyClassAttribute,
1010
PyComment,
11+
PyExternalSymbol,
1112
PyImport,
1213
PyModule,
1314
PyVariableDeclaration,
1415
)
1516

1617
__all__ = [
1718
"PyApplication",
19+
"PyExternalSymbol",
1820
"PyImport",
1921
"PyComment",
2022
"PyModule",

0 commit comments

Comments
 (0)