From 42dff5ecc093b7828808ad5c33cdbc20acdec51f Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 9 Apr 2026 04:52:41 +0000 Subject: [PATCH 01/10] feat: add `sync_infiniops.py` script for syncing operator wrappers from InfiniOps --- scripts/sync_infiniops.py | 252 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 scripts/sync_infiniops.py diff --git a/scripts/sync_infiniops.py b/scripts/sync_infiniops.py new file mode 100644 index 000000000..f54dd8443 --- /dev/null +++ b/scripts/sync_infiniops.py @@ -0,0 +1,252 @@ +"""Sync operator implementations from InfiniOps into InfiniCore. + +This script invokes InfiniOps's code generator to produce legacy C API wrappers, +then copies the generated headers and source files into InfiniCore's tree, +replacing the hand-written operator dispatch files. + +Usage: + python scripts/sync_infiniops.py /path/to/InfiniOps [--devices cpu nvidia ...] + python scripts/sync_infiniops.py /path/to/InfiniOps --ops gemm add + python scripts/sync_infiniops.py /path/to/InfiniOps --dry-run +""" + +import argparse +import difflib +import pathlib +import shutil +import subprocess +import sys + +INFINICORE_ROOT = pathlib.Path(__file__).resolve().parent.parent +INFINICORE_INCLUDE_OPS = INFINICORE_ROOT / "include" / "infiniop" / "ops" +INFINICORE_SRC_OPS = INFINICORE_ROOT / "src" / "infiniop" / "ops" + + +def run_generator(infiniops_root, devices): + """Run InfiniOps's `generate_wrappers.py` and return the generated directory.""" + generator = infiniops_root / "scripts" / "generate_wrappers.py" + + if not generator.exists(): + print(f"Error: generator not found at {generator}", file=sys.stderr) + sys.exit(1) + + cmd = [sys.executable, str(generator), "--devices"] + devices + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, cwd=infiniops_root, capture_output=True, text=True) + + if result.returncode != 0: + print(f"Error: generator failed:\n{result.stderr}", file=sys.stderr) + sys.exit(1) + + generated_dir = infiniops_root / "generated" + + if not generated_dir.exists(): + print(f"Error: expected output directory {generated_dir} not found", file=sys.stderr) + sys.exit(1) + + return generated_dir + + +def discover_generated_ops(generated_dir): + """Return a sorted list of operator names that were generated.""" + include_dir = generated_dir / "include" + + return sorted(header.stem for header in include_dir.glob("*.h")) + + +def show_diff(old_path, new_content, label): + """Show a unified diff between an existing file and new content.""" + if old_path.exists(): + old_lines = old_path.read_text().splitlines(keepends=True) + else: + old_lines = [] + + new_lines = new_content.splitlines(keepends=True) + diff = difflib.unified_diff( + old_lines, new_lines, fromfile=f"a/{label}", tofile=f"b/{label}" + ) + diff_str = "".join(diff) + + if diff_str: + print(diff_str) + + return bool(diff_str) + + +def sync_operator(op_name, generated_dir, dry_run=False, verbose=False): + """Copy generated files for one operator into InfiniCore.""" + gen_header = generated_dir / "include" / f"{op_name}.h" + gen_source = generated_dir / "src" / op_name / "operator.cc" + dst_header = INFINICORE_INCLUDE_OPS / f"{op_name}.h" + dst_source = INFINICORE_SRC_OPS / op_name / "operator.cc" + + if not gen_header.exists(): + print(f" Warning: generated header not found: {gen_header}", file=sys.stderr) + + return False + + if not gen_source.exists(): + print(f" Warning: generated source not found: {gen_source}", file=sys.stderr) + + return False + + new_header = gen_header.read_text() + new_source = gen_source.read_text() + header_changed = False + source_changed = False + + if verbose or dry_run: + header_label = f"include/infiniop/ops/{op_name}.h" + source_label = f"src/infiniop/ops/{op_name}/operator.cc" + header_changed = show_diff(dst_header, new_header, header_label) + source_changed = show_diff(dst_source, new_source, source_label) + + if dry_run: + if not header_changed and not source_changed: + print(f" {op_name}: no changes") + + return header_changed or source_changed + + # Ensure destination directories exist. + dst_header.parent.mkdir(parents=True, exist_ok=True) + dst_source.parent.mkdir(parents=True, exist_ok=True) + + shutil.copy2(gen_header, dst_header) + shutil.copy2(gen_source, dst_source) + print(f" {op_name}: synced") + + return True + + +def verify_compilation(op_name, infiniops_root): + """Syntax-check the replaced `operator.cc` compiles with the right include paths.""" + source = INFINICORE_SRC_OPS / op_name / "operator.cc" + cmd = [ + "g++", "-std=c++17", "-fsyntax-only", + f"-I{INFINICORE_ROOT / 'include'}", + f"-I{infiniops_root / 'src'}", + str(source), + ] + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + print(f" {op_name}: COMPILE FAILED", file=sys.stderr) + + if result.stderr: + # Show the first few lines of the error. + for line in result.stderr.splitlines()[:10]: + print(f" {line}", file=sys.stderr) + + return False + + print(f" {op_name}: compile OK") + + return True + + +def main(): + parser = argparse.ArgumentParser( + description="Sync InfiniOps operator wrappers into InfiniCore.", + ) + parser.add_argument( + "infiniops_path", + type=pathlib.Path, + help="Path to the InfiniOps project root.", + ) + parser.add_argument( + "--devices", + nargs="+", + default=["cpu"], + help="Devices to generate for (default: cpu).", + ) + parser.add_argument( + "--ops", + nargs="+", + default=None, + help="Only sync specific operators (default: all generated).", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show diffs without modifying files.", + ) + parser.add_argument( + "--verify", + action="store_true", + help="Syntax-check each replaced file after syncing.", + ) + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Show diffs even when not in dry-run mode.", + ) + args = parser.parse_args() + + infiniops_root = args.infiniops_path.resolve() + + if not (infiniops_root / "scripts" / "generate_wrappers.py").exists(): + print( + f"Error: {infiniops_root} does not look like an InfiniOps project.", + file=sys.stderr, + ) + sys.exit(1) + + # Step 1: Run InfiniOps generator. + print("=== Generating wrappers ===") + generated_dir = run_generator(infiniops_root, args.devices) + + # Step 2: Discover what was generated. + all_ops = discover_generated_ops(generated_dir) + ops_to_sync = args.ops if args.ops else all_ops + + skipped = [op for op in ops_to_sync if op not in all_ops] + + if skipped: + print( + f"Warning: requested ops not found in generated output: {skipped}", + file=sys.stderr, + ) + ops_to_sync = [op for op in ops_to_sync if op in all_ops] + + if not ops_to_sync: + print("Nothing to sync.") + + return + + # Step 3: Sync files. + action = "Previewing" if args.dry_run else "Syncing" + print(f"\n=== {action} {len(ops_to_sync)} operator(s): {', '.join(ops_to_sync)} ===") + + synced = [] + + for op_name in ops_to_sync: + changed = sync_operator( + op_name, generated_dir, dry_run=args.dry_run, verbose=args.verbose, + ) + + if changed and not args.dry_run: + synced.append(op_name) + + if args.dry_run: + return + + # Step 4: Verify compilation. + if args.verify and synced: + print("\n=== Verifying compilation ===") + failures = [] + + for op_name in synced: + if not verify_compilation(op_name, infiniops_root): + failures.append(op_name) + + if failures: + print( + f"\nCompilation failed for: {', '.join(failures)}", file=sys.stderr, + ) + sys.exit(1) + + print(f"\nDone. Synced {len(synced)} operator(s).") + + +if __name__ == "__main__": + main() From b2ed43b194ef6f2aa00b1dd505a9e9712ab4cf59 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 9 Apr 2026 08:44:01 +0000 Subject: [PATCH 02/10] refactor: output synced operator files as \`.cu\` in \`sync_infiniops.py\` InfiniOps NVIDIA device implementations include \`.cuh\` headers with CUDA syntax, so the generated operator files must be compiled with \`nvcc\`. Change the sync script to output \`.cu\` files and remove old \`.cc\` files to prevent duplicate definitions. --- scripts/sync_infiniops.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/scripts/sync_infiniops.py b/scripts/sync_infiniops.py index f54dd8443..0c77f8190 100644 --- a/scripts/sync_infiniops.py +++ b/scripts/sync_infiniops.py @@ -78,7 +78,11 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False): gen_header = generated_dir / "include" / f"{op_name}.h" gen_source = generated_dir / "src" / op_name / "operator.cc" dst_header = INFINICORE_INCLUDE_OPS / f"{op_name}.h" - dst_source = INFINICORE_SRC_OPS / op_name / "operator.cc" + # Use .cu extension so NVIDIA builds compile with nvcc (InfiniOps NVIDIA + # headers include .cuh files that require CUDA compilation). + dst_source = INFINICORE_SRC_OPS / op_name / "operator.cu" + # Remove old .cc to prevent duplicate definitions. + old_cc = INFINICORE_SRC_OPS / op_name / "operator.cc" if not gen_header.exists(): print(f" Warning: generated header not found: {gen_header}", file=sys.stderr) @@ -97,7 +101,7 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False): if verbose or dry_run: header_label = f"include/infiniop/ops/{op_name}.h" - source_label = f"src/infiniop/ops/{op_name}/operator.cc" + source_label = f"src/infiniop/ops/{op_name}/operator.cu" header_changed = show_diff(dst_header, new_header, header_label) source_changed = show_diff(dst_source, new_source, source_label) @@ -113,14 +117,18 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False): shutil.copy2(gen_header, dst_header) shutil.copy2(gen_source, dst_source) + + if old_cc.exists(): + old_cc.unlink() + print(f" {op_name}: synced") return True def verify_compilation(op_name, infiniops_root): - """Syntax-check the replaced `operator.cc` compiles with the right include paths.""" - source = INFINICORE_SRC_OPS / op_name / "operator.cc" + """Syntax-check the replaced `operator.cu` compiles with the right include paths.""" + source = INFINICORE_SRC_OPS / op_name / "operator.cu" cmd = [ "g++", "-std=c++17", "-fsyntax-only", f"-I{INFINICORE_ROOT / 'include'}", From 4d56caa392f99441d381b2fa8499065c35198435 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 9 Apr 2026 08:44:55 +0000 Subject: [PATCH 03/10] build: add InfiniOps integration to xmake build system - Add \`infiniops\` option for specifying the InfiniOps project root - Add InfiniOps include path and source files to the \`infiniop\` target - Compile \`.cu\` operator files with \`nvcc\` on NVIDIA builds, or as plain C++ on non-NVIDIA builds (CUDA includes are \`#ifdef\`-guarded) - Suppress \`-Wunused-but-set-variable\` for NVIDIA target --- xmake.lua | 16 ++++++++++++++++ xmake/nvidia.lua | 8 +++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/xmake.lua b/xmake.lua index 8f32bf7cc..fb11ada86 100644 --- a/xmake.lua +++ b/xmake.lua @@ -269,6 +269,13 @@ if has_config("ccl") then add_defines("ENABLE_CCL") end +-- InfiniOps +option("infiniops") + set_default("") + set_showmenu(true) + set_description("Path to InfiniOps project root. If set, operator dispatch files are generated from InfiniOps.") +option_end() + target("infini-utils") set_kind("static") on_install(function (target) end) @@ -383,8 +390,17 @@ target("infiniop") add_deps("infiniop-hygon") end set_languages("cxx17") + if get_config("infiniops") and get_config("infiniops") ~= "" then + add_includedirs(get_config("infiniops") .. "/src") + add_files(get_config("infiniops") .. "/src/*.cc") + end add_files("src/infiniop/devices/handle.cc") add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc") + if not has_config("nv-gpu") then + -- On non-NVIDIA builds, compile InfiniOps-synced .cu files as plain C++ + -- (CUDA includes are guarded by #ifdef ENABLE_NVIDIA_API). + add_files("src/infiniop/ops/*/operator.cu", {force = {languages = "cxx17"}}) + end add_files("src/infiniop/*.cc") set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index 602fb190d..c3ef33388 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -61,7 +61,7 @@ target("infiniop-nvidia") end end - add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations", "-Xcompiler=-Wno-error=unused-function") + add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations", "-Xcompiler=-Wno-error=unused-function", "-Xcompiler=-Wno-error=unused-but-set-variable") local arch_opt = get_config("cuda_arch") if arch_opt and type(arch_opt) == "string" then @@ -75,7 +75,13 @@ target("infiniop-nvidia") end set_languages("cxx17") + if get_config("infiniops") and get_config("infiniops") ~= "" then + add_includedirs(get_config("infiniops") .. "/src") + end add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../src/infiniop/ops/*/*/nvidia/*.cu") + -- InfiniOps-synced operator files use .cu extension because their NVIDIA + -- includes contain CUDA syntax (.cuh headers). + add_files("../src/infiniop/ops/*/operator.cu") if has_config("ninetoothed") then add_files("../build/ninetoothed/*.c", "../build/ninetoothed/*.cpp") From f4cad2109d78153cb68a82acc3347ee262cd0393 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Fri, 10 Apr 2026 02:25:04 +0000 Subject: [PATCH 04/10] fix: resolve xmake build integration for InfiniOps operator sync Move the InfiniOps sync from `before_build` to `on_load` in the `infiniop-nvidia` target so that generated `.cu` files exist before xmake resolves file lists. The sync now stubs the original `operator.cc` with a comment instead of deleting it, preventing duplicate symbols while keeping the glob pattern `src/infiniop/ops/*/operator.cc` valid for non-synced operators. --- scripts/sync_infiniops.py | 14 +++++++++----- xmake.lua | 11 ++++++++++- xmake/nvidia.lua | 23 +++++++++++++++++------ 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/scripts/sync_infiniops.py b/scripts/sync_infiniops.py index 0c77f8190..1e29c2622 100644 --- a/scripts/sync_infiniops.py +++ b/scripts/sync_infiniops.py @@ -78,11 +78,9 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False): gen_header = generated_dir / "include" / f"{op_name}.h" gen_source = generated_dir / "src" / op_name / "operator.cc" dst_header = INFINICORE_INCLUDE_OPS / f"{op_name}.h" - # Use .cu extension so NVIDIA builds compile with nvcc (InfiniOps NVIDIA - # headers include .cuh files that require CUDA compilation). + # Use .cu extension so nvcc compiles these files (InfiniOps NVIDIA + # headers include .cuh files with CUDA syntax). dst_source = INFINICORE_SRC_OPS / op_name / "operator.cu" - # Remove old .cc to prevent duplicate definitions. - old_cc = INFINICORE_SRC_OPS / op_name / "operator.cc" if not gen_header.exists(): print(f" Warning: generated header not found: {gen_header}", file=sys.stderr) @@ -118,8 +116,14 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False): shutil.copy2(gen_header, dst_header) shutil.copy2(gen_source, dst_source) + # Replace the original operator.cc with an empty stub to prevent + # duplicate symbol definitions (the .cu file provides all symbols). + old_cc = INFINICORE_SRC_OPS / op_name / "operator.cc" + if old_cc.exists(): - old_cc.unlink() + old_cc.write_text( + "// This operator is provided by InfiniOps (see operator.cu).\n" + ) print(f" {op_name}: synced") diff --git a/xmake.lua b/xmake.lua index fb11ada86..77087dd69 100644 --- a/xmake.lua +++ b/xmake.lua @@ -398,11 +398,20 @@ target("infiniop") add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc") if not has_config("nv-gpu") then -- On non-NVIDIA builds, compile InfiniOps-synced .cu files as plain C++ - -- (CUDA includes are guarded by #ifdef ENABLE_NVIDIA_API). + -- (NVIDIA includes are guarded by #ifdef ENABLE_NVIDIA_API). add_files("src/infiniop/ops/*/operator.cu", {force = {languages = "cxx17"}}) end add_files("src/infiniop/*.cc") + before_build(function (target) + import("core.project.config") + local infiniops_path = config.get("infiniops") + if infiniops_path and infiniops_path ~= "" and not has_config("nv-gpu") then + -- CPU-only build: run sync here (NVIDIA builds sync via infiniop-nvidia). + os.execv("python", {"scripts/sync_infiniops.py", infiniops_path, "--devices", "cpu"}) + end + end) + set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) add_installfiles("include/infiniop/(**/*.h)", {prefixdir = "include/infiniop"}) add_installfiles("include/infiniop/*.h", {prefixdir = "include/infiniop"}) diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index c3ef33388..b3f3f93dd 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -38,6 +38,23 @@ target("infiniop-nvidia") target:add("linkdirs", path.directory(path.directory(nvcc_path)) .. "/lib64/stubs") target:add("links", "cuda") end + + -- InfiniOps integration: run sync to generate operator .cu files, + -- then add them for compilation. + import("core.project.config") + local infiniops_path = config.get("infiniops") + if infiniops_path and infiniops_path ~= "" then + target:add("includedirs", infiniops_path .. "/src") + local cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/*/operator.cu")) + if #cu_files == 0 then + local devices = {"cpu", "nvidia"} + os.execv("python", {"scripts/sync_infiniops.py", infiniops_path, "--devices", table.unpack(devices)}) + cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/*/operator.cu")) + end + for _, f in ipairs(cu_files) do + target:add("files", f) + end + end end) if is_plat("windows") then @@ -75,13 +92,7 @@ target("infiniop-nvidia") end set_languages("cxx17") - if get_config("infiniops") and get_config("infiniops") ~= "" then - add_includedirs(get_config("infiniops") .. "/src") - end add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../src/infiniop/ops/*/*/nvidia/*.cu") - -- InfiniOps-synced operator files use .cu extension because their NVIDIA - -- includes contain CUDA syntax (.cuh headers). - add_files("../src/infiniop/ops/*/operator.cu") if has_config("ninetoothed") then add_files("../build/ninetoothed/*.c", "../build/ninetoothed/*.cpp") From ded42da40c6752eff893ddc9f0f06ba96d604dbb Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Fri, 10 Apr 2026 09:03:30 +0000 Subject: [PATCH 05/10] fix: add InfiniOps generated include path and shared library linking --- xmake.lua | 5 ++++- xmake/nvidia.lua | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/xmake.lua b/xmake.lua index 77087dd69..58819f5e4 100644 --- a/xmake.lua +++ b/xmake.lua @@ -392,7 +392,10 @@ target("infiniop") set_languages("cxx17") if get_config("infiniops") and get_config("infiniops") ~= "" then add_includedirs(get_config("infiniops") .. "/src") - add_files(get_config("infiniops") .. "/src/*.cc") + add_includedirs(get_config("infiniops") .. "/generated/include") + add_linkdirs(get_config("infiniops") .. "/build/src") + add_links("infiniops") + add_rpathdirs(get_config("infiniops") .. "/build/src") end add_files("src/infiniop/devices/handle.cc") add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc") diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index b3f3f93dd..38ec53c4f 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -45,6 +45,7 @@ target("infiniop-nvidia") local infiniops_path = config.get("infiniops") if infiniops_path and infiniops_path ~= "" then target:add("includedirs", infiniops_path .. "/src") + target:add("includedirs", infiniops_path .. "/generated/include") local cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/*/operator.cu")) if #cu_files == 0 then local devices = {"cpu", "nvidia"} From 75efed4acb155be8672b5a379ba03e25ad13700c Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Fri, 10 Apr 2026 09:28:44 +0000 Subject: [PATCH 06/10] style: use Markdown backtick-fencing for identifiers in comments --- scripts/sync_infiniops.py | 10 +++++----- xmake.lua | 6 +++--- xmake/nvidia.lua | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/sync_infiniops.py b/scripts/sync_infiniops.py index 1e29c2622..af228e117 100644 --- a/scripts/sync_infiniops.py +++ b/scripts/sync_infiniops.py @@ -78,8 +78,8 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False): gen_header = generated_dir / "include" / f"{op_name}.h" gen_source = generated_dir / "src" / op_name / "operator.cc" dst_header = INFINICORE_INCLUDE_OPS / f"{op_name}.h" - # Use .cu extension so nvcc compiles these files (InfiniOps NVIDIA - # headers include .cuh files with CUDA syntax). + # Use `.cu` extension so `nvcc` compiles these files (InfiniOps NVIDIA + # headers include `.cuh` files with CUDA syntax). dst_source = INFINICORE_SRC_OPS / op_name / "operator.cu" if not gen_header.exists(): @@ -116,13 +116,13 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False): shutil.copy2(gen_header, dst_header) shutil.copy2(gen_source, dst_source) - # Replace the original operator.cc with an empty stub to prevent - # duplicate symbol definitions (the .cu file provides all symbols). + # Replace the original `operator.cc` with an empty stub to prevent + # duplicate symbol definitions (the `.cu` file provides all symbols). old_cc = INFINICORE_SRC_OPS / op_name / "operator.cc" if old_cc.exists(): old_cc.write_text( - "// This operator is provided by InfiniOps (see operator.cu).\n" + "// This operator is provided by InfiniOps (see `operator.cu`).\n" ) print(f" {op_name}: synced") diff --git a/xmake.lua b/xmake.lua index 58819f5e4..0f7f84307 100644 --- a/xmake.lua +++ b/xmake.lua @@ -400,8 +400,8 @@ target("infiniop") add_files("src/infiniop/devices/handle.cc") add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc") if not has_config("nv-gpu") then - -- On non-NVIDIA builds, compile InfiniOps-synced .cu files as plain C++ - -- (NVIDIA includes are guarded by #ifdef ENABLE_NVIDIA_API). + -- On non-NVIDIA builds, compile InfiniOps-synced `.cu` files as plain C++ + -- (NVIDIA includes are guarded by `#ifdef ENABLE_NVIDIA_API`). add_files("src/infiniop/ops/*/operator.cu", {force = {languages = "cxx17"}}) end add_files("src/infiniop/*.cc") @@ -410,7 +410,7 @@ target("infiniop") import("core.project.config") local infiniops_path = config.get("infiniops") if infiniops_path and infiniops_path ~= "" and not has_config("nv-gpu") then - -- CPU-only build: run sync here (NVIDIA builds sync via infiniop-nvidia). + -- CPU-only build: run sync here (NVIDIA builds sync via `infiniop-nvidia`). os.execv("python", {"scripts/sync_infiniops.py", infiniops_path, "--devices", "cpu"}) end end) diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index 38ec53c4f..3f7d62d04 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -39,7 +39,7 @@ target("infiniop-nvidia") target:add("links", "cuda") end - -- InfiniOps integration: run sync to generate operator .cu files, + -- InfiniOps integration: run sync to generate operator `.cu` files, -- then add them for compilation. import("core.project.config") local infiniops_path = config.get("infiniops") From 5e9e0a2a535adf359f1bc06da0c094ce5376c0d5 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Fri, 10 Apr 2026 09:32:10 +0000 Subject: [PATCH 07/10] style: apply `ruff format` to `sync_infiniops.py` --- scripts/sync_infiniops.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/sync_infiniops.py b/scripts/sync_infiniops.py index af228e117..23c632681 100644 --- a/scripts/sync_infiniops.py +++ b/scripts/sync_infiniops.py @@ -41,7 +41,10 @@ def run_generator(infiniops_root, devices): generated_dir = infiniops_root / "generated" if not generated_dir.exists(): - print(f"Error: expected output directory {generated_dir} not found", file=sys.stderr) + print( + f"Error: expected output directory {generated_dir} not found", + file=sys.stderr, + ) sys.exit(1) return generated_dir @@ -134,7 +137,9 @@ def verify_compilation(op_name, infiniops_root): """Syntax-check the replaced `operator.cu` compiles with the right include paths.""" source = INFINICORE_SRC_OPS / op_name / "operator.cu" cmd = [ - "g++", "-std=c++17", "-fsyntax-only", + "g++", + "-std=c++17", + "-fsyntax-only", f"-I{INFINICORE_ROOT / 'include'}", f"-I{infiniops_root / 'src'}", str(source), @@ -188,7 +193,8 @@ def main(): help="Syntax-check each replaced file after syncing.", ) parser.add_argument( - "--verbose", "-v", + "--verbose", + "-v", action="store_true", help="Show diffs even when not in dry-run mode.", ) @@ -227,13 +233,18 @@ def main(): # Step 3: Sync files. action = "Previewing" if args.dry_run else "Syncing" - print(f"\n=== {action} {len(ops_to_sync)} operator(s): {', '.join(ops_to_sync)} ===") + print( + f"\n=== {action} {len(ops_to_sync)} operator(s): {', '.join(ops_to_sync)} ===" + ) synced = [] for op_name in ops_to_sync: changed = sync_operator( - op_name, generated_dir, dry_run=args.dry_run, verbose=args.verbose, + op_name, + generated_dir, + dry_run=args.dry_run, + verbose=args.verbose, ) if changed and not args.dry_run: @@ -253,7 +264,8 @@ def main(): if failures: print( - f"\nCompilation failed for: {', '.join(failures)}", file=sys.stderr, + f"\nCompilation failed for: {', '.join(failures)}", + file=sys.stderr, ) sys.exit(1) From c3e0c690cee16c56382c4a13fee2b098b1b12661 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Wed, 15 Apr 2026 08:07:58 +0000 Subject: [PATCH 08/10] fix: use `add_shflags` to force-link `libinfiniops.so` on shared library targets The `infiniop` target is `set_kind("shared")`, so xmake ignores `add_ldflags` during linking. Switch to `add_shflags` with `--no-as-needed` so the GNU linker keeps `libinfiniops.so` in the `NEEDED` list even when no direct symbol references exist in `infiniop`'s own object files. --- xmake.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmake.lua b/xmake.lua index 0f7f84307..4b9a6e3eb 100644 --- a/xmake.lua +++ b/xmake.lua @@ -394,7 +394,7 @@ target("infiniop") add_includedirs(get_config("infiniops") .. "/src") add_includedirs(get_config("infiniops") .. "/generated/include") add_linkdirs(get_config("infiniops") .. "/build/src") - add_links("infiniops") + add_shflags("-Wl,--no-as-needed,-linfiniops,--as-needed", {force = true}) add_rpathdirs(get_config("infiniops") .. "/build/src") end add_files("src/infiniop/devices/handle.cc") From 638dfcbd320690f1ff12658917b766dca17322d4 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 16 Apr 2026 07:31:52 +0000 Subject: [PATCH 09/10] fix: make NVIDIA CUDA targets shared to prevent segfault Change `infiniop-nvidia`, `infinirt-nvidia`, `infiniccl-nvidia` from static to shared libraries so that `nvcc` performs proper CUDA device linking within each `.so`. When these were static archives, `g++` linked them into downstream shared libraries without device linking, corrupting `.nv_fatbin` registration and causing segfaults in `__cudaRegisterLinkedBinary` during `dlopen`. Also replace no-op `on_install` with proper `set_installdir` for all four NVIDIA targets (including `flash-attn-nvidia`). --- xmake/nvidia.lua | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index 3f7d62d04..06ef1cb72 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -14,9 +14,9 @@ local FLASH_ATTN_ROOT = get_config("flash-attn") local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini") target("infiniop-nvidia") - set_kind("static") + set_kind("shared") add_deps("infini-utils") - on_install(function (target) end) + set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) set_policy("build.cuda.devlink", true) set_toolchains("cuda") @@ -101,9 +101,9 @@ target("infiniop-nvidia") target_end() target("infinirt-nvidia") - set_kind("static") + set_kind("shared") add_deps("infini-utils") - on_install(function (target) end) + set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) set_policy("build.cuda.devlink", true) set_toolchains("cuda") @@ -124,9 +124,9 @@ target("infinirt-nvidia") target_end() target("infiniccl-nvidia") - set_kind("static") + set_kind("shared") add_deps("infinirt") - on_install(function (target) end) + set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) if has_config("ccl") then set_policy("build.cuda.devlink", true) set_toolchains("cuda") @@ -202,6 +202,6 @@ target("flash-attn-nvidia") on_build(function (target) end) end - on_install(function (target) end) + set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) target_end() From 989fd3e5f583877c15242f98df36c4cfd38f369d Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 4 Jun 2026 02:59:57 +0000 Subject: [PATCH 10/10] feat: route infiniop gemm through InfiniOps --- .gitmodules | 3 + src/infiniop/ops/gemm/operator.cc | 251 +----------------------------- src/infiniop/ops/gemm/operator.cu | 192 +++++++++++++++++++++++ submodules/InfiniOps | 1 + xmake.lua | 50 ++++-- xmake/nvidia.lua | 28 +++- 6 files changed, 264 insertions(+), 261 deletions(-) create mode 100644 src/infiniop/ops/gemm/operator.cu create mode 160000 submodules/InfiniOps diff --git a/.gitmodules b/.gitmodules index bca919479..535074641 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,3 +5,6 @@ path = third_party/nlohmann_json url = https://github.com/nlohmann/json.git branch = master +[submodule "submodules/InfiniOps"] + path = submodules/InfiniOps + url = git@github.com:InfiniTensor/InfiniOps.git diff --git a/src/infiniop/ops/gemm/operator.cc b/src/infiniop/ops/gemm/operator.cc index ac4e01e83..1d03dca67 100644 --- a/src/infiniop/ops/gemm/operator.cc +++ b/src/infiniop/ops/gemm/operator.cc @@ -1,250 +1 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/gemm.h" - -#ifdef ENABLE_CPU_API -#include "cpu/gemm_cpu.h" -#endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API) -#include "nvidia/gemm_nvidia.cuh" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/gemm_bang.h" -#endif -#ifdef ENABLE_ASCEND_API -#include "ascend/gemm_ascend.h" -#endif -#ifdef ENABLE_METAX_API -#include "metax/gemm_metax.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/gemm_moore.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/gemm_kunlun.h" -#endif - -__INFINI_C infiniStatus_t infiniopCreateGemmDescriptor( - infiniopHandle_t handle, - infiniopGemmDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::gemm::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - a_desc, \ - b_desc) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_ALI_API - CREATE(INFINI_DEVICE_ALI, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_HYGON_API - CREATE(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - CREATE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__INFINI_C infiniStatus_t -infiniopGetGemmWorkspaceSize( - infiniopGemmDescriptor_t desc, - size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_ALI_API - GET(INFINI_DEVICE_ALI, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_HYGON_API - GET(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - GET(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef GET -} - -__INFINI_C infiniStatus_t infiniopGemm( - infiniopGemmDescriptor_t desc, - void *workspace, size_t workspace_size, - void *c, - const void *a, - const void *b, - float alpha, - float beta, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, \ - c, beta, \ - a, b, alpha, \ - stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_ALI_API - CALCULATE(INFINI_DEVICE_ALI, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_HYGON_API - CALCULATE(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - CALCULATE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__INFINI_C infiniStatus_t -infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_ALI_API - DELETE(INFINI_DEVICE_ALI, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_HYGON_API - DELETE(INFINI_DEVICE_HYGON, nvidia); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - DELETE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +// This operator is provided by InfiniOps (see `operator.cu`). diff --git a/src/infiniop/ops/gemm/operator.cu b/src/infiniop/ops/gemm/operator.cu new file mode 100644 index 000000000..774da314f --- /dev/null +++ b/src/infiniop/ops/gemm/operator.cu @@ -0,0 +1,192 @@ +#include "../../handle.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infiniop/ops/gemm.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace { + +infini::ops::DataType dataTypeFromInfiniDtype(infiniDtype_t dtype) { + switch (dtype) { + case INFINI_DTYPE_I8: + return infini::ops::DataType::kInt8; + case INFINI_DTYPE_I16: + return infini::ops::DataType::kInt16; + case INFINI_DTYPE_I32: + return infini::ops::DataType::kInt32; + case INFINI_DTYPE_I64: + return infini::ops::DataType::kInt64; + case INFINI_DTYPE_U8: + return infini::ops::DataType::kUInt8; + case INFINI_DTYPE_U16: + return infini::ops::DataType::kUInt16; + case INFINI_DTYPE_U32: + return infini::ops::DataType::kUInt32; + case INFINI_DTYPE_U64: + return infini::ops::DataType::kUInt64; + case INFINI_DTYPE_F16: + return infini::ops::DataType::kFloat16; + case INFINI_DTYPE_BF16: + return infini::ops::DataType::kBFloat16; + case INFINI_DTYPE_F32: + return infini::ops::DataType::kFloat32; + case INFINI_DTYPE_F64: + return infini::ops::DataType::kFloat64; + default: + return infini::ops::DataType::kFloat32; + } +} + +infini::ops::Device::Type deviceTypeFromInfiniDevice(infiniDevice_t device) { + switch (device) { + case INFINI_DEVICE_CPU: + return infini::ops::Device::Type::kCpu; + case INFINI_DEVICE_NVIDIA: + return infini::ops::Device::Type::kNvidia; + case INFINI_DEVICE_CAMBRICON: + return infini::ops::Device::Type::kCambricon; + case INFINI_DEVICE_ASCEND: + return infini::ops::Device::Type::kAscend; + case INFINI_DEVICE_METAX: + return infini::ops::Device::Type::kMetax; + case INFINI_DEVICE_MOORE: + return infini::ops::Device::Type::kMoore; + case INFINI_DEVICE_ILUVATAR: + return infini::ops::Device::Type::kIluvatar; + case INFINI_DEVICE_KUNLUN: + return infini::ops::Device::Type::kKunlun; + case INFINI_DEVICE_HYGON: + return infini::ops::Device::Type::kHygon; + case INFINI_DEVICE_QY: + return infini::ops::Device::Type::kQy; + default: + return infini::ops::Device::Type::kCpu; + } +} + +struct TensorMeta { + std::vector shape; + std::vector strides; + infini::ops::DataType dtype; +}; + +TensorMeta makeTensorMeta(infiniopTensorDescriptor_t desc) { + return TensorMeta{desc->shape(), desc->strides(), dataTypeFromInfiniDtype(desc->dtype())}; +} + +class InfiniOpsGemmDescriptor final : public InfiniopDescriptor { +public: + InfiniOpsGemmDescriptor(infiniopHandle_t handle, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) + : InfiniopDescriptor{handle->device, handle->device_id}, + device(deviceTypeFromInfiniDevice(handle->device), handle->device_id), + c(makeTensorMeta(c_desc)), + a(makeTensorMeta(a_desc)), + b(makeTensorMeta(b_desc)) {} + + infini::ops::Tensor tensor(const TensorMeta &meta, void *data) const { + return infini::ops::Tensor(data, meta.shape, meta.dtype, device, meta.strides); + } + + infini::ops::Tensor tensor(const TensorMeta &meta, const void *data) const { + return tensor(meta, const_cast(data)); + } + + infini::ops::Device device; + TensorMeta c; + TensorMeta a; + TensorMeta b; +}; + +bool isExplicitTorchGemmDevice(infiniDevice_t device) { + switch (device) { + case INFINI_DEVICE_CPU: + case INFINI_DEVICE_NVIDIA: + case INFINI_DEVICE_CAMBRICON: + case INFINI_DEVICE_ASCEND: + case INFINI_DEVICE_METAX: + case INFINI_DEVICE_MOORE: + case INFINI_DEVICE_ILUVATAR: + case INFINI_DEVICE_KUNLUN: + case INFINI_DEVICE_HYGON: + case INFINI_DEVICE_QY: + return true; + default: + return false; + } +} + +} // namespace + +__INFINI_C infiniStatus_t infiniopCreateGemmDescriptor(infiniopHandle_t handle, + infiniopGemmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + const infiniopTensorDescriptor_t a, + const infiniopTensorDescriptor_t b) { + if (!isExplicitTorchGemmDevice(handle->device)) { + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + *desc_ptr = new InfiniOpsGemmDescriptor(handle, c, a, b); + return INFINI_STATUS_SUCCESS; +} + +__INFINI_C infiniStatus_t infiniopGetGemmWorkspaceSize(infiniopGemmDescriptor_t desc, size_t *size) { + (void)desc; + *size = 0; + return INFINI_STATUS_SUCCESS; +} + +__INFINI_C infiniStatus_t infiniopGemm(infiniopGemmDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + float alpha, + float beta, + void *stream) { + auto *gemm_desc = reinterpret_cast(desc); + if (!isExplicitTorchGemmDevice(gemm_desc->device_type)) { + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + infini::ops::Handle handle; + handle.set_stream(stream); + handle.set_workspace(workspace); + handle.set_workspace_size_in_bytes(workspace_size); + + infini::ops::Config config; + config.set_implementation_index(2); + + infini::ops::Operator::Call( + handle, + config, + gemm_desc->tensor(gemm_desc->a, a), + gemm_desc->tensor(gemm_desc->b, b), + std::optional(alpha), + std::optional(beta), + std::optional{}, + std::optional{}, + gemm_desc->tensor(gemm_desc->c, c)); + + return INFINI_STATUS_SUCCESS; +} + +__INFINI_C infiniStatus_t infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) { + delete reinterpret_cast(desc); + return INFINI_STATUS_SUCCESS; +} diff --git a/submodules/InfiniOps b/submodules/InfiniOps new file mode 160000 index 000000000..9444f9c3d --- /dev/null +++ b/submodules/InfiniOps @@ -0,0 +1 @@ +Subproject commit 9444f9c3d2b98084fc150252531239137ad2519b diff --git a/xmake.lua b/xmake.lua index 4b9a6e3eb..ff18e7674 100644 --- a/xmake.lua +++ b/xmake.lua @@ -1,6 +1,5 @@ add_rules("mode.debug", "mode.release") add_requires("boost", {configs = {stacktrace = true}}) -add_requires("pybind11") -- Define color codes local GREEN = '\27[0;32m' @@ -54,6 +53,18 @@ option_end() if has_config("nv-gpu") then add_defines("ENABLE_NVIDIA_API") + local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH") + if CUDA_ROOT == nil then + if os.isdir("/usr/local/cuda") then + CUDA_ROOT = "/usr/local/cuda" + elseif os.isdir("/usr/local/cuda-13.0") then + CUDA_ROOT = "/usr/local/cuda-13.0" + end + end + if CUDA_ROOT ~= nil then + add_includedirs(CUDA_ROOT .. "/include") + add_includedirs(CUDA_ROOT .. "/targets/x86_64-linux/include") + end includes("xmake/nvidia.lua") end @@ -273,9 +284,21 @@ end option("infiniops") set_default("") set_showmenu(true) - set_description("Path to InfiniOps project root. If set, operator dispatch files are generated from InfiniOps.") + set_description("Path to InfiniOps project root. Defaults to submodules/InfiniOps when present.") option_end() +function get_infiniops_path() + local configured = get_config("infiniops") + if configured and configured ~= "" then + return path.absolute(configured) + end + + local bundled = path.join(os.projectdir(), "submodules", "InfiniOps") + if os.isdir(bundled) then + return bundled + end +end + target("infini-utils") set_kind("static") on_install(function (target) end) @@ -390,26 +413,35 @@ target("infiniop") add_deps("infiniop-hygon") end set_languages("cxx17") - if get_config("infiniops") and get_config("infiniops") ~= "" then - add_includedirs(get_config("infiniops") .. "/src") - add_includedirs(get_config("infiniops") .. "/generated/include") - add_linkdirs(get_config("infiniops") .. "/build/src") + local infiniops_path = get_infiniops_path() + if infiniops_path then + add_includedirs(infiniops_path .. "/src") + add_includedirs(infiniops_path .. "/generated/include") + add_linkdirs(infiniops_path .. "/build/src") add_shflags("-Wl,--no-as-needed,-linfiniops,--as-needed", {force = true}) - add_rpathdirs(get_config("infiniops") .. "/build/src") + add_rpathdirs(infiniops_path .. "/build/src") end add_files("src/infiniop/devices/handle.cc") add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc") if not has_config("nv-gpu") then -- On non-NVIDIA builds, compile InfiniOps-synced `.cu` files as plain C++ -- (NVIDIA includes are guarded by `#ifdef ENABLE_NVIDIA_API`). - add_files("src/infiniop/ops/*/operator.cu", {force = {languages = "cxx17"}}) + add_files("src/infiniop/ops/gemm/operator.cu", {force = {languages = "cxx17"}}) end add_files("src/infiniop/*.cc") before_build(function (target) import("core.project.config") local infiniops_path = config.get("infiniops") - if infiniops_path and infiniops_path ~= "" and not has_config("nv-gpu") then + if infiniops_path and infiniops_path ~= "" then + infiniops_path = path.absolute(infiniops_path) + else + local bundled = path.join(os.projectdir(), "submodules", "InfiniOps") + if os.isdir(bundled) then + infiniops_path = bundled + end + end + if infiniops_path and not has_config("nv-gpu") then -- CPU-only build: run sync here (NVIDIA builds sync via `infiniop-nvidia`). os.execv("python", {"scripts/sync_infiniops.py", infiniops_path, "--devices", "cpu"}) end diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index 06ef1cb72..7381471bf 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -1,3 +1,16 @@ +local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH") +if CUDA_ROOT == nil then + if os.isdir("/usr/local/cuda") then + CUDA_ROOT = "/usr/local/cuda" + elseif os.isdir("/usr/local/cuda-13.0") then + CUDA_ROOT = "/usr/local/cuda-13.0" + end +end +if CUDA_ROOT ~= nil then + add_includedirs(CUDA_ROOT .. "/include") + add_includedirs(CUDA_ROOT .. "/targets/x86_64-linux/include") +end + local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH") if CUDNN_ROOT ~= nil then add_includedirs(CUDNN_ROOT .. "/include") @@ -44,13 +57,24 @@ target("infiniop-nvidia") import("core.project.config") local infiniops_path = config.get("infiniops") if infiniops_path and infiniops_path ~= "" then + infiniops_path = path.absolute(infiniops_path) + else + local bundled = path.join(os.projectdir(), "submodules", "InfiniOps") + if os.isdir(bundled) then + infiniops_path = bundled + end + end + if infiniops_path then target:add("includedirs", infiniops_path .. "/src") target:add("includedirs", infiniops_path .. "/generated/include") - local cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/*/operator.cu")) + target:add("linkdirs", infiniops_path .. "/build/src") + target:add("shflags", "-Wl,--no-as-needed,-linfiniops,--as-needed", {force = true}) + target:add("rpathdirs", infiniops_path .. "/build/src") + local cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/gemm/operator.cu")) if #cu_files == 0 then local devices = {"cpu", "nvidia"} os.execv("python", {"scripts/sync_infiniops.py", infiniops_path, "--devices", table.unpack(devices)}) - cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/*/operator.cu")) + cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/gemm/operator.cu")) end for _, f in ipairs(cu_files) do target:add("files", f)