From 42dff5ecc093b7828808ad5c33cdbc20acdec51f Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Thu, 9 Apr 2026 04:52:41 +0000
Subject: [PATCH 01/10] feat: add `sync_infiniops.py` script for syncing
 operator wrappers from InfiniOps

---
 scripts/sync_infiniops.py | 252 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 252 insertions(+)
 create mode 100644 scripts/sync_infiniops.py

diff --git a/scripts/sync_infiniops.py b/scripts/sync_infiniops.py
new file mode 100644
index 000000000..f54dd8443
--- /dev/null
+++ b/scripts/sync_infiniops.py
@@ -0,0 +1,252 @@
+"""Sync operator implementations from InfiniOps into InfiniCore.
+
+This script invokes InfiniOps's code generator to produce legacy C API wrappers,
+then copies the generated headers and source files into InfiniCore's tree,
+replacing the hand-written operator dispatch files.
+
+Usage:
+    python scripts/sync_infiniops.py /path/to/InfiniOps [--devices cpu nvidia ...]
+    python scripts/sync_infiniops.py /path/to/InfiniOps --ops gemm add
+    python scripts/sync_infiniops.py /path/to/InfiniOps --dry-run
+"""
+
+import argparse
+import difflib
+import pathlib
+import shutil
+import subprocess
+import sys
+
+INFINICORE_ROOT = pathlib.Path(__file__).resolve().parent.parent
+INFINICORE_INCLUDE_OPS = INFINICORE_ROOT / "include" / "infiniop" / "ops"
+INFINICORE_SRC_OPS = INFINICORE_ROOT / "src" / "infiniop" / "ops"
+
+
+def run_generator(infiniops_root, devices):
+    """Run InfiniOps's `generate_wrappers.py` and return the generated directory."""
+    generator = infiniops_root / "scripts" / "generate_wrappers.py"
+
+    if not generator.exists():
+        print(f"Error: generator not found at {generator}", file=sys.stderr)
+        sys.exit(1)
+
+    cmd = [sys.executable, str(generator), "--devices"] + devices
+    print(f"Running: {' '.join(cmd)}")
+    result = subprocess.run(cmd, cwd=infiniops_root, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        print(f"Error: generator failed:\n{result.stderr}", file=sys.stderr)
+        sys.exit(1)
+
+    generated_dir = infiniops_root / "generated"
+
+    if not generated_dir.exists():
+        print(f"Error: expected output directory {generated_dir} not found", file=sys.stderr)
+        sys.exit(1)
+
+    return generated_dir
+
+
+def discover_generated_ops(generated_dir):
+    """Return a sorted list of operator names that were generated."""
+    include_dir = generated_dir / "include"
+
+    return sorted(header.stem for header in include_dir.glob("*.h"))
+
+
+def show_diff(old_path, new_content, label):
+    """Show a unified diff between an existing file and new content."""
+    if old_path.exists():
+        old_lines = old_path.read_text().splitlines(keepends=True)
+    else:
+        old_lines = []
+
+    new_lines = new_content.splitlines(keepends=True)
+    diff = difflib.unified_diff(
+        old_lines, new_lines, fromfile=f"a/{label}", tofile=f"b/{label}"
+    )
+    diff_str = "".join(diff)
+
+    if diff_str:
+        print(diff_str)
+
+    return bool(diff_str)
+
+
+def sync_operator(op_name, generated_dir, dry_run=False, verbose=False):
+    """Copy generated files for one operator into InfiniCore."""
+    gen_header = generated_dir / "include" / f"{op_name}.h"
+    gen_source = generated_dir / "src" / op_name / "operator.cc"
+    dst_header = INFINICORE_INCLUDE_OPS / f"{op_name}.h"
+    dst_source = INFINICORE_SRC_OPS / op_name / "operator.cc"
+
+    if not gen_header.exists():
+        print(f"  Warning: generated header not found: {gen_header}", file=sys.stderr)
+
+        return False
+
+    if not gen_source.exists():
+        print(f"  Warning: generated source not found: {gen_source}", file=sys.stderr)
+
+        return False
+
+    new_header = gen_header.read_text()
+    new_source = gen_source.read_text()
+    header_changed = False
+    source_changed = False
+
+    if verbose or dry_run:
+        header_label = f"include/infiniop/ops/{op_name}.h"
+        source_label = f"src/infiniop/ops/{op_name}/operator.cc"
+        header_changed = show_diff(dst_header, new_header, header_label)
+        source_changed = show_diff(dst_source, new_source, source_label)
+
+    if dry_run:
+        if not header_changed and not source_changed:
+            print(f"  {op_name}: no changes")
+
+        return header_changed or source_changed
+
+    # Ensure destination directories exist.
+    dst_header.parent.mkdir(parents=True, exist_ok=True)
+    dst_source.parent.mkdir(parents=True, exist_ok=True)
+
+    shutil.copy2(gen_header, dst_header)
+    shutil.copy2(gen_source, dst_source)
+    print(f"  {op_name}: synced")
+
+    return True
+
+
+def verify_compilation(op_name, infiniops_root):
+    """Syntax-check the replaced `operator.cc` compiles with the right include paths."""
+    source = INFINICORE_SRC_OPS / op_name / "operator.cc"
+    cmd = [
+        "g++", "-std=c++17", "-fsyntax-only",
+        f"-I{INFINICORE_ROOT / 'include'}",
+        f"-I{infiniops_root / 'src'}",
+        str(source),
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        print(f"  {op_name}: COMPILE FAILED", file=sys.stderr)
+
+        if result.stderr:
+            # Show the first few lines of the error.
+            for line in result.stderr.splitlines()[:10]:
+                print(f"    {line}", file=sys.stderr)
+
+        return False
+
+    print(f"  {op_name}: compile OK")
+
+    return True
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Sync InfiniOps operator wrappers into InfiniCore.",
+    )
+    parser.add_argument(
+        "infiniops_path",
+        type=pathlib.Path,
+        help="Path to the InfiniOps project root.",
+    )
+    parser.add_argument(
+        "--devices",
+        nargs="+",
+        default=["cpu"],
+        help="Devices to generate for (default: cpu).",
+    )
+    parser.add_argument(
+        "--ops",
+        nargs="+",
+        default=None,
+        help="Only sync specific operators (default: all generated).",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show diffs without modifying files.",
+    )
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Syntax-check each replaced file after syncing.",
+    )
+    parser.add_argument(
+        "--verbose", "-v",
+        action="store_true",
+        help="Show diffs even when not in dry-run mode.",
+    )
+    args = parser.parse_args()
+
+    infiniops_root = args.infiniops_path.resolve()
+
+    if not (infiniops_root / "scripts" / "generate_wrappers.py").exists():
+        print(
+            f"Error: {infiniops_root} does not look like an InfiniOps project.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # Step 1: Run InfiniOps generator.
+    print("=== Generating wrappers ===")
+    generated_dir = run_generator(infiniops_root, args.devices)
+
+    # Step 2: Discover what was generated.
+    all_ops = discover_generated_ops(generated_dir)
+    ops_to_sync = args.ops if args.ops else all_ops
+
+    skipped = [op for op in ops_to_sync if op not in all_ops]
+
+    if skipped:
+        print(
+            f"Warning: requested ops not found in generated output: {skipped}",
+            file=sys.stderr,
+        )
+        ops_to_sync = [op for op in ops_to_sync if op in all_ops]
+
+    if not ops_to_sync:
+        print("Nothing to sync.")
+
+        return
+
+    # Step 3: Sync files.
+    action = "Previewing" if args.dry_run else "Syncing"
+    print(f"\n=== {action} {len(ops_to_sync)} operator(s): {', '.join(ops_to_sync)} ===")
+
+    synced = []
+
+    for op_name in ops_to_sync:
+        changed = sync_operator(
+            op_name, generated_dir, dry_run=args.dry_run, verbose=args.verbose,
+        )
+
+        if changed and not args.dry_run:
+            synced.append(op_name)
+
+    if args.dry_run:
+        return
+
+    # Step 4: Verify compilation.
+    if args.verify and synced:
+        print("\n=== Verifying compilation ===")
+        failures = []
+
+        for op_name in synced:
+            if not verify_compilation(op_name, infiniops_root):
+                failures.append(op_name)
+
+        if failures:
+            print(
+                f"\nCompilation failed for: {', '.join(failures)}", file=sys.stderr,
+            )
+            sys.exit(1)
+
+    print(f"\nDone. Synced {len(synced)} operator(s).")
+
+
+if __name__ == "__main__":
+    main()

From b2ed43b194ef6f2aa00b1dd505a9e9712ab4cf59 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Thu, 9 Apr 2026 08:44:01 +0000
Subject: [PATCH 02/10] refactor: output synced operator files as `.cu` in
 `sync_infiniops.py`

InfiniOps NVIDIA device implementations include `.cuh` headers with CUDA
syntax, so the generated operator files must be compiled with `nvcc`.
Change the sync script to output `.cu` files and remove old `.cc` files
to prevent duplicate definitions.
---
 scripts/sync_infiniops.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/scripts/sync_infiniops.py b/scripts/sync_infiniops.py
index f54dd8443..0c77f8190 100644
--- a/scripts/sync_infiniops.py
+++ b/scripts/sync_infiniops.py
@@ -78,7 +78,11 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False):
     gen_header = generated_dir / "include" / f"{op_name}.h"
     gen_source = generated_dir / "src" / op_name / "operator.cc"
     dst_header = INFINICORE_INCLUDE_OPS / f"{op_name}.h"
-    dst_source = INFINICORE_SRC_OPS / op_name / "operator.cc"
+    # Use .cu extension so NVIDIA builds compile with nvcc (InfiniOps NVIDIA
+    # headers include .cuh files that require CUDA compilation).
+    dst_source = INFINICORE_SRC_OPS / op_name / "operator.cu"
+    # Remove old .cc to prevent duplicate definitions.
+    old_cc = INFINICORE_SRC_OPS / op_name / "operator.cc"
 
     if not gen_header.exists():
         print(f"  Warning: generated header not found: {gen_header}", file=sys.stderr)
@@ -97,7 +101,7 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False):
 
     if verbose or dry_run:
         header_label = f"include/infiniop/ops/{op_name}.h"
-        source_label = f"src/infiniop/ops/{op_name}/operator.cc"
+        source_label = f"src/infiniop/ops/{op_name}/operator.cu"
         header_changed = show_diff(dst_header, new_header, header_label)
         source_changed = show_diff(dst_source, new_source, source_label)
 
@@ -113,14 +117,18 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False):
 
     shutil.copy2(gen_header, dst_header)
     shutil.copy2(gen_source, dst_source)
+
+    if old_cc.exists():
+        old_cc.unlink()
+
     print(f"  {op_name}: synced")
 
     return True
 
 
 def verify_compilation(op_name, infiniops_root):
-    """Syntax-check the replaced `operator.cc` compiles with the right include paths."""
-    source = INFINICORE_SRC_OPS / op_name / "operator.cc"
+    """Syntax-check the replaced `operator.cu` compiles with the right include paths."""
+    source = INFINICORE_SRC_OPS / op_name / "operator.cu"
     cmd = [
         "g++", "-std=c++17", "-fsyntax-only",
         f"-I{INFINICORE_ROOT / 'include'}",

From 4d56caa392f99441d381b2fa8499065c35198435 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Thu, 9 Apr 2026 08:44:55 +0000
Subject: [PATCH 03/10] build: add InfiniOps integration to xmake build system

- Add `infiniops` option for specifying the InfiniOps project root
- Add InfiniOps include path and source files to the `infiniop` target
- Compile `.cu` operator files with `nvcc` on NVIDIA builds, or as
  plain C++ on non-NVIDIA builds (CUDA includes are `#ifdef`-guarded)
- Downgrade `-Wunused-but-set-variable` from an error to a warning for
  the NVIDIA target
---
 xmake.lua        | 16 ++++++++++++++++
 xmake/nvidia.lua |  8 +++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/xmake.lua b/xmake.lua
index 8f32bf7cc..fb11ada86 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -269,6 +269,13 @@ if has_config("ccl") then
     add_defines("ENABLE_CCL")
 end
 
+-- InfiniOps
+option("infiniops")
+    set_default("")
+    set_showmenu(true)
+    set_description("Path to InfiniOps project root. If set, operator dispatch files are generated from InfiniOps.")
+option_end()
+
 target("infini-utils")
     set_kind("static")
     on_install(function (target) end)
@@ -383,8 +390,17 @@ target("infiniop")
         add_deps("infiniop-hygon")
     end
     set_languages("cxx17")
+    if get_config("infiniops") and get_config("infiniops") ~= "" then
+        add_includedirs(get_config("infiniops") .. "/src")
+        add_files(get_config("infiniops") .. "/src/*.cc")
+    end
     add_files("src/infiniop/devices/handle.cc")
     add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc")
+    if not has_config("nv-gpu") then
+        -- On non-NVIDIA builds, compile InfiniOps-synced .cu files as plain C++
+        -- (CUDA includes are guarded by #ifdef ENABLE_NVIDIA_API).
+        add_files("src/infiniop/ops/*/operator.cu", {force = {languages = "cxx17"}})
+    end
     add_files("src/infiniop/*.cc")
 
     set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua
index 602fb190d..c3ef33388 100644
--- a/xmake/nvidia.lua
+++ b/xmake/nvidia.lua
@@ -61,7 +61,7 @@ target("infiniop-nvidia")
         end
     end
 
-    add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations", "-Xcompiler=-Wno-error=unused-function")
+    add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations", "-Xcompiler=-Wno-error=unused-function", "-Xcompiler=-Wno-error=unused-but-set-variable")
 
     local arch_opt = get_config("cuda_arch")
     if arch_opt and type(arch_opt) == "string" then
@@ -75,7 +75,13 @@ target("infiniop-nvidia")
     end
 
     set_languages("cxx17")
+    if get_config("infiniops") and get_config("infiniops") ~= "" then
+        add_includedirs(get_config("infiniops") .. "/src")
+    end
     add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../src/infiniop/ops/*/*/nvidia/*.cu")
+    -- InfiniOps-synced operator files use .cu extension because their NVIDIA
+    -- includes contain CUDA syntax (.cuh headers).
+    add_files("../src/infiniop/ops/*/operator.cu")
 
     if has_config("ninetoothed") then
         add_files("../build/ninetoothed/*.c", "../build/ninetoothed/*.cpp")

From f4cad2109d78153cb68a82acc3347ee262cd0393 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Fri, 10 Apr 2026 02:25:04 +0000
Subject: [PATCH 04/10] fix: resolve xmake build integration for InfiniOps
 operator sync

Move the InfiniOps sync from `before_build` to `on_load` in the
`infiniop-nvidia` target so that generated `.cu` files exist before
xmake resolves file lists. The sync now stubs the original `operator.cc`
with a comment instead of deleting it, preventing duplicate symbols
while keeping the glob pattern `src/infiniop/ops/*/operator.cc` valid
for non-synced operators.
---
 scripts/sync_infiniops.py | 14 +++++++++-----
 xmake.lua                 | 11 ++++++++++-
 xmake/nvidia.lua          | 23 +++++++++++++++++------
 3 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/scripts/sync_infiniops.py b/scripts/sync_infiniops.py
index 0c77f8190..1e29c2622 100644
--- a/scripts/sync_infiniops.py
+++ b/scripts/sync_infiniops.py
@@ -78,11 +78,9 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False):
     gen_header = generated_dir / "include" / f"{op_name}.h"
     gen_source = generated_dir / "src" / op_name / "operator.cc"
     dst_header = INFINICORE_INCLUDE_OPS / f"{op_name}.h"
-    # Use .cu extension so NVIDIA builds compile with nvcc (InfiniOps NVIDIA
-    # headers include .cuh files that require CUDA compilation).
+    # Use .cu extension so nvcc compiles these files (InfiniOps NVIDIA
+    # headers include .cuh files with CUDA syntax).
     dst_source = INFINICORE_SRC_OPS / op_name / "operator.cu"
-    # Remove old .cc to prevent duplicate definitions.
-    old_cc = INFINICORE_SRC_OPS / op_name / "operator.cc"
 
     if not gen_header.exists():
         print(f"  Warning: generated header not found: {gen_header}", file=sys.stderr)
@@ -118,8 +116,14 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False):
     shutil.copy2(gen_header, dst_header)
     shutil.copy2(gen_source, dst_source)
 
+    # Replace the original operator.cc with an empty stub to prevent
+    # duplicate symbol definitions (the .cu file provides all symbols).
+    old_cc = INFINICORE_SRC_OPS / op_name / "operator.cc"
+
     if old_cc.exists():
-        old_cc.unlink()
+        old_cc.write_text(
+            "// This operator is provided by InfiniOps (see operator.cu).\n"
+        )
 
     print(f"  {op_name}: synced")
 
diff --git a/xmake.lua b/xmake.lua
index fb11ada86..77087dd69 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -398,11 +398,20 @@ target("infiniop")
     add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc")
     if not has_config("nv-gpu") then
         -- On non-NVIDIA builds, compile InfiniOps-synced .cu files as plain C++
-        -- (CUDA includes are guarded by #ifdef ENABLE_NVIDIA_API).
+        -- (NVIDIA includes are guarded by #ifdef ENABLE_NVIDIA_API).
         add_files("src/infiniop/ops/*/operator.cu", {force = {languages = "cxx17"}})
     end
     add_files("src/infiniop/*.cc")
 
+    before_build(function (target)
+        import("core.project.config")
+        local infiniops_path = config.get("infiniops")
+        if infiniops_path and infiniops_path ~= "" and not has_config("nv-gpu") then
+            -- CPU-only build: run sync here (NVIDIA builds sync via infiniop-nvidia).
+            os.execv("python", {"scripts/sync_infiniops.py", infiniops_path, "--devices", "cpu"})
+        end
+    end)
+
     set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
     add_installfiles("include/infiniop/(**/*.h)", {prefixdir = "include/infiniop"})
     add_installfiles("include/infiniop/*.h", {prefixdir = "include/infiniop"})
diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua
index c3ef33388..b3f3f93dd 100644
--- a/xmake/nvidia.lua
+++ b/xmake/nvidia.lua
@@ -38,6 +38,23 @@ target("infiniop-nvidia")
             target:add("linkdirs", path.directory(path.directory(nvcc_path)) .. "/lib64/stubs")
             target:add("links", "cuda")
         end
+
+        -- InfiniOps integration: run sync to generate operator .cu files,
+        -- then add them for compilation.
+        import("core.project.config")
+        local infiniops_path = config.get("infiniops")
+        if infiniops_path and infiniops_path ~= "" then
+            target:add("includedirs", infiniops_path .. "/src")
+            local cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/*/operator.cu"))
+            if #cu_files == 0 then
+                local devices = {"cpu", "nvidia"}
+                os.execv("python", {"scripts/sync_infiniops.py", infiniops_path, "--devices", table.unpack(devices)})
+                cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/*/operator.cu"))
+            end
+            for _, f in ipairs(cu_files) do
+                target:add("files", f)
+            end
+        end
     end)
 
     if is_plat("windows") then
@@ -75,13 +92,7 @@ target("infiniop-nvidia")
     end
 
     set_languages("cxx17")
-    if get_config("infiniops") and get_config("infiniops") ~= "" then
-        add_includedirs(get_config("infiniops") .. "/src")
-    end
     add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../src/infiniop/ops/*/*/nvidia/*.cu")
-    -- InfiniOps-synced operator files use .cu extension because their NVIDIA
-    -- includes contain CUDA syntax (.cuh headers).
-    add_files("../src/infiniop/ops/*/operator.cu")
 
     if has_config("ninetoothed") then
         add_files("../build/ninetoothed/*.c", "../build/ninetoothed/*.cpp")

From ded42da40c6752eff893ddc9f0f06ba96d604dbb Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Fri, 10 Apr 2026 09:03:30 +0000
Subject: [PATCH 05/10] fix: add InfiniOps generated include path and shared
 library linking

---
 xmake.lua        | 5 ++++-
 xmake/nvidia.lua | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/xmake.lua b/xmake.lua
index 77087dd69..58819f5e4 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -392,7 +392,10 @@ target("infiniop")
     set_languages("cxx17")
     if get_config("infiniops") and get_config("infiniops") ~= "" then
         add_includedirs(get_config("infiniops") .. "/src")
-        add_files(get_config("infiniops") .. "/src/*.cc")
+        add_includedirs(get_config("infiniops") .. "/generated/include")
+        add_linkdirs(get_config("infiniops") .. "/build/src")
+        add_links("infiniops")
+        add_rpathdirs(get_config("infiniops") .. "/build/src")
     end
     add_files("src/infiniop/devices/handle.cc")
     add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc")
diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua
index b3f3f93dd..38ec53c4f 100644
--- a/xmake/nvidia.lua
+++ b/xmake/nvidia.lua
@@ -45,6 +45,7 @@ target("infiniop-nvidia")
         local infiniops_path = config.get("infiniops")
         if infiniops_path and infiniops_path ~= "" then
             target:add("includedirs", infiniops_path .. "/src")
+            target:add("includedirs", infiniops_path .. "/generated/include")
             local cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/*/operator.cu"))
             if #cu_files == 0 then
                 local devices = {"cpu", "nvidia"}

From 75efed4acb155be8672b5a379ba03e25ad13700c Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Fri, 10 Apr 2026 09:28:44 +0000
Subject: [PATCH 06/10] style: use Markdown backtick-fencing for identifiers in
 comments

---
 scripts/sync_infiniops.py | 10 +++++-----
 xmake.lua                 |  6 +++---
 xmake/nvidia.lua          |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/sync_infiniops.py b/scripts/sync_infiniops.py
index 1e29c2622..af228e117 100644
--- a/scripts/sync_infiniops.py
+++ b/scripts/sync_infiniops.py
@@ -78,8 +78,8 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False):
     gen_header = generated_dir / "include" / f"{op_name}.h"
     gen_source = generated_dir / "src" / op_name / "operator.cc"
     dst_header = INFINICORE_INCLUDE_OPS / f"{op_name}.h"
-    # Use .cu extension so nvcc compiles these files (InfiniOps NVIDIA
-    # headers include .cuh files with CUDA syntax).
+    # Use `.cu` extension so `nvcc` compiles these files (InfiniOps NVIDIA
+    # headers include `.cuh` files with CUDA syntax).
     dst_source = INFINICORE_SRC_OPS / op_name / "operator.cu"
 
     if not gen_header.exists():
@@ -116,13 +116,13 @@ def sync_operator(op_name, generated_dir, dry_run=False, verbose=False):
     shutil.copy2(gen_header, dst_header)
     shutil.copy2(gen_source, dst_source)
 
-    # Replace the original operator.cc with an empty stub to prevent
-    # duplicate symbol definitions (the .cu file provides all symbols).
+    # Replace the original `operator.cc` with an empty stub to prevent
+    # duplicate symbol definitions (the `.cu` file provides all symbols).
     old_cc = INFINICORE_SRC_OPS / op_name / "operator.cc"
 
     if old_cc.exists():
         old_cc.write_text(
-            "// This operator is provided by InfiniOps (see operator.cu).\n"
+            "// This operator is provided by InfiniOps (see `operator.cu`).\n"
         )
 
     print(f"  {op_name}: synced")
diff --git a/xmake.lua b/xmake.lua
index 58819f5e4..0f7f84307 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -400,8 +400,8 @@ target("infiniop")
     add_files("src/infiniop/devices/handle.cc")
     add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc")
     if not has_config("nv-gpu") then
-        -- On non-NVIDIA builds, compile InfiniOps-synced .cu files as plain C++
-        -- (NVIDIA includes are guarded by #ifdef ENABLE_NVIDIA_API).
+        -- On non-NVIDIA builds, compile InfiniOps-synced `.cu` files as plain C++
+        -- (NVIDIA includes are guarded by `#ifdef ENABLE_NVIDIA_API`).
         add_files("src/infiniop/ops/*/operator.cu", {force = {languages = "cxx17"}})
     end
     add_files("src/infiniop/*.cc")
@@ -410,7 +410,7 @@ target("infiniop")
         import("core.project.config")
         local infiniops_path = config.get("infiniops")
         if infiniops_path and infiniops_path ~= "" and not has_config("nv-gpu") then
-            -- CPU-only build: run sync here (NVIDIA builds sync via infiniop-nvidia).
+            -- CPU-only build: run sync here (NVIDIA builds sync via `infiniop-nvidia`).
             os.execv("python", {"scripts/sync_infiniops.py", infiniops_path, "--devices", "cpu"})
         end
     end)
diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua
index 38ec53c4f..3f7d62d04 100644
--- a/xmake/nvidia.lua
+++ b/xmake/nvidia.lua
@@ -39,7 +39,7 @@ target("infiniop-nvidia")
             target:add("links", "cuda")
         end
 
-        -- InfiniOps integration: run sync to generate operator .cu files,
+        -- InfiniOps integration: run sync to generate operator `.cu` files,
         -- then add them for compilation.
         import("core.project.config")
         local infiniops_path = config.get("infiniops")

From 5e9e0a2a535adf359f1bc06da0c094ce5376c0d5 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Fri, 10 Apr 2026 09:32:10 +0000
Subject: [PATCH 07/10] style: apply `ruff format` to `sync_infiniops.py`

---
 scripts/sync_infiniops.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/scripts/sync_infiniops.py b/scripts/sync_infiniops.py
index af228e117..23c632681 100644
--- a/scripts/sync_infiniops.py
+++ b/scripts/sync_infiniops.py
@@ -41,7 +41,10 @@ def run_generator(infiniops_root, devices):
     generated_dir = infiniops_root / "generated"
 
     if not generated_dir.exists():
-        print(f"Error: expected output directory {generated_dir} not found", file=sys.stderr)
+        print(
+            f"Error: expected output directory {generated_dir} not found",
+            file=sys.stderr,
+        )
         sys.exit(1)
 
     return generated_dir
@@ -134,7 +137,13 @@ def verify_compilation(op_name, infiniops_root):
     """Syntax-check the replaced `operator.cu` compiles with the right include paths."""
     source = INFINICORE_SRC_OPS / op_name / "operator.cu"
     cmd = [
-        "g++", "-std=c++17", "-fsyntax-only",
+        "g++",
+        "-std=c++17",
+        "-fsyntax-only",
+        # `g++` does not recognize the `.cu` suffix; force C++ so the file
+        # is parsed instead of being skipped as a linker input (exit 0).
+        "-x",
+        "c++",
         f"-I{INFINICORE_ROOT / 'include'}",
         f"-I{infiniops_root / 'src'}",
         str(source),
@@ -188,7 +193,8 @@ def main():
         help="Syntax-check each replaced file after syncing.",
     )
     parser.add_argument(
-        "--verbose", "-v",
+        "--verbose",
+        "-v",
         action="store_true",
         help="Show diffs even when not in dry-run mode.",
     )
@@ -227,13 +233,18 @@ def main():
 
     # Step 3: Sync files.
     action = "Previewing" if args.dry_run else "Syncing"
-    print(f"\n=== {action} {len(ops_to_sync)} operator(s): {', '.join(ops_to_sync)} ===")
+    print(
+        f"\n=== {action} {len(ops_to_sync)} operator(s): {', '.join(ops_to_sync)} ==="
+    )
 
     synced = []
 
     for op_name in ops_to_sync:
         changed = sync_operator(
-            op_name, generated_dir, dry_run=args.dry_run, verbose=args.verbose,
+            op_name,
+            generated_dir,
+            dry_run=args.dry_run,
+            verbose=args.verbose,
         )
 
         if changed and not args.dry_run:
@@ -253,7 +264,8 @@ def main():
 
         if failures:
             print(
-                f"\nCompilation failed for: {', '.join(failures)}", file=sys.stderr,
+                f"\nCompilation failed for: {', '.join(failures)}",
+                file=sys.stderr,
             )
             sys.exit(1)
 

From c3e0c690cee16c56382c4a13fee2b098b1b12661 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Wed, 15 Apr 2026 08:07:58 +0000
Subject: [PATCH 08/10] fix: use `add_shflags` to force-link `libinfiniops.so`
 on shared library targets

The `infiniop` target is `set_kind("shared")`, so xmake ignores
`add_ldflags` during linking. Switch to `add_shflags` with
`--no-as-needed` so the GNU linker keeps `libinfiniops.so` in the
`NEEDED` list even when no direct symbol references exist in
`infiniop`'s own object files.
---
 xmake.lua | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmake.lua b/xmake.lua
index 0f7f84307..4b9a6e3eb 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -394,7 +394,7 @@ target("infiniop")
         add_includedirs(get_config("infiniops") .. "/src")
         add_includedirs(get_config("infiniops") .. "/generated/include")
         add_linkdirs(get_config("infiniops") .. "/build/src")
-        add_links("infiniops")
+        add_shflags("-Wl,--no-as-needed,-linfiniops,--as-needed", {force = true})
         add_rpathdirs(get_config("infiniops") .. "/build/src")
     end
     add_files("src/infiniop/devices/handle.cc")

From 638dfcbd320690f1ff12658917b766dca17322d4 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Thu, 16 Apr 2026 07:31:52 +0000
Subject: [PATCH 09/10] fix: make NVIDIA CUDA targets shared to prevent
 segfault

Change `infiniop-nvidia`, `infinirt-nvidia`, `infiniccl-nvidia` from
static to shared libraries so that `nvcc` performs proper CUDA device
linking within each `.so`. When these were static archives, `g++`
linked them into downstream shared libraries without device linking,
corrupting `.nv_fatbin` registration and causing segfaults in
`__cudaRegisterLinkedBinary` during `dlopen`.

Also replace no-op `on_install` with proper `set_installdir` for all
four NVIDIA targets (including `flash-attn-nvidia`).
---
 xmake/nvidia.lua | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua
index 3f7d62d04..06ef1cb72 100644
--- a/xmake/nvidia.lua
+++ b/xmake/nvidia.lua
@@ -14,9 +14,9 @@ local FLASH_ATTN_ROOT = get_config("flash-attn")
 local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
 
 target("infiniop-nvidia")
-    set_kind("static")
+    set_kind("shared")
     add_deps("infini-utils")
-    on_install(function (target) end)
+    set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
 
     set_policy("build.cuda.devlink", true)
     set_toolchains("cuda")
@@ -101,9 +101,9 @@ target("infiniop-nvidia")
 target_end()
 
 target("infinirt-nvidia")
-    set_kind("static")
+    set_kind("shared")
     add_deps("infini-utils")
-    on_install(function (target) end)
+    set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
 
     set_policy("build.cuda.devlink", true)
     set_toolchains("cuda")
@@ -124,9 +124,9 @@ target("infinirt-nvidia")
 target_end()
 
 target("infiniccl-nvidia")
-    set_kind("static")
+    set_kind("shared")
     add_deps("infinirt")
-    on_install(function (target) end)
+    set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
     if has_config("ccl") then
         set_policy("build.cuda.devlink", true)
         set_toolchains("cuda")
@@ -202,6 +202,6 @@ target("flash-attn-nvidia")
         on_build(function (target) end)
     end
 
-    on_install(function (target) end)
+    set_installdir(INFINI_ROOT) -- file-level local defined near the top of this file
 
 target_end()

From 989fd3e5f583877c15242f98df36c4cfd38f369d Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Thu, 4 Jun 2026 02:59:57 +0000
Subject: [PATCH 10/10] feat: route infiniop gemm through InfiniOps

---
 .gitmodules                       |   3 +
 src/infiniop/ops/gemm/operator.cc | 251 +-----------------------------
 src/infiniop/ops/gemm/operator.cu | 192 +++++++++++++++++++++++
 submodules/InfiniOps              |   1 +
 xmake.lua                         |  50 ++++--
 xmake/nvidia.lua                  |  28 +++-
 6 files changed, 264 insertions(+), 261 deletions(-)
 create mode 100644 src/infiniop/ops/gemm/operator.cu
 create mode 160000 submodules/InfiniOps

diff --git a/.gitmodules b/.gitmodules
index bca919479..535074641 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -5,3 +5,6 @@
 	path = third_party/nlohmann_json
 	url = https://github.com/nlohmann/json.git
 	branch = master
+[submodule "submodules/InfiniOps"]
+	path = submodules/InfiniOps
+	url = https://github.com/InfiniTensor/InfiniOps.git
diff --git a/src/infiniop/ops/gemm/operator.cc b/src/infiniop/ops/gemm/operator.cc
index ac4e01e83..1d03dca67 100644
--- a/src/infiniop/ops/gemm/operator.cc
+++ b/src/infiniop/ops/gemm/operator.cc
@@ -1,250 +1 @@
-#include "../../operator.h"
-#include "../../handle.h"
-#include "infiniop/ops/gemm.h"
-
-#ifdef ENABLE_CPU_API
-#include "cpu/gemm_cpu.h"
-#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
-#include "nvidia/gemm_nvidia.cuh"
-#endif
-#ifdef ENABLE_CAMBRICON_API
-#include "bang/gemm_bang.h"
-#endif
-#ifdef ENABLE_ASCEND_API
-#include "ascend/gemm_ascend.h"
-#endif
-#ifdef ENABLE_METAX_API
-#include "metax/gemm_metax.h"
-#endif
-#ifdef ENABLE_MOORE_API
-#include "moore/gemm_moore.h"
-#endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/gemm_kunlun.h"
-#endif
-
-__INFINI_C infiniStatus_t infiniopCreateGemmDescriptor(
-    infiniopHandle_t handle,
-    infiniopGemmDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t c_desc,
-    infiniopTensorDescriptor_t a_desc,
-    infiniopTensorDescriptor_t b_desc) {
-
-#define CREATE(CASE, NAMESPACE)                                             \
-    case CASE:                                                              \
-        return op::gemm::NAMESPACE::Descriptor::create(                     \
-            handle,                                                         \
-            reinterpret_cast<op::gemm::NAMESPACE::Descriptor **>(desc_ptr), \
-            c_desc,                                                         \
-            a_desc,                                                         \
-            b_desc)
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_ALI_API
-        CREATE(INFINI_DEVICE_ALI, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_HYGON_API
-        CREATE(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        CREATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
-        CREATE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, moore);
-#endif
-
-#ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__INFINI_C infiniStatus_t
-infiniopGetGemmWorkspaceSize(
-    infiniopGemmDescriptor_t desc,
-    size_t *size) {
-
-#define GET(CASE, NAMESPACE)                                                                      \
-    case CASE:                                                                                    \
-        *size = reinterpret_cast<const op::gemm::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_ALI_API
-        GET(INFINI_DEVICE_ALI, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_HYGON_API
-        GET(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        GET(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
-        GET(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef GET
-}
-
-__INFINI_C infiniStatus_t infiniopGemm(
-    infiniopGemmDescriptor_t desc,
-    void *workspace, size_t workspace_size,
-    void *c,
-    const void *a,
-    const void *b,
-    float alpha,
-    float beta,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE)                                             \
-    case CASE:                                                                 \
-        return reinterpret_cast<const op::gemm::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size,                             \
-                        c, beta,                                               \
-                        a, b, alpha,                                           \
-                        stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_ALI_API
-        CALCULATE(INFINI_DEVICE_ALI, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_HYGON_API
-        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
-        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__INFINI_C infiniStatus_t
-infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE)                                                 \
-    case CASE:                                                                  \
-        delete reinterpret_cast<const op::gemm::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_ALI_API
-        DELETE(INFINI_DEVICE_ALI, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-#ifdef ENABLE_HYGON_API
-        DELETE(INFINI_DEVICE_HYGON, nvidia);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        DELETE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
-#ifdef ENABLE_ASCEND_API
-        DELETE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, metax);
-#endif
-#ifdef ENABLE_MOORE_API
-        DELETE(INFINI_DEVICE_MOORE, moore);
-#endif
-#ifdef ENABLE_KUNLUN_API
-        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+// This operator is provided by InfiniOps (see `operator.cu`).
diff --git a/src/infiniop/ops/gemm/operator.cu b/src/infiniop/ops/gemm/operator.cu
new file mode 100644
index 000000000..774da314f
--- /dev/null
+++ b/src/infiniop/ops/gemm/operator.cu
@@ -0,0 +1,242 @@
+#include "../../handle.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include "infiniop/ops/gemm.h"
+
+#include <base/gemm.h>
+#include <config.h>
+#include <handle.h>
+#include <operator.h>
+#include <tensor.h>
+#include <torch/ops/gemm/gemm.h>
+
+#include <cstddef>
+#include <optional>
+#include <utility>
+#include <vector>
+
+namespace {
+
+// Implementation index handed to `infini::ops::Config` for every GEMM call.
+// NOTE(review): presumably selects the torch-backed kernel; confirm in InfiniOps.
+constexpr int kGemmImplementationIndex = 2;
+
+// Maps an InfiniCore dtype to the InfiniOps dtype.
+// Returns `std::nullopt` when there is no mapping, so callers can reject the
+// tensor instead of silently treating its buffer as float32.
+std::optional<infini::ops::DataType> dataTypeFromInfiniDtype(infiniDtype_t dtype) {
+    switch (dtype) {
+    case INFINI_DTYPE_I8:
+        return infini::ops::DataType::kInt8;
+    case INFINI_DTYPE_I16:
+        return infini::ops::DataType::kInt16;
+    case INFINI_DTYPE_I32:
+        return infini::ops::DataType::kInt32;
+    case INFINI_DTYPE_I64:
+        return infini::ops::DataType::kInt64;
+    case INFINI_DTYPE_U8:
+        return infini::ops::DataType::kUInt8;
+    case INFINI_DTYPE_U16:
+        return infini::ops::DataType::kUInt16;
+    case INFINI_DTYPE_U32:
+        return infini::ops::DataType::kUInt32;
+    case INFINI_DTYPE_U64:
+        return infini::ops::DataType::kUInt64;
+    case INFINI_DTYPE_F16:
+        return infini::ops::DataType::kFloat16;
+    case INFINI_DTYPE_BF16:
+        return infini::ops::DataType::kBFloat16;
+    case INFINI_DTYPE_F32:
+        return infini::ops::DataType::kFloat32;
+    case INFINI_DTYPE_F64:
+        return infini::ops::DataType::kFloat64;
+    default:
+        return std::nullopt;
+    }
+}
+
+// Maps an InfiniCore device to the InfiniOps device type.
+// Returns `std::nullopt` for devices this wrapper does not route to InfiniOps.
+// NOTE(review): `INFINI_DEVICE_ALI` was dispatched by the previous hand-written
+// `operator.cc` but has no mapping here, so it is rejected; confirm this is intended.
+std::optional<infini::ops::Device::Type> deviceTypeFromInfiniDevice(infiniDevice_t device) {
+    switch (device) {
+    case INFINI_DEVICE_CPU:
+        return infini::ops::Device::Type::kCpu;
+    case INFINI_DEVICE_NVIDIA:
+        return infini::ops::Device::Type::kNvidia;
+    case INFINI_DEVICE_CAMBRICON:
+        return infini::ops::Device::Type::kCambricon;
+    case INFINI_DEVICE_ASCEND:
+        return infini::ops::Device::Type::kAscend;
+    case INFINI_DEVICE_METAX:
+        return infini::ops::Device::Type::kMetax;
+    case INFINI_DEVICE_MOORE:
+        return infini::ops::Device::Type::kMoore;
+    case INFINI_DEVICE_ILUVATAR:
+        return infini::ops::Device::Type::kIluvatar;
+    case INFINI_DEVICE_KUNLUN:
+        return infini::ops::Device::Type::kKunlun;
+    case INFINI_DEVICE_HYGON:
+        return infini::ops::Device::Type::kHygon;
+    case INFINI_DEVICE_QY:
+        return infini::ops::Device::Type::kQy;
+    default:
+        return std::nullopt;
+    }
+}
+
+// Whether GEMM on `device` is routed through InfiniOps.
+bool isExplicitTorchGemmDevice(infiniDevice_t device) {
+    return deviceTypeFromInfiniDevice(device).has_value();
+}
+
+// Shape, strides and dtype captured from a tensor descriptor at creation time.
+struct TensorMeta {
+    std::vector<size_t> shape;
+    std::vector<ptrdiff_t> strides;
+    infini::ops::DataType dtype;
+};
+
+// Snapshots `desc`; returns `std::nullopt` if its dtype has no InfiniOps mapping.
+std::optional<TensorMeta> makeTensorMeta(infiniopTensorDescriptor_t desc) {
+    const auto dtype = dataTypeFromInfiniDtype(desc->dtype());
+    if (!dtype) {
+        return std::nullopt;
+    }
+    return TensorMeta{desc->shape(), desc->strides(), *dtype};
+}
+
+// Descriptor behind `infiniopGemmDescriptor_t`: keeps the target device and the
+// metadata of C, A and B so that `infiniopGemm` only needs the data pointers.
+class InfiniOpsGemmDescriptor final : public InfiniopDescriptor {
+public:
+    InfiniOpsGemmDescriptor(infiniopHandle_t handle,
+                            infini::ops::Device::Type device_type,
+                            TensorMeta c_meta,
+                            TensorMeta a_meta,
+                            TensorMeta b_meta)
+        : InfiniopDescriptor{handle->device, handle->device_id},
+          device(device_type, handle->device_id),
+          c(std::move(c_meta)),
+          a(std::move(a_meta)),
+          b(std::move(b_meta)) {}
+
+    // Wraps `data` in an InfiniOps tensor built from `meta` and the stored device.
+    infini::ops::Tensor tensor(const TensorMeta &meta, void *data) const {
+        return infini::ops::Tensor(data, meta.shape, meta.dtype, device, meta.strides);
+    }
+
+    // Read-only inputs are forwarded to the `void *` overload above.
+    infini::ops::Tensor tensor(const TensorMeta &meta, const void *data) const {
+        return tensor(meta, const_cast<void *>(data));
+    }
+
+    infini::ops::Device device;
+    TensorMeta c;
+    TensorMeta a;
+    TensorMeta b;
+};
+
+} // namespace
+
+// Validates the arguments, device and tensor dtypes, then allocates the descriptor.
+// Exceptions are converted to a status code rather than propagated to the caller.
+__INFINI_C infiniStatus_t infiniopCreateGemmDescriptor(infiniopHandle_t handle,
+                                                       infiniopGemmDescriptor_t *desc_ptr,
+                                                       infiniopTensorDescriptor_t c,
+                                                       const infiniopTensorDescriptor_t a,
+                                                       const infiniopTensorDescriptor_t b) {
+    if (handle == nullptr || desc_ptr == nullptr || c == nullptr || a == nullptr || b == nullptr) {
+        return INFINI_STATUS_NULL_POINTER;
+    }
+
+    const auto device_type = deviceTypeFromInfiniDevice(handle->device);
+    if (!device_type) {
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+    try {
+        auto c_meta = makeTensorMeta(c);
+        auto a_meta = makeTensorMeta(a);
+        auto b_meta = makeTensorMeta(b);
+        if (!c_meta || !a_meta || !b_meta) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        *desc_ptr = new InfiniOpsGemmDescriptor(
+            handle,
+            *device_type,
+            std::move(*c_meta),
+            std::move(*a_meta),
+            std::move(*b_meta));
+    } catch (...) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Reports the workspace size for this wrapper, which is always zero bytes.
+__INFINI_C infiniStatus_t infiniopGetGemmWorkspaceSize(infiniopGemmDescriptor_t desc, size_t *size) {
+    (void)desc;
+    if (size == nullptr) {
+        return INFINI_STATUS_NULL_POINTER;
+    }
+    *size = 0;
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Runs GEMM through InfiniOps on the tensors described at creation time.
+// `workspace` and `stream` are forwarded to InfiniOps via its handle; exceptions
+// thrown while dispatching are converted to `INFINI_STATUS_INTERNAL_ERROR`.
+__INFINI_C infiniStatus_t infiniopGemm(infiniopGemmDescriptor_t desc,
+                                       void *workspace,
+                                       size_t workspace_size,
+                                       void *c,
+                                       const void *a,
+                                       const void *b,
+                                       float alpha,
+                                       float beta,
+                                       void *stream) {
+    if (desc == nullptr) {
+        return INFINI_STATUS_NULL_POINTER;
+    }
+
+    // `desc` was allocated as an `InfiniOpsGemmDescriptor` by
+    // `infiniopCreateGemmDescriptor`, so downcast from the base type.
+    auto *gemm_desc = static_cast<InfiniOpsGemmDescriptor *>(desc);
+    if (!isExplicitTorchGemmDevice(gemm_desc->device_type)) {
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+    try {
+        infini::ops::Handle handle;
+        handle.set_stream(stream);
+        handle.set_workspace(workspace);
+        handle.set_workspace_size_in_bytes(workspace_size);
+
+        infini::ops::Config config;
+        config.set_implementation_index(kGemmImplementationIndex);
+
+        infini::ops::Operator<infini::ops::Gemm>::Call(
+            handle,
+            config,
+            gemm_desc->tensor(gemm_desc->a, a),
+            gemm_desc->tensor(gemm_desc->b, b),
+            std::optional<float>(alpha),
+            std::optional<float>(beta),
+            std::optional<int>{},
+            std::optional<int>{},
+            gemm_desc->tensor(gemm_desc->c, c));
+    } catch (...) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Releases a descriptor created by `infiniopCreateGemmDescriptor`; null is a no-op.
+__INFINI_C infiniStatus_t infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) {
+    delete static_cast<InfiniOpsGemmDescriptor *>(desc);
+    return INFINI_STATUS_SUCCESS;
+}
diff --git a/submodules/InfiniOps b/submodules/InfiniOps
new file mode 160000
index 000000000..9444f9c3d
--- /dev/null
+++ b/submodules/InfiniOps
@@ -0,0 +1 @@
+Subproject commit 9444f9c3d2b98084fc150252531239137ad2519b
diff --git a/xmake.lua b/xmake.lua
index 4b9a6e3eb..ff18e7674 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -1,6 +1,5 @@
 add_rules("mode.debug", "mode.release")
 add_requires("boost", {configs = {stacktrace = true}})
-add_requires("pybind11")
 
 -- Define color codes
 local GREEN = '\27[0;32m'
@@ -54,6 +53,18 @@ option_end()
 
 if has_config("nv-gpu") then
     add_defines("ENABLE_NVIDIA_API")
+    -- Expose the CUDA headers: take the toolkit root from the environment, else probe well-known prefixes.
+    local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH")
+    for _, prefix in ipairs({"/usr/local/cuda", "/usr/local/cuda-13.0"}) do
+        if CUDA_ROOT == nil and os.isdir(prefix) then
+            CUDA_ROOT = prefix
+        end
+    end
+    if CUDA_ROOT ~= nil then
+        for _, subdir in ipairs({"/include", "/targets/x86_64-linux/include"}) do
+            add_includedirs(CUDA_ROOT .. subdir)
+        end
+    end
     includes("xmake/nvidia.lua")
 end
 
@@ -273,9 +284,21 @@ end
 option("infiniops")
     set_default("")
     set_showmenu(true)
-    set_description("Path to InfiniOps project root. If set, operator dispatch files are generated from InfiniOps.")
+    set_description("Path to InfiniOps project root. Defaults to submodules/InfiniOps when present.")
 option_end()
 
+-- Locate the InfiniOps root: a non-empty `infiniops` option wins (made absolute),
+-- then the bundled `submodules/InfiniOps` directory; nil when neither is available.
+function get_infiniops_path()
+    local explicit_root = get_config("infiniops")
+    local has_explicit_root = explicit_root and explicit_root ~= ""
+    if has_explicit_root then
+        return path.absolute(explicit_root)
+    end
+    local submodule_root = path.join(os.projectdir(), "submodules", "InfiniOps")
+    return os.isdir(submodule_root) and submodule_root or nil
+end
+
 target("infini-utils")
     set_kind("static")
     on_install(function (target) end)
@@ -390,26 +413,35 @@ target("infiniop")
         add_deps("infiniop-hygon")
     end
     set_languages("cxx17")
-    if get_config("infiniops") and get_config("infiniops") ~= "" then
-        add_includedirs(get_config("infiniops") .. "/src")
-        add_includedirs(get_config("infiniops") .. "/generated/include")
-        add_linkdirs(get_config("infiniops") .. "/build/src")
+    local infiniops_path = get_infiniops_path()
+    if infiniops_path then
+        add_includedirs(infiniops_path .. "/src")
+        add_includedirs(infiniops_path .. "/generated/include")
+        add_linkdirs(infiniops_path .. "/build/src")
         add_shflags("-Wl,--no-as-needed,-linfiniops,--as-needed", {force = true})
-        add_rpathdirs(get_config("infiniops") .. "/build/src")
+        add_rpathdirs(infiniops_path .. "/build/src")
     end
     add_files("src/infiniop/devices/handle.cc")
     add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc")
     if not has_config("nv-gpu") then
         -- On non-NVIDIA builds, compile InfiniOps-synced `.cu` files as plain C++
         -- (NVIDIA includes are guarded by `#ifdef ENABLE_NVIDIA_API`).
-        add_files("src/infiniop/ops/*/operator.cu", {force = {languages = "cxx17"}})
+        add_files("src/infiniop/ops/gemm/operator.cu", {force = {languages = "cxx17"}})
     end
     add_files("src/infiniop/*.cc")
 
     before_build(function (target)
         import("core.project.config")
         local infiniops_path = config.get("infiniops")
-        if infiniops_path and infiniops_path ~= "" and not has_config("nv-gpu") then
+        if infiniops_path and infiniops_path ~= "" then
+            infiniops_path = path.absolute(infiniops_path)
+        else
+            -- Fall back to the bundled submodule. Reset to nil when it is missing:
+            -- the option defaults to "", and "" is truthy in Lua.
+            local bundled = path.join(os.projectdir(), "submodules", "InfiniOps")
+            infiniops_path = os.isdir(bundled) and bundled or nil
+        end
+        if infiniops_path and not has_config("nv-gpu") then
             -- CPU-only build: run sync here (NVIDIA builds sync via `infiniop-nvidia`).
             os.execv("python", {"scripts/sync_infiniops.py", infiniops_path, "--devices", "cpu"})
         end
diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua
index 06ef1cb72..7381471bf 100644
--- a/xmake/nvidia.lua
+++ b/xmake/nvidia.lua
@@ -1,3 +1,16 @@
+-- Expose the CUDA headers: take the toolkit root from the environment, else probe well-known prefixes.
+local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH")
+for _, prefix in ipairs({"/usr/local/cuda", "/usr/local/cuda-13.0"}) do
+    if CUDA_ROOT == nil and os.isdir(prefix) then
+        CUDA_ROOT = prefix
+    end
+end
+if CUDA_ROOT ~= nil then
+    for _, subdir in ipairs({"/include", "/targets/x86_64-linux/include"}) do
+        add_includedirs(CUDA_ROOT .. subdir)
+    end
+end
+
 local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH")
 if CUDNN_ROOT ~= nil then
     add_includedirs(CUDNN_ROOT .. "/include")
@@ -44,13 +57,24 @@ target("infiniop-nvidia")
         import("core.project.config")
         local infiniops_path = config.get("infiniops")
         if infiniops_path and infiniops_path ~= "" then
+            infiniops_path = path.absolute(infiniops_path)
+        else
+            -- Fall back to the bundled submodule. Reset to nil when it is missing:
+            -- `config.get` may yield "", which is truthy in Lua and would pass the check below.
+            local bundled = path.join(os.projectdir(), "submodules", "InfiniOps")
+            infiniops_path = os.isdir(bundled) and bundled or nil
+        end
+        if infiniops_path then
             target:add("includedirs", infiniops_path .. "/src")
             target:add("includedirs", infiniops_path .. "/generated/include")
-            local cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/*/operator.cu"))
+            target:add("linkdirs", infiniops_path .. "/build/src")
+            target:add("shflags", "-Wl,--no-as-needed,-linfiniops,--as-needed", {force = true})
+            target:add("rpathdirs", infiniops_path .. "/build/src")
+            local cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/gemm/operator.cu"))
             if #cu_files == 0 then
                 local devices = {"cpu", "nvidia"}
                 os.execv("python", {"scripts/sync_infiniops.py", infiniops_path, "--devices", table.unpack(devices)})
-                cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/*/operator.cu"))
+                cu_files = os.files(path.join(os.projectdir(), "src/infiniop/ops/gemm/operator.cu"))
             end
             for _, f in ipairs(cu_files) do
                 target:add("files", f)