QuEST-Kit · TysonRayJones · Oct 21, 2025 · Oct 23, 2025 · Oct 27, 2025 · Oct 28, 2025
diff --git a/.github/scripts/setup_cuda.ps1 b/.github/scripts/setup_cuda.ps1
@@ -108,4 +108,4 @@ Write-Output "CMAKE_CUDA_COMPILER=$dst\bin\nvcc.exe" >> $env:GITHUB_ENV
 Write-Output "NVCC_APPEND_FLAGS=-allow-unsupported-compiler" >> $env:GITHUB_ENV
 
 Write-Output "CUDA_VERSION=$CUDA_VERSION_FULL" >> $env:GITHUB_ENV
-Write-Output "Setup completed."
+Write-Output "Setup completed."
diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml
@@ -50,9 +50,9 @@ jobs:
         run: >
           cmake -B ${{ env.build_dir }}
           -DCMAKE_CXX_COMPILER=clang++
-          -DENABLE_TESTING=ON
-          -DENABLE_MULTITHREADING=OFF
-          -DFLOAT_PRECISION=${{ matrix.precision }}
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_ENABLE_OMP=OFF
+          -DQUEST_FLOAT_PRECISION=${{ matrix.precision }}
           -DCMAKE_CXX_FLAGS="${{ env.sanitiser_flags }}"
           -DCMAKE_EXE_LINKER_FLAGS="${{ env.sanitiser_flags }}"
 
@@ -92,9 +92,9 @@ jobs:
       - name: Configure CMake
         run: >
           cmake -B ${{ env.build_dir }}
-          -DENABLE_TESTING=ON
-          -DENABLE_MULTITHREADING=OFF
-          -DFLOAT_PRECISION=${{ matrix.precision }}
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_ENABLE_OMP=OFF
+          -DQUEST_FLOAT_PRECISION=${{ matrix.precision }}
 
       - name: Compile QuEST
         run: cmake --build ${{ env.build_dir }} --parallel
@@ -147,8 +147,8 @@ jobs:
         run: >
           cmake -B .
           -DCMAKE_BUILD_TYPE=Release
-          -DENABLE_TESTING=ON
-          -DENABLE_MULTITHREADING=OFF
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_ENABLE_OMP=OFF
           -DCMAKE_CXX_FLAGS="--coverage"
           -DCMAKE_EXE_LINKER_FLAGS="--coverage"
 

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
@@ -39,14 +39,21 @@ jobs:
   # test only compilation succeeds (no execution)
   build-test:
     name: >
-      ${{ matrix.os == 'ubuntu-latest' && 'Linux' || matrix.os == 'macos-latest' && 'MacOS' || 'Windows' }}
+      ${{ startsWith(matrix.os, 'ubuntu' ) && 'Linux'   || 
+          startsWith(matrix.os, 'macos'  ) && 'MacOS'   ||
+          startsWith(matrix.os, 'windows') && 'Windows' ||
+                                              'Unknown' }}
+      ${{ endsWith(matrix.os, '15-intel') && '(Intel 2015)' || '' }}
+      ${{ endsWith(matrix.os, '26-intel') && '(Intel 2026)' || '' }}
       [${{ matrix.precision }}]
       ${{ matrix.omp       == 'ON' && 'OMP'  || '' }}
       ${{ matrix.mpi       == 'ON' && 'MPI'  || '' }}
       ${{ matrix.mpi       == 'ON' && format('({0})',matrix.mpilib) || '' }}
       ${{ matrix.cuda      == 'ON' && 'CUDA' || '' }}
       ${{ matrix.hip       == 'ON' && 'HIP'  || '' }}
       ${{ matrix.cuquantum == 'ON' && 'CUQ'  || '' }}
+      ${{ matrix.adios2    == 'ON' && 'CKPT' || '' }}
+      ${{ matrix.bmi2      == 'ON' && 'BMI'  || '' }}
 
     runs-on: ${{ matrix.os }}
 
@@ -58,15 +65,18 @@ jobs:
       # (causes CUDA and MPI installation to fail on Windows)
       max-parallel: 8
 
-      # compile QuEST with all combinations of below flags
+      # compile QuEST with all combinations of below flags (Intel runners for BMI2 intrinsics);
+      # incredibly, this (with exclusions below) achieves 256 combos, which is the GitHub limit!
       matrix:
-        os: [windows-latest, ubuntu-latest, macos-latest]
+        os: [windows-2022, ubuntu-latest, macos-latest, macos-15-intel, macos-26-intel]
         precision: [1, 2, 4]
         omp:       [ON, OFF]
         mpi:       [ON, OFF]
         cuda:      [ON, OFF]
         hip:       [ON, OFF]
         cuquantum: [ON, OFF]
+        adios2:    [ON, OFF]
+        bmi2:      [ON, OFF]
         mpilib:    ['', 'mpich', 'ompi', 'impi', 'msmpi']
 
         # disable deprecated API on MSVC, and assign unique compilers,
@@ -80,7 +90,13 @@ jobs:
           - os: macos-latest
             compiler: clang++
             deprecated: ON
-          - os: windows-latest
+          - os: macos-15-intel
+            compiler: clang++
+            deprecated: ON
+          - os: macos-26-intel
+            compiler: clang++
+            deprecated: ON
+          - os: windows-2022
             compiler: cl
             deprecated: OFF
 
@@ -102,15 +118,43 @@ jobs:
           # cannot use GPU on MacOS
           - cuda: ON
             os: macos-latest
+          - cuda: ON
+            os: macos-15-intel
+          - cuda: ON
+            os: macos-26-intel
           - hip: ON
             os: macos-latest
+          - hip: ON
+            os: macos-15-intel
+          - hip: ON
+            os: macos-26-intel
 
           # cannot use cuquantum on Windows or MacOS
           - cuquantum: ON
-            os: windows-latest
+            os: windows-2022
+          - cuquantum: ON
+            os: macos-latest
+          - cuquantum: ON
+            os: macos-15-intel
           - cuquantum: ON
+            os: macos-26-intel
+
+          # cannot use BMI2 on non-Intel MacOS
+          - bmi2: ON
             os: macos-latest
 
+          # use ONLY BMI2 on Intel MacOS, just to shrink matrix (GitHub imposes 256 max)
+          - bmi2: OFF
+            os: macos-15-intel
+          - bmi2: OFF
+            os: macos-26-intel
+
+          # do not combine BMI2 with MPI or ADIOS2 (they don't interact), just to shrink matrix
+          - bmi2: ON
+            mpi: ON
+          - bmi2: ON
+            adios2: ON
+
           # don't enumerate MPI libraries when not using MPI
           - mpi: OFF
             mpilib: 'mpich' # MPICH
@@ -130,14 +174,22 @@ jobs:
             mpilib: 'msmpi'    # MacOS:   [MPICH, OpenMPI]
           - os: macos-latest
             mpilib: 'impi'
-          - os: windows-latest
+          - os: macos-15-intel
+            mpilib: 'msmpi'
+          - os: macos-15-intel
+            mpilib: 'impi'
+          - os: macos-26-intel
+            mpilib: 'msmpi'
+          - os: macos-26-intel
+            mpilib: 'impi'
+          - os: windows-2022
             mpilib: 'mpich'    # Windows: [Intel MPI, MS MPI]
-          - os: windows-latest
+          - os: windows-2022
             mpilib: 'ompi'
 
           # cannot presently install HIP on Windows CI (times out)
           - hip: ON
-            os: windows-latest
+            os: windows-2022
 
           # cannot presently compile HIP + MPI; the linker fails with
           # "undefined reference to 'vtable for thrust::system::system_error'
@@ -211,42 +263,60 @@ jobs:
       - name: Install ROCm
         if: ${{ matrix.hip == 'ON' }}
         run: |
+          sudo apt update
+          sudo apt upgrade
           sudo apt install "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)"
-          sudo apt install python3-setuptools python3-wheel
-          sudo usermod -a -G render,video $USER
-          wget https://repo.radeon.com/amdgpu-install/6.3.3/ubuntu/noble/amdgpu-install_6.3.60303-1_all.deb
-          sudo apt install ./amdgpu-install_6.3.60303-1_all.deb
+          # Make the directory if it doesn't exist yet.
+          # This location is recommended by the distribution maintainers.
+          sudo mkdir --parents --mode=0755 /etc/apt/keyrings
+          # Download the key, convert the signing-key to a full
+          # keyring required by apt and store in the keyring directory
+          wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
+          gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
+          sudo tee /etc/apt/sources.list.d/amdgpu.list << EOF
+          deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.2 noble main
+          deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/graphics/7.2/ubuntu noble main
+          EOF
+          sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF
+          Package: *
+          Pin: release o=repo.radeon.com
+          Pin-Priority: 600
+          EOF
           sudo apt update
-          sudo apt install amdgpu-dkms rocm
+          sudo apt autoremove
+          sudo apt install rocm rocm-hip-sdk rocm-hip-runtime-dev
           echo "${{ env.rocm_path }}" >> $GITHUB_PATH
 
       # invoke cmake, disabling LTO (it duplicates symbols with CUDA + MPI)
       - name: Configure CMake
         run: >
           cmake -B ${{ env.build_dir }}
-          -DBUILD_EXAMPLES=ON
-          -DENABLE_TESTING=ON
-          -DFLOAT_PRECISION=${{ matrix.precision }}
-          -DENABLE_DEPRECATED_API=${{ matrix.deprecated }}
-          -DDISABLE_DEPRECATION_WARNINGS=${{ matrix.deprecated }}
-          -DENABLE_MULTITHREADING=${{ matrix.omp }}
-          -DENABLE_DISTRIBUTION=${{ matrix.mpi }}
-          -DENABLE_CUDA=${{ matrix.cuda }}
-          -DENABLE_HIP=${{ matrix.hip }}
-          -DENABLE_CUQUANTUM=${{ matrix.cuquantum }}
+          -DQUEST_BUILD_EXAMPLES=ON
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_FLOAT_PRECISION=${{ matrix.precision }}
+          -DQUEST_ENABLE_DEPRECATED_API=${{ matrix.deprecated }}
+          -DQUEST_DISABLE_DEPRECATION_WARNINGS=${{ matrix.deprecated }}
+          -DQUEST_ENABLE_OMP=${{ matrix.omp }}
+          -DQUEST_ENABLE_MPI=${{ matrix.mpi }}
+          -DQUEST_ENABLE_CUDA=${{ matrix.cuda }}
+          -DQUEST_ENABLE_HIP=${{ matrix.hip }}
+          -DQUEST_ENABLE_CUQUANTUM=${{ matrix.cuquantum }}
+          -DQUEST_ENABLE_ADIOS2=${{ matrix.adios2 }}
+          -DQUEST_ENABLE_BMI2=${{ matrix.bmi2 }}
           -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
           -DCMAKE_HIP_ARCHITECTURES=${{ env.hip_arch }}
           -DCMAKE_CXX_COMPILER=${{ matrix.compiler }}
           -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}
 
-      # force 'Release' build (needed by MSVC to enable optimisations)
+      # force 'Release' build (needed by MSVC to enable optimisations),
+      # and force serial compilation to avoid ADIOS2 OOM error
       - name: Compile
-        run: cmake --build ${{ env.build_dir }} --config Release --parallel
+        run: cmake --build ${{ env.build_dir }} --config Release --parallel 1
 
       # run all compiled isolated examples to test for link-time errors,
       # continuing if any fail (since some deliberately fail)
       - name: Run isolated examples (Windows)
-        if: ${{ matrix.os == 'windows-latest' }}
+        if: ${{ matrix.os == 'windows-2022' }}
         working-directory: ${{ env.isolated_dir }}/Release/
         shell: pwsh
         run: |
@@ -256,7 +326,7 @@ jobs:
             & $_.FullName
           }
       - name: Run isolated examples (Unix)
-        if: ${{ matrix.os != 'windows-latest' }}
+        if: ${{ matrix.os != 'windows-2022' }}
         working-directory: ${{ env.isolated_dir }}
         run: |
           for fn in *_c *_cpp; do
@@ -266,7 +336,7 @@ jobs:
 
       # run all compiled 'automated' examples
       - name: Run automated examples (Windows)
-        if: ${{ matrix.os == 'windows-latest' }}
+        if: ${{ matrix.os == 'windows-2022' }}
         working-directory: ${{ env.automated_dir }}/Release/
         shell: pwsh
         run: |
@@ -276,7 +346,7 @@ jobs:
             & $_.FullName
           }
       - name: Run automated examples (Unix)
-        if: ${{ matrix.os != 'windows-latest' }}
+        if: ${{ matrix.os != 'windows-2022' }}
         working-directory: ${{ env.automated_dir }}
         run: |
           for fn in *_c *_cpp; do

diff --git a/.github/workflows/test_edgecases.yaml b/.github/workflows/test_edgecases.yaml
@@ -0,0 +1,111 @@
+# Some free, ad hoc tests which test only specific
+# functions in specific regimes, covering edge cases.
+# These are intended as a stopgap in the interim, until
+# an improved test harness is available.
+#
+# We here test only CPU functions which have
+# template parameters (like NumCtrls or NumTargs)
+# which inform an optimised definition for MORE than
+# 5 involved qubits. For example, multiControlledSWAP
+# has optimised treatment of up to 5 control qubits,
+# while needing 2 target qubits; so requires a Qureg
+# of at least 8 qubits to trigger its unoptimised
+# version (receiving 6 control qubits and 2 targets).
+# 
+# We alas do NOT here test the fully-unoptimised
+# versions of multi-ctrl multi-targ functions (such
+# as applyMultiControlledCompMatr), because that
+# requires reaching 6 ctrls + 6 targs, and ergo a
+# Qureg of at least 12 qubits. So large a Hilbert
+# space is alas too slow for the reference maths of
+# our unit tests. Curse ye, exponential!
+# 
+# @author Tyson Jones
+
+name: test (edge cases)
+
+
+on:
+  push:
+    branches:
+      - main
+      - devel
+  pull_request:
+    branches:
+      - main
+      - devel
+
+
+jobs:
+
+  # run only some non-parallelised v4 unit tests at default (double) precision
+  serial-unit-test:
+    name: >
+      ${{ startsWith(matrix.os, 'ubuntu' ) && 'Linux'   || 
+          startsWith(matrix.os, 'macos'  ) && 'MacOS'   ||
+          startsWith(matrix.os, 'windows') && 'Windows' ||
+                                              'Unknown' }}
+      ${{ endsWith(matrix.os, '26-intel') && '(Intel 2026)' || '' }}
+      serial
+      ${{ matrix.bmi2      == 'ON' && '(BMI)'  || '' }}
+      edgecases
+
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      # continue other jobs if any fail
+      fail-fast: false
+
+      # we will compile QuEST at default (double) precision with no parallelisation, both with and without Intel BMI2
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest, macos-26-intel]
+        bmi2:      [ON, OFF]
+
+        exclude:
+          # cannot use BMI2 on non-Intel MacOS
+          - bmi2: ON
+            os: macos-latest
+
+    # constants; all tests are non-comprehensive for speed
+    env:
+      build_dir: "build"
+      full_tests_regex: "Swap|CompMatr1|CompMatr2"
+      partial_tests_regex: "^applyMultiStateControlled(CompMatr|DiagMatr|PauliStr|PauliGadget)$"
+      QUEST_TEST_NUM_QUBITS_IN_QUREG: 8
+      QUEST_TEST_MAX_NUM_QUBIT_PERMUTATIONS: 1
+      QUEST_TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS: 1
+
+    # perform the job
+    steps:
+      - name: Get QuEST
+        uses: actions/checkout@v4  # pinned to a release tag; '@main' tracks a moving branch which can change CI behaviour without notice
+
+      # compile serial unit tests
+      - name: Configure CMake
+        run: >
+          cmake -B ${{ env.build_dir }}
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_ENABLE_OMP=OFF
+          -DQUEST_ENABLE_BMI2=${{ matrix.bmi2 }}
+
+      # force 'Release' build (needed by MSVC to enable optimisations)
+      - name: Compile
+        run: cmake --build ${{ env.build_dir }} --config Release
+
+      # test statevector and density-matrix functions which exhibit
+      # optimisations on up to 5 ctrl + 2 target qubits, with 8-qubit
+      # Quregs, in order to test the non-optimised implementation
+      - name: 8-qubit edge-cases (full coverage)
+        run: ctest -C Release -R "${{ env.full_tests_regex }}"
+        working-directory: ${{ env.build_dir }}
+
+      # it is our desire to test the "partial_tests" functions with
+      # 12-qubit Quregs, so that NumCtrls=6 while NumTargs=6 could be
+      # achieved, testing the fully unoptimised versions - but alas
+      # it's intractable for our slow reference maths! So we instead
+      # at least test the NumCtrls=6 and NumTargs=6 cases independently,
+      # requiring only 7-qubit Quregs, but we use 8 for consistency
+      # with above
+      - name: 8-qubit edge-cases (partial coverage)
+        run: ctest -C Release -R "${{ env.partial_tests_regex }}"
+        working-directory: ${{ env.build_dir }}