Skip to content

Commit dd118ef

Browse files
committed
Move PPR C++ to gigl/csrc following PyTorch csrc conventions
1 parent 906df01 commit dd118ef

8 files changed

Lines changed: 441 additions & 468 deletions

File tree

gigl/csrc/distributed/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Re-export the compiled PPR forward-push kernel, failing loudly with a
# build hint when the C++ extension has not been compiled yet.
try:
    from gigl.csrc.distributed.ppr_forward_push import PPRForwardPushState
except ImportError as e:
    _msg = (
        "PPR C++ extension not compiled. "
        "Run `make build_cpp_extensions` from the GiGL root to build it."
    )
    raise ImportError(_msg) from e

__all__ = ["PPRForwardPushState"]
Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
#include "ppr_forward_push.h"
2+
3+
// Construct per-seed PPR state for a batch of seed nodes.
//
// seed_nodes:                 1-D int64 tensor of seed node IDs.
// seed_node_type_id:          node type shared by every seed.
// alpha:                      restart probability of the PPR random walk.
// requeue_threshold_factor:   multiplied by a node's degree to get its
//                             re-enqueue threshold.
// node_type_to_edge_type_ids: outgoing edge type IDs per node type.
// edge_type_to_dst_ntype_id:  destination node type per edge type.
// degree_tensors:             per-node-type int32 degree lookup tensors.
PPRForwardPushState::PPRForwardPushState(
    torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha,
    double requeue_threshold_factor,
    std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
    std::vector<int32_t> edge_type_to_dst_ntype_id, std::vector<torch::Tensor> degree_tensors)
    : alpha_(alpha),
      one_minus_alpha_(1.0 - alpha),
      requeue_threshold_factor_(requeue_threshold_factor),
      // std::move transfers ownership of each vector into the member
      // variable without copying its contents.
      node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)),
      edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)),
      degree_tensors_(std::move(degree_tensors)) {
  TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D");
  // The accessor below assumes int64 data; fail fast with a clear message
  // instead of tripping an internal accessor assert on a wrong dtype.
  TORCH_CHECK(seed_nodes.scalar_type() == torch::kLong, "seed_nodes must be int64");
  // An out-of-range seed node type would index past the per-type vectors
  // allocated below — undefined behavior. Validate up front.
  TORCH_CHECK(seed_node_type_id >= 0 &&
                  seed_node_type_id <
                      static_cast<int32_t>(node_type_to_edge_type_ids_.size()),
              "seed_node_type_id ", seed_node_type_id, " out of range");
  batch_size_ = static_cast<int32_t>(seed_nodes.size(0));
  num_node_types_ = static_cast<int32_t>(node_type_to_edge_type_ids_.size());

  // Allocate per-seed, per-node-type tables.
  // .assign(n, val) fills a vector with n copies of val.
  ppr_scores_.assign(batch_size_,
                     std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
  residuals_.assign(batch_size_,
                    std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
  queue_.assign(batch_size_, std::vector<std::unordered_set<int32_t>>(num_node_types_));
  queued_nodes_.assign(batch_size_,
                       std::vector<std::unordered_set<int32_t>>(num_node_types_));

  // accessor<dtype, ndim>() returns a typed view into the tensor's data
  // that supports [i] indexing (bounds-checked in debug builds).
  auto acc = seed_nodes.accessor<int64_t, 1>();
  num_nodes_in_queue_ = batch_size_;
  for (int32_t i = 0; i < batch_size_; ++i) {
    int32_t seed = static_cast<int32_t>(acc[i]);
    // PPR initialisation: each seed starts with residual = alpha (the
    // restart probability). The first push will move alpha into ppr_score
    // and distribute (1-alpha)*alpha to the seed's neighbors.
    residuals_[i][seed_node_type_id][seed] = alpha_;
    queue_[i][seed_node_type_id].insert(seed);
  }
}
44+
45+
// Drain every seed's live queue into the queued_nodes_ snapshot and report
// which (node, edge type) neighbor lists still need to be fetched.
//
// Returns:
//   std::nullopt     — all queues were empty: the push loop has converged.
//   empty map        — nodes were drained but every neighbor list is cached.
//   {eid -> tensor}  — 1-D int64 node-ID tensors needing a neighbor fetch.
std::optional<std::unordered_map<int32_t, torch::Tensor>> PPRForwardPushState::drain_queue() {
  if (num_nodes_in_queue_ == 0) {
    return std::nullopt;
  }

  // Discard the snapshot left over from the previous iteration.
  for (auto& per_seed : queued_nodes_) {
    for (auto& snapshot : per_seed) {
      snapshot.clear();
    }
  }

  // pending[eid] = deduplicated node IDs that need a neighbor fetch for
  // edge type eid this round; a node queued by several seeds is only
  // fetched once per edge type.
  std::unordered_map<int32_t, std::unordered_set<int32_t>> pending;

  for (int32_t seed_idx = 0; seed_idx < batch_size_; ++seed_idx) {
    for (int32_t ntype = 0; ntype < num_node_types_; ++ntype) {
      auto& live = queue_[seed_idx][ntype];
      if (live.empty()) {
        continue;
      }

      // O(1) hand-off of the live queue into the snapshot (no data copy).
      auto& snapshot = queued_nodes_[seed_idx][ntype];
      snapshot = std::move(live);
      live.clear();
      num_nodes_in_queue_ -= static_cast<int32_t>(snapshot.size());

      for (int32_t node_id : snapshot) {
        for (int32_t eid : node_type_to_edge_type_ids_[ntype]) {
          if (neighbor_cache_.count(pack_key(node_id, eid)) == 0) {
            pending[eid].insert(node_id);
          }
        }
      }
    }
  }

  std::unordered_map<int32_t, torch::Tensor> lookup_tensors;
  for (const auto& [eid, node_set] : pending) {
    std::vector<int64_t> ids(node_set.begin(), node_set.end());
    lookup_tensors[eid] = torch::tensor(ids, torch::kLong);
  }
  return lookup_tensors;
}
87+
88+
void PPRForwardPushState::push_residuals(
89+
const std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>&
90+
fetched_by_etype_id) {
91+
// Step 1: Unpack the input map into a C++ map keyed by pack_key(node_id, etype_id)
92+
// for fast lookup during the residual-push loop below.
93+
std::unordered_map<uint64_t, std::vector<int32_t>> fetched;
94+
for (const auto& [eid, tup] : fetched_by_etype_id) {
95+
const auto& node_ids_t = std::get<0>(tup);
96+
const auto& flat_nbrs_t = std::get<1>(tup);
97+
const auto& counts_t = std::get<2>(tup);
98+
99+
// accessor<int64_t, 1>() gives a bounds-checked, typed 1-D view into
100+
// each tensor's data — equivalent to iterating over a NumPy array.
101+
auto node_acc = node_ids_t.accessor<int64_t, 1>();
102+
auto nbr_acc = flat_nbrs_t.accessor<int64_t, 1>();
103+
auto cnt_acc = counts_t.accessor<int64_t, 1>();
104+
105+
// Walk the flat neighbor list, slicing out each node's neighbors using
106+
// the running offset into the concatenated flat buffer.
107+
int64_t offset = 0;
108+
for (int64_t i = 0; i < node_ids_t.size(0); ++i) {
109+
int32_t nid = static_cast<int32_t>(node_acc[i]);
110+
int64_t count = cnt_acc[i];
111+
std::vector<int32_t> nbrs(count);
112+
for (int64_t j = 0; j < count; ++j)
113+
nbrs[j] = static_cast<int32_t>(nbr_acc[offset + j]);
114+
fetched[pack_key(nid, eid)] = std::move(nbrs);
115+
offset += count;
116+
}
117+
}
118+
119+
// Step 2: For every node that was in the queue (captured in queued_nodes_
120+
// by drain_queue()), apply one PPR push step:
121+
// a. Absorb residual into the PPR score.
122+
// b. Distribute (1-alpha) * residual equally to each neighbor.
123+
// c. Enqueue any neighbor whose residual now exceeds the requeue threshold.
124+
for (int32_t s = 0; s < batch_size_; ++s) {
125+
for (int32_t nt = 0; nt < num_node_types_; ++nt) {
126+
if (queued_nodes_[s][nt].empty())
127+
continue;
128+
129+
for (int32_t src : queued_nodes_[s][nt]) {
130+
auto& src_res = residuals_[s][nt];
131+
auto it = src_res.find(src);
132+
double res = (it != src_res.end()) ? it->second : 0.0;
133+
134+
// a. Absorb: move residual into the PPR score.
135+
ppr_scores_[s][nt][src] += res;
136+
src_res[src] = 0.0;
137+
138+
int32_t total_deg = get_total_degree(src, nt);
139+
// Destination-only nodes absorb residual but do not push further.
140+
if (total_deg == 0)
141+
continue;
142+
143+
// b. Distribute: each neighbor receives an equal share.
144+
double res_per_nbr = one_minus_alpha_ * res / static_cast<double>(total_deg);
145+
146+
for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
147+
// Invariant: fetched and neighbor_cache_ are mutually exclusive for
148+
// any given (node, etype) key within one iteration. drain_queue()
149+
// only requests a fetch for nodes absent from neighbor_cache_, so a
150+
// key is in at most one of the two.
151+
const std::vector<int32_t>* nbr_list = nullptr;
152+
auto fi = fetched.find(pack_key(src, eid));
153+
if (fi != fetched.end()) {
154+
nbr_list = &fi->second;
155+
} else {
156+
auto ci = neighbor_cache_.find(pack_key(src, eid));
157+
if (ci != neighbor_cache_.end())
158+
nbr_list = &ci->second;
159+
}
160+
if (!nbr_list || nbr_list->empty())
161+
continue;
162+
163+
int32_t dst_nt = edge_type_to_dst_ntype_id_[eid];
164+
165+
// c. Accumulate residual for each neighbor and re-enqueue if threshold
166+
// exceeded.
167+
for (int32_t nbr : *nbr_list) {
168+
residuals_[s][dst_nt][nbr] += res_per_nbr;
169+
170+
double threshold = requeue_threshold_factor_ *
171+
static_cast<double>(get_total_degree(nbr, dst_nt));
172+
173+
if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() &&
174+
residuals_[s][dst_nt][nbr] >= threshold) {
175+
queue_[s][dst_nt].insert(nbr);
176+
++num_nodes_in_queue_;
177+
178+
// Promote neighbor lists to the persistent cache: this node will
179+
// be processed next iteration, so caching avoids a re-fetch.
180+
for (int32_t peid : node_type_to_edge_type_ids_[dst_nt]) {
181+
uint64_t pk = pack_key(nbr, peid);
182+
if (neighbor_cache_.find(pk) == neighbor_cache_.end()) {
183+
auto pfi = fetched.find(pk);
184+
if (pfi != fetched.end())
185+
neighbor_cache_[pk] = pfi->second;
186+
}
187+
}
188+
}
189+
}
190+
}
191+
}
192+
}
193+
}
194+
}
195+
196+
// Select the top-k highest-scoring PPR nodes per seed, per node type.
//
// Returns {ntype_id: (flat_ids, flat_weights, valid_counts)}; only node
// types that received at least one PPR score appear. For a batch of B
// seeds, seed 0's nodes occupy flat_ids[0 : valid_counts[0]], seed 1's the
// next valid_counts[1] entries, and so on.
std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
PPRForwardPushState::extract_top_k(int32_t max_ppr_nodes) {
  // Collect node types that accumulated a score for at least one seed.
  std::unordered_set<int32_t> active_ntypes;
  for (int32_t seed_idx = 0; seed_idx < batch_size_; ++seed_idx) {
    for (int32_t ntype = 0; ntype < num_node_types_; ++ntype) {
      if (!ppr_scores_[seed_idx][ntype].empty()) {
        active_ntypes.insert(ntype);
      }
    }
  }

  std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> result;
  for (int32_t ntype : active_ntypes) {
    std::vector<int64_t> ids_out;
    std::vector<float> weights_out;
    std::vector<int64_t> counts_out;

    for (int32_t seed_idx = 0; seed_idx < batch_size_; ++seed_idx) {
      const auto& score_map = ppr_scores_[seed_idx][ntype];
      int32_t k = std::min(max_ppr_nodes, static_cast<int32_t>(score_map.size()));
      if (k > 0) {
        // Only the first k entries need to be ordered, so partial_sort
        // (descending by score) beats a full sort.
        std::vector<std::pair<int32_t, double>> ranked(score_map.begin(), score_map.end());
        auto by_score_desc = [](const auto& lhs, const auto& rhs) {
          return lhs.second > rhs.second;
        };
        std::partial_sort(ranked.begin(), ranked.begin() + k, ranked.end(), by_score_desc);

        for (int32_t i = 0; i < k; ++i) {
          ids_out.push_back(static_cast<int64_t>(ranked[i].first));
          // Emit float32; internal accumulation stays double to limit
          // rounding error in the push loop.
          weights_out.push_back(static_cast<float>(ranked[i].second));
        }
      }
      counts_out.push_back(static_cast<int64_t>(k));
    }

    result[ntype] = {torch::tensor(ids_out, torch::kLong),
                     torch::tensor(weights_out, torch::kFloat),
                     torch::tensor(counts_out, torch::kLong)};
  }
  return result;
}
235+
236+
int32_t PPRForwardPushState::get_total_degree(int32_t node_id, int32_t ntype_id) const {
237+
if (ntype_id >= static_cast<int32_t>(degree_tensors_.size()))
238+
return 0;
239+
const auto& t = degree_tensors_[ntype_id];
240+
if (t.numel() == 0)
241+
return 0;
242+
TORCH_CHECK(node_id < static_cast<int32_t>(t.size(0)), "Node ID ", node_id,
243+
" out of range for degree tensor of ntype_id ", ntype_id, " (size=", t.size(0),
244+
"). This indicates corrupted graph data or a sampler bug.");
245+
// data_ptr<int32_t>() returns a raw C pointer to the tensor's int32 data buffer.
246+
return t.data_ptr<int32_t>()[node_id];
247+
}
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
#pragma once
2+
3+
#include <torch/torch.h>
4+
5+
#include <algorithm> // std::partial_sort, std::min
6+
#include <cstdint> // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t
7+
#include <optional> // std::optional for nullable return values
8+
#include <tuple> // std::tuple for multi-value returns
9+
#include <unordered_map> // std::unordered_map — like Python dict, O(1) average lookup
10+
#include <unordered_set> // std::unordered_set — like Python set, O(1) average lookup
11+
#include <vector> // std::vector — like Python list, contiguous in memory
12+
13+
// Pack (node_id, etype_id) into a single 64-bit hash-map key. One 64-bit
// integer is cheaper to hash than a pair of two integers (std::unordered_map
// has no built-in pair hash).
//
// Bit layout: node_id occupies bits 63-32, etype_id bits 31-0.
//
// Each input is cast through uint32_t before widening: a direct
// int32_t -> uint64_t cast would sign-extend negative values (e.g.
// -1 = 0xFFFFFFFF) and corrupt the upper half after the shift, whereas
// uint32_t reinterprets the bit pattern as-is.
static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) {
  const uint64_t hi = static_cast<uint32_t>(node_id);
  const uint64_t lo = static_cast<uint32_t>(etype_id);
  return (hi << 32) | lo;
}
29+
30+
// C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006).
31+
//
32+
// All hot-loop state (scores, residuals, queue, neighbor cache) lives inside
33+
// this object. The distributed neighbor fetch is kept in Python because it
34+
// involves async RPC calls that C++ cannot drive directly.
35+
//
36+
// Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache.
37+
// Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors).
38+
//
39+
// Typical call sequence per batch:
40+
// 1. PPRForwardPushState(seed_nodes, ...) — init per-seed residuals / queue
41+
// while True:
42+
// 2. drain_queue() — drain queue → nodes needing lookup
43+
// 3. <Python: _batch_fetch_neighbors(...)> — distributed RPC fetch (stays in Python)
44+
// 4. push_residuals(fetched_by_etype_id) — push residuals, update queue
45+
// 5. extract_top_k(max_ppr_nodes) — top-k selection per seed per node type
46+
class PPRForwardPushState {
47+
public:
48+
PPRForwardPushState(torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha,
49+
double requeue_threshold_factor,
50+
std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
51+
std::vector<int32_t> edge_type_to_dst_ntype_id,
52+
std::vector<torch::Tensor> degree_tensors);
53+
54+
// Drain all queued nodes and return {etype_id: tensor[node_ids]} for batch
55+
// neighbor lookup. Also snapshots the drained nodes into queued_nodes_ for
56+
// use by push_residuals().
57+
//
58+
// Return value semantics:
59+
// - std::nullopt → queue was already empty; convergence achieved; stop the loop.
60+
// - empty map → nodes were drained but all were cached; call push_residuals({}).
61+
// - non-empty map → {etype_id → 1-D int64 tensor of node IDs} needing neighbor lookup.
62+
std::optional<std::unordered_map<int32_t, torch::Tensor>> drain_queue();
63+
64+
// Push residuals to neighbors given the fetched neighbor data.
65+
//
66+
// fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)}
67+
// - node_ids_tensor: [N] int64 — source node IDs fetched for this edge type
68+
// - flat_nbrs_tensor: [sum(counts)] int64 — all neighbor lists concatenated flat
69+
// - counts_tensor: [N] int64 — neighbor count for each source node
70+
void push_residuals(const std::unordered_map<
71+
int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>&
72+
fetched_by_etype_id);
73+
74+
// Extract top-k PPR nodes per seed per node type.
75+
//
76+
// Returns {ntype_id: (flat_ids_tensor, flat_weights_tensor, valid_counts_tensor)}.
77+
// Only node types that received any PPR score are included in the output.
78+
//
79+
// Output layout for a batch of B seeds:
80+
// flat_ids[0 : valid_counts[0]] → top-k nodes for seed 0
81+
// flat_ids[valid_counts[0] : valid_counts[0]+valid_counts[1]] → top-k for seed 1
82+
// ...
83+
std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
84+
extract_top_k(int32_t max_ppr_nodes);
85+
86+
private:
87+
// Look up the total (across all edge types) out-degree of a node.
88+
// Returns 0 for destination-only node types (no outgoing edges).
89+
int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const;
90+
91+
// -------------------------------------------------------------------------
92+
// Scalar algorithm parameters
93+
// -------------------------------------------------------------------------
94+
double alpha_; // Restart probability
95+
double one_minus_alpha_; // 1 - alpha, precomputed to avoid repeated subtraction
96+
double requeue_threshold_factor_; // alpha * eps; multiplied by degree to get per-node threshold
97+
98+
int32_t batch_size_; // Number of seeds in the current batch
99+
int32_t num_node_types_; // Total number of node types (homo + hetero)
100+
int32_t num_nodes_in_queue_{0}; // Running count of nodes across all seeds / types
101+
102+
// -------------------------------------------------------------------------
103+
// Graph structure (read-only after construction)
104+
// -------------------------------------------------------------------------
105+
std::vector<std::vector<int32_t>> node_type_to_edge_type_ids_;
106+
std::vector<int32_t> edge_type_to_dst_ntype_id_;
107+
std::vector<torch::Tensor> degree_tensors_;
108+
109+
// -------------------------------------------------------------------------
110+
// Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id])
111+
// -------------------------------------------------------------------------
112+
std::vector<std::vector<std::unordered_map<int32_t, double>>> ppr_scores_;
113+
std::vector<std::vector<std::unordered_map<int32_t, double>>> residuals_;
114+
std::vector<std::vector<std::unordered_set<int32_t>>> queue_;
115+
std::vector<std::vector<std::unordered_set<int32_t>>> queued_nodes_;
116+
117+
// -------------------------------------------------------------------------
118+
// Neighbor cache
119+
// -------------------------------------------------------------------------
120+
std::unordered_map<uint64_t, std::vector<int32_t>> neighbor_cache_;
121+
};
File renamed without changes.

0 commit comments

Comments
 (0)