Commit f775476 (1 parent: 2ce220a)
fix: nkode-review round 2 — security + full inspect findings

Findings addressed:
- Multi-worker resume: sync and async workers now attempt VectorBackup.load() before VectorBackup.create(), resuming from partial backups on re-run
- Python 3.8 compat: replaced str.removesuffix() with Path.with_suffix('')
- Rollback progress counter: count only keys with actual originals, not all keys
- Codespell: renamed the 'nd' variable to 'num_indexed' in the e2e scripts

Tests added:
- TestRollbackCLI: header path derivation, iter_batches restore, edge cases

nkode-review results:
- security: 0 confirmed findings (2 informational residual risks)
- inspect --full: 5 findings, all addressed

5 files changed: 215 additions, 93 deletions
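Before the per-file diffs, here is the resume idiom the commit applies in both workers, reduced to a minimal sketch (names are taken from the diffs below; per the new unit tests, VectorBackup.load() returns None when no usable shard exists at the path):

    from redisvl.migration.backup import VectorBackup

    # Resume from a partial shard if one exists at this path; otherwise
    # start a fresh backup. load() returns None when nothing usable exists.
    backup = VectorBackup.load(backup_path)
    if backup is None:
        backup = VectorBackup.create(
            path=backup_path,
            index_name=index_name,
            fields=datatype_changes,
            batch_size=batch_size,
        )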

redisvl/cli/migrate.py (4 additions, 2 deletions)
@@ -399,7 +399,7 @@ def rollback(self):
             sys.exit(1)

         # Derive backup base paths (strip .header suffix)
-        backup_paths = [str(h).removesuffix(".header") for h in header_files]
+        backup_paths = [str(h.with_suffix("")) for h in header_files]

         client = RedisConnectionFactory.get_redis_connection(redis_url=redis_url)
         total_restored = 0

@@ -419,13 +419,15 @@ def rollback(self):
             batch_count = 0
             for keys, originals in backup.iter_batches():
                 pipe = client.pipeline(transaction=False)
+                batch_restored = 0
                 for key in keys:
                     if key in originals:
                         for field_name, original_bytes in originals[key].items():
                             pipe.hset(key, field_name, original_bytes)
+                            batch_restored += 1
                 pipe.execute()
                 batch_count += 1
-                total_restored += len(keys)
+                total_restored += batch_restored
                 if batch_count % 10 == 0:
                     print(
                         f" Restored {total_restored:,} vectors "

redisvl/migration/quantize.py (128 additions, 83 deletions)
@@ -157,40 +157,65 @@ def _worker_quantize(

     client = RedisConnectionFactory.get_redis_connection(redis_url=redis_url)
     try:
-        # Phase 1: Dump originals to backup shard
-        backup = VectorBackup.create(
-            path=backup_path,
-            index_name=index_name,
-            fields=datatype_changes,
-            batch_size=batch_size,
-        )
+        # Try to resume from existing backup shard first
+        backup = VectorBackup.load(backup_path)
+        if backup is not None:
+            logger.info(
+                "Worker %d: resuming from existing backup (phase=%s, "
+                "dump_batches=%d, quantize_batches=%d)",
+                worker_id,
+                backup.header.phase,
+                backup.header.dump_completed_batches,
+                backup.header.quantize_completed_batches,
+            )
+        else:
+            backup = VectorBackup.create(
+                path=backup_path,
+                index_name=index_name,
+                fields=datatype_changes,
+                batch_size=batch_size,
+            )

         total = len(keys)
-        for batch_start in range(0, total, batch_size):
-            batch_keys = keys[batch_start : batch_start + batch_size]
-            originals = pipeline_read_vectors(client, batch_keys, datatype_changes)
-            backup.write_batch(batch_start // batch_size, batch_keys, originals)
-            if progress_callback:
-                progress_callback(
-                    "dump", worker_id, min(batch_start + batch_size, total)
-                )
-
-        backup.mark_dump_complete()
-
-        # Phase 2: Convert + write from backup
-        backup.start_quantize()
-        docs_quantized = 0
-
-        for batch_idx, (batch_keys, originals) in enumerate(backup.iter_batches()):
-            converted = convert_vectors(originals, datatype_changes)
-            if converted:
-                pipeline_write_vectors(client, converted)
-            backup.mark_batch_quantized(batch_idx)
-            docs_quantized += len(batch_keys)
-            if progress_callback:
-                progress_callback("quantize", worker_id, docs_quantized)
-
-        backup.mark_complete()
+
+        # Phase 1: Dump originals to backup shard (skip if already complete)
+        if backup.header.phase == "dump":
+            start_batch = backup.header.dump_completed_batches
+            for batch_start in range(start_batch * batch_size, total, batch_size):
+                batch_keys = keys[batch_start : batch_start + batch_size]
+                originals = pipeline_read_vectors(client, batch_keys, datatype_changes)
+                backup.write_batch(batch_start // batch_size, batch_keys, originals)
+                if progress_callback:
+                    progress_callback(
+                        "dump", worker_id, min(batch_start + batch_size, total)
+                    )
+            backup.mark_dump_complete()
+
+        # Phase 2: Convert + write from backup (skip completed batches)
+        if backup.header.phase in ("ready", "active"):
+            backup.start_quantize()
+            docs_quantized = 0
+
+            for batch_idx, (batch_keys, originals) in enumerate(backup.iter_batches()):
+                if batch_idx < backup.header.quantize_completed_batches:
+                    docs_quantized += len(batch_keys)
+                    continue
+                converted = convert_vectors(originals, datatype_changes)
+                if converted:
+                    pipeline_write_vectors(client, converted)
+                backup.mark_batch_quantized(batch_idx)
+                docs_quantized += len(batch_keys)
+                if progress_callback:
+                    progress_callback("quantize", worker_id, docs_quantized)
+
+            backup.mark_complete()
+        elif backup.header.phase == "completed":
+            # Already done from previous run
+            docs_quantized = total
+
         return {"worker_id": worker_id, "docs": docs_quantized}
     finally:
         try:
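Condensed, the phase handling both workers now share looks like this (phase names and header fields are taken from the hunk above; bodies elided):

    if backup.header.phase == "dump":
        ...  # resume Phase 1 at header.dump_completed_batches, then mark_dump_complete()
    if backup.header.phase in ("ready", "active"):
        ...  # resume Phase 2, skipping batch_idx < header.quantize_completed_batches
    elif backup.header.phase == "completed":
        docs_quantized = total  # a previous run already finished this shard

The two separate if statements matter: a worker that resumes mid-dump falls straight through into the quantize branch, assuming mark_dump_complete() advances the phase to "ready", as the second check implies.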
@@ -309,62 +334,82 @@ async def _async_worker_quantize(

     client = aioredis.from_url(redis_url)
     try:
-        # Phase 1: Dump originals
-        backup = VectorBackup.create(
-            path=backup_path,
-            index_name=index_name,
-            fields=datatype_changes,
-            batch_size=batch_size,
-        )
+        # Try to resume from existing backup shard first
+        backup = VectorBackup.load(backup_path)
+        if backup is not None:
+            logger.info(
+                "Async worker %d: resuming from existing backup (phase=%s, "
+                "dump_batches=%d, quantize_batches=%d)",
+                worker_id,
+                backup.header.phase,
+                backup.header.dump_completed_batches,
+                backup.header.quantize_completed_batches,
+            )
+        else:
+            backup = VectorBackup.create(
+                path=backup_path,
+                index_name=index_name,
+                fields=datatype_changes,
+                batch_size=batch_size,
+            )

         total = len(keys)
         field_names = list(datatype_changes.keys())

-        for batch_start in range(0, total, batch_size):
-            batch_keys = keys[batch_start : batch_start + batch_size]
-            pipe = client.pipeline(transaction=False)
-            call_order: List[tuple] = []
-            for key in batch_keys:
-                for field_name in field_names:
-                    pipe.hget(key, field_name)
-                    call_order.append((key, field_name))
-            results = await pipe.execute()
-
-            originals: Dict[str, Dict[str, bytes]] = {}
-            for (key, field_name), value in zip(call_order, results):
-                if value is not None:
-                    if key not in originals:
-                        originals[key] = {}
-                    originals[key][field_name] = value
-
-            backup.write_batch(batch_start // batch_size, batch_keys, originals)
-            if progress_callback:
-                progress_callback(
-                    "dump", worker_id, min(batch_start + batch_size, total)
-                )
-
-        backup.mark_dump_complete()
-
-        # Phase 2: Convert + write from backup
-        backup.start_quantize()
-        docs_quantized = 0
-
-        for batch_idx, (batch_keys, batch_originals) in enumerate(
-            backup.iter_batches()
-        ):
-            converted = convert_vectors(batch_originals, datatype_changes)
-            if converted:
+        # Phase 1: Dump originals (skip if already complete)
+        if backup.header.phase == "dump":
+            start_batch = backup.header.dump_completed_batches
+            for batch_start in range(start_batch * batch_size, total, batch_size):
+                batch_keys = keys[batch_start : batch_start + batch_size]
                 pipe = client.pipeline(transaction=False)
-                for key, fields in converted.items():
-                    for field_name, data in fields.items():
-                        pipe.hset(key, field_name, data)
-                await pipe.execute()
-            backup.mark_batch_quantized(batch_idx)
-            docs_quantized += len(batch_keys)
-            if progress_callback:
-                progress_callback("quantize", worker_id, docs_quantized)
-
-        backup.mark_complete()
+                call_order: List[tuple] = []
+                for key in batch_keys:
+                    for field_name in field_names:
+                        pipe.hget(key, field_name)
+                        call_order.append((key, field_name))
+                results = await pipe.execute()
+
+                originals: Dict[str, Dict[str, bytes]] = {}
+                for (key, field_name), value in zip(call_order, results):
+                    if value is not None:
+                        if key not in originals:
+                            originals[key] = {}
+                        originals[key][field_name] = value
+
+                backup.write_batch(batch_start // batch_size, batch_keys, originals)
+                if progress_callback:
+                    progress_callback(
+                        "dump", worker_id, min(batch_start + batch_size, total)
+                    )
+            backup.mark_dump_complete()
+
+        # Phase 2: Convert + write from backup (skip completed batches)
+        if backup.header.phase in ("ready", "active"):
+            backup.start_quantize()
+            docs_quantized = 0
+
+            for batch_idx, (batch_keys, batch_originals) in enumerate(
+                backup.iter_batches()
+            ):
+                if batch_idx < backup.header.quantize_completed_batches:
+                    docs_quantized += len(batch_keys)
+                    continue
+                converted = convert_vectors(batch_originals, datatype_changes)
+                if converted:
+                    pipe = client.pipeline(transaction=False)
+                    for key, fields in converted.items():
+                        for field_name, data in fields.items():
+                            pipe.hset(key, field_name, data)
+                    await pipe.execute()
+                backup.mark_batch_quantized(batch_idx)
+                docs_quantized += len(batch_keys)
+                if progress_callback:
+                    progress_callback("quantize", worker_id, docs_quantized)
+
+            backup.mark_complete()
+        elif backup.header.phase == "completed":
+            docs_quantized = total

         return {"worker_id": worker_id, "docs": docs_quantized}
     finally:
         await client.aclose()
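The change pays off at the orchestration layer: because each worker owns one shard path, the whole command can simply be re-run after a crash and every shard resumes from its recorded phase. A hypothetical driver (not part of this commit; the worker's parameter list is inferred from the hunk above):

    import asyncio

    async def run_workers(shards, index_name, datatype_changes, batch_size, redis_url):
        # shards: list of (keys, backup_path) pairs, one per worker.
        # Re-running this after a crash is safe: each worker resumes its shard.
        tasks = [
            _async_worker_quantize(
                worker_id=i,
                keys=shard_keys,
                backup_path=shard_path,
                index_name=index_name,
                datatype_changes=datatype_changes,
                batch_size=batch_size,
                redis_url=redis_url,
                progress_callback=None,
            )
            for i, (shard_keys, shard_path) in enumerate(shards)
        ]
        return await asyncio.gather(*tasks)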

scripts/test_crash_resume_e2e.py (4 additions, 4 deletions)
@@ -69,12 +69,12 @@ def create_index_and_load(r):
     for _ in range(60):
         info = r.execute_command("FT.INFO", INDEX_NAME)
         info_dict = dict(zip(info[::2], info[1::2]))
-        nd = int(info_dict.get(b"num_docs", info_dict.get("num_docs", 0)))
-        if nd >= NUM_DOCS:
+        num_indexed = int(info_dict.get(b"num_docs", info_dict.get("num_docs", 0)))
+        if num_indexed >= NUM_DOCS:
             break
         time.sleep(0.5)
-    log(f"Index ready: {nd:,} docs indexed")
-    return nd
+    log(f"Index ready: {num_indexed:,} docs indexed")
+    return num_indexed


 def verify_vectors(r, expected_bytes, label=""):
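The double lookup around num_docs exists because FT.INFO replies as a flat name/value list whose keys arrive as bytes on a default client and as str when the client was created with decode_responses=True. A minimal illustration (assumes a local Redis with the index already created; INDEX_NAME stands in for the script's constant):

    import redis

    INDEX_NAME = "idx"  # hypothetical, matches the script's constant in spirit
    r = redis.Redis()   # default client: reply keys are bytes, e.g. b"num_docs"
    info = r.execute_command("FT.INFO", INDEX_NAME)
    info_dict = dict(zip(info[::2], info[1::2]))
    # Works for both client configurations (bytes keys or decoded str keys):
    num_indexed = int(info_dict.get(b"num_docs", info_dict.get("num_docs", 0)))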

scripts/test_migration_e2e.py (4 additions, 4 deletions)
@@ -166,19 +166,19 @@ def create_index_and_load(r):
     for attempt in range(7200):
         info = r.execute_command("FT.INFO", INDEX_NAME)
         info_dict = dict(zip(info[::2], info[1::2]))
-        nd = int(info_dict.get(b"num_docs", info_dict.get("num_docs", 0)))
+        num_indexed = int(info_dict.get(b"num_docs", info_dict.get("num_docs", 0)))
         pct = float(info_dict.get(b"percent_indexed",
                                   info_dict.get("percent_indexed", "0")))
         if pct >= 1.0:
             break
         if attempt % 15 == 0:
             elapsed_idx = time.perf_counter() - idx_start
-            log(f" indexing: {nd:,}/{NUM_DOCS:,} docs "
+            log(f" indexing: {num_indexed:,}/{NUM_DOCS:,} docs "
                 f"({pct*100:.1f}%, {elapsed_idx:.0f}s elapsed)...")
         time.sleep(1)
     idx_elapsed = time.perf_counter() - idx_start
-    log(f" Index ready: {nd:,} docs indexed in {idx_elapsed:.1f}s")
-    return nd
+    log(f" Index ready: {num_indexed:,} docs indexed in {idx_elapsed:.1f}s")
+    return num_indexed


 def verify_vectors(r, expected_dtype, bytes_per_element, sample_size=10000):

tests/unit/test_vector_backup.py (75 additions, 0 deletions)
@@ -353,3 +353,78 @@ def test_rollback_reads_all_originals(self, tmp_path):
         assert len(all_originals) == 4
         for key in ["doc:0", "doc:1", "doc:2", "doc:3"]:
             assert all_originals[key]["embedding"] == vecs[key]
+
+
+class TestRollbackCLI:
+    """Tests for the rvl migrate rollback CLI command path derivation and restore logic."""
+
+    def _create_backup_with_data(self, tmp_path, name="test_idx"):
+        """Helper: create a backup with 2 batches of data."""
+        from redisvl.migration.backup import VectorBackup
+
+        bp = str(tmp_path / f"migration_backup_{name}")
+        vecs = {
+            "doc:0": struct.pack("<4f", 1.0, 2.0, 3.0, 4.0),
+            "doc:1": struct.pack("<4f", 5.0, 6.0, 7.0, 8.0),
+        }
+        backup = VectorBackup.create(
+            path=bp,
+            index_name=name,
+            fields={"embedding": {"source": "float32", "target": "float16", "dims": 4}},
+            batch_size=1,
+        )
+        backup.write_batch(0, ["doc:0"], {"doc:0": {"embedding": vecs["doc:0"]}})
+        backup.write_batch(1, ["doc:1"], {"doc:1": {"embedding": vecs["doc:1"]}})
+        backup.mark_dump_complete()
+        return bp, vecs
+
+    def test_header_path_derivation_no_removesuffix(self, tmp_path):
+        """Verify path derivation works without str.removesuffix (Python 3.8 compat)."""
+        from pathlib import Path
+
+        bp, _ = self._create_backup_with_data(tmp_path)
+        header_files = sorted(Path(tmp_path).glob("*.header"))
+        assert len(header_files) == 1
+        # This is how the CLI derives backup paths — must not use removesuffix
+        derived = str(header_files[0].with_suffix(""))
+        assert derived == bp
+
+    def test_rollback_restores_via_iter_batches(self, tmp_path):
+        """Verify rollback reads all batches and gets correct original vectors."""
+        from redisvl.migration.backup import VectorBackup
+
+        bp, vecs = self._create_backup_with_data(tmp_path)
+        backup = VectorBackup.load(bp)
+        assert backup is not None
+
+        restored = {}
+        for batch_keys, originals in backup.iter_batches():
+            for key in batch_keys:
+                if key in originals:
+                    restored[key] = originals[key]
+
+        assert len(restored) == 2
+        assert restored["doc:0"]["embedding"] == vecs["doc:0"]
+        assert restored["doc:1"]["embedding"] == vecs["doc:1"]
+
+    def test_rollback_nonexistent_dir(self):
+        """Verify error handling for missing backup directory."""
+        import os
+
+        assert not os.path.isdir("/nonexistent/backup/dir/xyz123")
+
+    def test_rollback_empty_dir(self, tmp_path):
+        """Verify no header files found in empty directory."""
+        from pathlib import Path
+
+        header_files = sorted(Path(tmp_path).glob("*.header"))
+        assert len(header_files) == 0
+
+    def test_rollback_unloadable_backup_returns_none(self, tmp_path):
+        """VectorBackup.load returns None for corrupt/missing data."""
+        from redisvl.migration.backup import VectorBackup
+
+        # Nothing was ever written at this path, so load() finds no header
+        bp = str(tmp_path / "bad_backup")
+        result = VectorBackup.load(bp)
+        assert result is None