fix(dbt): pick latest run_results entry when duplicate unique_ids exist (#25996)

thiagodeschamps · cursoragent · TeddyCr · harshsoni2024 · commit 1044f0818eaa · 2026-02-23T10:18:43.000+05:30
When multiple run_results files are present (e.g. split by domain or
from hourly partial runs), the same unique_id can appear in more than
one file. Previously, `add_dbt_tests` used `next()`, which picked the
first match; that order is non-deterministic (it depends on S3 listing).

This change introduces `_get_latest_result`, which collects all matches
and returns the one with the most recent `execute` `completed_at`
timestamp. It falls back to the first match if no timestamp is usable.

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Teddy <teddy.crepineau@gmail.com>
diff --git a/ingestion/src/metadata/ingestion/source/database/dbt/metadata.py b/ingestion/src/metadata/ingestion/source/database/dbt/metadata.py
@@ -656,6 +656,45 @@ def yield_dbt_tags(
                     )
                 )
 
+    @staticmethod
+    def _get_latest_result(dbt_objects: DbtObjects, key: str):
+        """
+        When multiple run_results files are present (e.g. split by domain),
+        the same unique_id may appear in more than one file.  Return the
+        result with the most recent ``execute`` completed_at timestamp so
+        that OpenMetadata always reflects the latest test state.
+        """
+        matches = [
+            item
+            for run_result in dbt_objects.dbt_run_results
+            for item in run_result.results
+            if item.unique_id == key
+        ]
+        if not matches:
+            return None  # no result carries this unique_id
+        if len(matches) == 1:
+            return matches[0]  # unique match: no tie-break needed
+
+        def _execute_completed_at(result):
+            for timing in result.timing or []:  # tolerates None/empty timing
+                if timing.name == "execute" and timing.completed_at:
+                    completed = timing.completed_at
+                    if isinstance(completed, str):
+                        try:
+                            return datetime.strptime(
+                                completed, DBT_RUN_RESULT_DATE_FORMAT
+                            )
+                        except ValueError:
+                            return None  # unparseable string: treat as missing
+                    return completed  # non-str value is used as-is
+            return None  # no "execute" timing carrying a completed_at
+
+        timestamped = [(r, _execute_completed_at(r)) for r in matches]
+        with_ts = [(r, ts) for r, ts in timestamped if ts is not None]
+        if with_ts:  # NOTE(review): max() assumes comparable ts values - confirm
+            return max(with_ts, key=lambda pair: pair[1])[0]
+        return matches[0]  # no usable timestamp: first match in iteration order
+
     def add_dbt_tests(
         self, key: str, manifest_node, manifest_entities, dbt_objects: DbtObjects
     ) -> None:
@@ -668,15 +707,9 @@ def add_dbt_tests(
         self.context.get().dbt_tests[key][
             DbtCommonEnum.UPSTREAM.value
         ] = self.parse_upstream_nodes(manifest_entities, manifest_node)
-        self.context.get().dbt_tests[key][DbtCommonEnum.RESULTS.value] = next(
-            (
-                item
-                for run_result in dbt_objects.dbt_run_results
-                for item in run_result.results
-                if item.unique_id == key
-            ),
-            None,
-        )
+        self.context.get().dbt_tests[key][
+            DbtCommonEnum.RESULTS.value
+        ] = self._get_latest_result(dbt_objects, key)
 
     def add_dbt_exposure(self, key: str, manifest_node, manifest_entities):
         exposure_entity = self.parse_exposure_node(manifest_node)
diff --git a/ingestion/tests/unit/test_dbt.py b/ingestion/tests/unit/test_dbt.py
@@ -2585,3 +2585,104 @@ def test_download_dbt_files_with_all_artifacts(self):
             self.assertIsNotNone(result[0].dbt_catalog)
             self.assertIsNotNone(result[0].dbt_run_results)
             self.assertIsNotNone(result[0].dbt_sources)
+
+
+class TestGetLatestResult(TestCase):
+    """
+    Test _get_latest_result picks the most recent result by execute
+    completed_at when the same unique_id appears in multiple run_results files.
+    """
+
+    @staticmethod
+    def _make_result(unique_id, completed_at, status="pass"):
+        timing = MagicMock()
+        timing.name = "execute"  # MagicMock(name=...) names the mock, so assign
+        timing.completed_at = completed_at
+        result = MagicMock()
+        result.unique_id = unique_id
+        result.timing = [timing]
+        result.status = MagicMock(value=status)
+        return result
+
+    @staticmethod
+    def _make_dbt_objects(run_results_list):
+        run_results = []
+        for results in run_results_list:
+            rr = MagicMock()  # one mock per inner list of results
+            rr.results = results
+            run_results.append(rr)
+        dbt_objects = MagicMock()
+        dbt_objects.dbt_run_results = run_results
+        return dbt_objects
+
+    def test_single_match_returned(self):
+        from metadata.ingestion.source.database.dbt.metadata import DbtSource
+
+        result_a = self._make_result("test.pkg.my_test", "2026-02-12T10:00:00.000000Z")
+        dbt_objects = self._make_dbt_objects([[result_a]])
+
+        got = DbtSource._get_latest_result(dbt_objects, "test.pkg.my_test")
+        self.assertIs(got, result_a)
+
+    def test_no_match_returns_none(self):
+        from metadata.ingestion.source.database.dbt.metadata import DbtSource
+
+        result_a = self._make_result("test.pkg.other", "2026-02-12T10:00:00.000000Z")
+        dbt_objects = self._make_dbt_objects([[result_a]])
+
+        got = DbtSource._get_latest_result(dbt_objects, "test.pkg.missing")
+        self.assertIsNone(got)  # no unique_id equals the requested key
+
+    def test_picks_latest_across_files(self):
+        from metadata.ingestion.source.database.dbt.metadata import DbtSource
+
+        old_result = self._make_result(
+            "test.pkg.my_test", "2026-02-12T10:00:00.000000Z", "pass"
+        )
+        new_result = self._make_result(
+            "test.pkg.my_test", "2026-02-12T14:00:00.000000Z", "fail"
+        )
+        dbt_objects = self._make_dbt_objects([[old_result], [new_result]])
+
+        got = DbtSource._get_latest_result(dbt_objects, "test.pkg.my_test")
+        self.assertIs(got, new_result)  # later completed_at wins
+
+    def test_picks_latest_regardless_of_order(self):
+        from metadata.ingestion.source.database.dbt.metadata import DbtSource
+
+        new_result = self._make_result(
+            "test.pkg.my_test", "2026-02-12T14:00:00.000000Z", "fail"
+        )
+        old_result = self._make_result(
+            "test.pkg.my_test", "2026-02-12T10:00:00.000000Z", "pass"
+        )
+        dbt_objects = self._make_dbt_objects([[new_result], [old_result]])
+
+        got = DbtSource._get_latest_result(dbt_objects, "test.pkg.my_test")
+        self.assertIs(got, new_result)  # position in the list is irrelevant
+
+    def test_falls_back_to_first_when_no_timestamps(self):
+        from metadata.ingestion.source.database.dbt.metadata import DbtSource
+
+        result_a = self._make_result("test.pkg.my_test", None, "pass")
+        result_b = self._make_result("test.pkg.my_test", None, "fail")
+        dbt_objects = self._make_dbt_objects([[result_a], [result_b]])
+
+        got = DbtSource._get_latest_result(dbt_objects, "test.pkg.my_test")
+        self.assertIs(got, result_a)  # first match in iteration order
+
+    def test_datetime_objects_handled(self):
+        from datetime import datetime
+
+        from metadata.ingestion.source.database.dbt.metadata import DbtSource
+
+        old_result = self._make_result(
+            "test.pkg.my_test", datetime(2026, 2, 12, 10, 0, 0), "pass"
+        )
+        new_result = self._make_result(
+            "test.pkg.my_test", datetime(2026, 2, 12, 14, 0, 0), "fail"
+        )
+        dbt_objects = self._make_dbt_objects([[old_result], [new_result]])
+
+        got = DbtSource._get_latest_result(dbt_objects, "test.pkg.my_test")
+        self.assertIs(got, new_result)  # datetime values compared unparsed