Skip to content

Commit a1f5bdf

Browse files
committed
fix: pass --timeline to barman-cloud-check-wal-archive after failover
After a failover/promotion, WAL archiving permanently breaks because barman-cloud-check-wal-archive rejects the archive with "Expected empty archive" -- it finds WAL from the previous timeline and treats it as an error. Detect the server's current timeline from pg_controldata (reliable because PostgreSQL's end-of-recovery checkpoint updates the control file before the archiver starts) and pass it via --timeline to the check command. WAL from earlier timelines is then accepted as expected, while WAL from the current timeline in the archive still correctly triggers the safety check. Timeline detection is scoped strictly inside the one-time empty-archive check gate (.check-empty-wal-archive flag file). Steady-state WAL archiving is completely unaffected. During restore/bootstrap, timeline 0 is passed so the check remains strict (archive must be empty). Fixes: #842 Related: #828 Assisted-by: Claude Opus 4.6 Assisted-by: GPT-5.4 in Cursor
1 parent e973d8e commit a1f5bdf

6 files changed

Lines changed: 230 additions & 8 deletions

File tree

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,3 +140,4 @@ require (
140140
sigs.k8s.io/structured-merge-diff/v6 v6.3.2 // indirect
141141
sigs.k8s.io/yaml v1.6.0 // indirect
142142
)
143+

internal/cnpgi/common/check.go

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,21 +28,33 @@ import (
2828
)
2929

3030
// CheckBackupDestination checks if the backup destination is suitable
31-
// to archive WALs
31+
// to archive WALs.
32+
//
33+
// The timeline parameter, when > 0, is passed to
34+
// barman-cloud-check-wal-archive via --timeline so that WAL from earlier
35+
// timelines (expected after a failover) does not cause the check to fail.
3236
func CheckBackupDestination(
3337
ctx context.Context,
3438
barmanConfiguration *cnpgv1.BarmanObjectStoreConfiguration,
3539
barmanArchiver *archiver.WALArchiver,
3640
serverName string,
41+
timeline int,
3742
) error {
3843
contextLogger := log.FromContext(ctx)
3944
contextLogger.Info(
40-
"Checking backup destination with barman-cloud-wal-archive",
41-
"serverName", serverName)
45+
"Checking backup destination with barman-cloud-check-wal-archive",
46+
"serverName", serverName,
47+
"timeline", timeline)
48+
49+
// Build options, passing --timeline when available so that WAL from
50+
// earlier timelines in the archive is accepted after a failover.
51+
var opts []archiver.CheckWalArchiveOption
52+
if timeline > 0 {
53+
opts = append(opts, archiver.WithTimeline(timeline))
54+
}
4255

43-
// Get WAL archive options
4456
checkWalOptions, err := barmanArchiver.BarmanCloudCheckWalArchiveOptions(
45-
ctx, barmanConfiguration, serverName)
57+
ctx, barmanConfiguration, serverName, opts...)
4658
if err != nil {
4759
log.Error(err, "while getting barman-cloud-wal-archive options")
4860
return err

internal/cnpgi/common/timeline.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
Copyright © contributors to CloudNativePG, established as
3+
CloudNativePG a Series of LF Projects, LLC.
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
17+
SPDX-License-Identifier: Apache-2.0
18+
*/
19+
20+
package common
21+
22+
import (
23+
"context"
24+
"fmt"
25+
"os/exec"
26+
"regexp"
27+
"strconv"
28+
"strings"
29+
30+
"github.com/cloudnative-pg/machinery/pkg/log"
31+
)
32+
33+
var timelineRe = regexp.MustCompile(`Latest checkpoint's TimeLineID:\s+(\d+)`)
34+
35+
// currentTimeline returns the server's current PostgreSQL timeline by
36+
// parsing pg_controldata output.
37+
//
38+
// This is reliable for the promotion case because PostgreSQL performs a
39+
// synchronous end-of-recovery checkpoint (which updates the control file)
40+
// before the server starts accepting connections and before the WAL
41+
// archiver is signaled. By the time this function is called during the
42+
// first WAL archive attempt, the control file reflects the promoted
43+
// timeline.
44+
//
45+
// Returns an error if the timeline cannot be determined. Callers must NOT
46+
// silently fall back to omitting --timeline, as that reintroduces the
47+
// original "Expected empty archive" bug after failover.
48+
func currentTimeline(ctx context.Context, pgDataPath string) (int, error) {
49+
contextLogger := log.FromContext(ctx)
50+
51+
cmd := exec.CommandContext(ctx, "pg_controldata", pgDataPath) // #nosec G204
52+
cmd.Env = append(cmd.Environ(), "LC_ALL=C")
53+
out, err := cmd.Output()
54+
if err != nil {
55+
return 0, fmt.Errorf(
56+
"pg_controldata exec failed (PGDATA=%s): %w; "+
57+
"WAL archive check cannot run safely without a timeline — "+
58+
"set annotation cnpg.io/skipEmptyWalArchiveCheck=enabled "+
59+
"as a manual workaround",
60+
pgDataPath, err)
61+
}
62+
63+
tl, err := parseTimelineIDFromPgControldataOutput(string(out), pgDataPath)
64+
if err != nil {
65+
return 0, err
66+
}
67+
68+
contextLogger.Info("Detected PostgreSQL timeline from pg_controldata",
69+
"timeline", tl)
70+
return tl, nil
71+
}
72+
73+
// parseTimelineIDFromPgControldataOutput extracts Latest checkpoint's TimeLineID
74+
// from pg_controldata stdout. pgDataPath is used only in error messages.
75+
func parseTimelineIDFromPgControldataOutput(out string, pgDataPath string) (int, error) {
76+
matches := timelineRe.FindStringSubmatch(out)
77+
if len(matches) < 2 {
78+
return 0, fmt.Errorf(
79+
"could not parse TimeLineID from pg_controldata output "+
80+
"(PGDATA=%s); set annotation "+
81+
"cnpg.io/skipEmptyWalArchiveCheck=enabled as a manual "+
82+
"workaround",
83+
pgDataPath)
84+
}
85+
86+
tl, err := strconv.Atoi(strings.TrimSpace(matches[1]))
87+
if err != nil {
88+
return 0, fmt.Errorf("parse timeline %q: %w", matches[1], err)
89+
}
90+
91+
return tl, nil
92+
}
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/*
2+
Copyright © contributors to CloudNativePG, established as
3+
CloudNativePG a Series of LF Projects, LLC.
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
17+
SPDX-License-Identifier: Apache-2.0
18+
*/
19+
20+
package common
21+
22+
import (
23+
"strings"
24+
"testing"
25+
)
26+
27+
func TestParseTimelineIDFromPgControldataOutput(t *testing.T) {
28+
pgData := "/var/lib/postgresql/data/pgdata"
29+
30+
tests := []struct {
31+
name string
32+
out string
33+
want int
34+
wantErr bool
35+
errHasPGData bool // if true, error must mention pgData path (parse-not-found cases)
36+
}{
37+
{
38+
name: "typical_pg_controldata_snippet",
39+
out: `
40+
Database cluster state: in production
41+
Latest checkpoint location: 0/3000028
42+
Latest checkpoint's REDO location: 0/3000028
43+
Latest checkpoint's TimeLineID: 2
44+
Latest checkpoint's REDO WAL file: 000000010000000000000003
45+
`,
46+
want: 2,
47+
wantErr: false,
48+
},
49+
{
50+
name: "timeline_one",
51+
out: `Latest checkpoint's TimeLineID: 1
52+
`,
53+
want: 1,
54+
wantErr: false,
55+
},
56+
{
57+
name: "missing_timeline_line",
58+
out: "Database cluster state: in production\n",
59+
want: 0,
60+
wantErr: true,
61+
errHasPGData: true,
62+
},
63+
{
64+
name: "empty",
65+
out: "",
66+
want: 0,
67+
wantErr: true,
68+
errHasPGData: true,
69+
},
70+
{
71+
name: "overflow_timeline",
72+
out: `Latest checkpoint's TimeLineID: 999999999999999999999999999999999999
73+
`,
74+
want: 0,
75+
wantErr: true,
76+
},
77+
}
78+
79+
for _, tt := range tests {
80+
t.Run(tt.name, func(t *testing.T) {
81+
got, err := parseTimelineIDFromPgControldataOutput(tt.out, pgData)
82+
if tt.wantErr {
83+
if err == nil {
84+
t.Fatal("expected error")
85+
}
86+
if tt.errHasPGData && !strings.Contains(err.Error(), pgData) {
87+
t.Errorf("error should mention PGDATA path: %v", err)
88+
}
89+
return
90+
}
91+
if err != nil {
92+
t.Fatalf("unexpected error: %v", err)
93+
}
94+
if got != tt.want {
95+
t.Errorf("got %d, want %d", got, tt.want)
96+
}
97+
})
98+
}
99+
}

internal/cnpgi/common/wal.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,18 +155,34 @@ func (w WALServiceImplementation) Archive(
155155
return nil, err
156156
}
157157

158-
// Step 2: Check if the archive location is safe to perform archiving
158+
// Step 2: Check if the archive location is safe to perform archiving.
159+
// This is a one-time check gated by the .check-empty-wal-archive flag
160+
// file, which is deleted after the first successful WAL archive.
161+
// Timeline detection only runs here — steady-state archiving is
162+
// completely unaffected.
159163
checkFileExisting, err := fileutils.FileExists(emptyWalArchiveFile)
160164
if err != nil {
161165
return nil, fmt.Errorf("while checking for empty wal archive check file %q: %w", emptyWalArchiveFile, err)
162166
}
163167

164168
if utils.IsEmptyWalArchiveCheckEnabled(&configuration.Cluster.ObjectMeta) && checkFileExisting {
169+
// Detect the server's current timeline from pg_controldata so
170+
// barman-cloud-check-wal-archive can tolerate WAL from earlier
171+
// timelines in the archive (expected after a failover).
172+
timeline, err := currentTimeline(ctx, w.PGDataPath)
173+
if err != nil {
174+
contextLogger.Error(err,
175+
"Cannot determine PostgreSQL timeline for WAL archive check; "+
176+
"archive attempt will be retried by PostgreSQL")
177+
return nil, err
178+
}
179+
165180
if err := CheckBackupDestination(
166181
ctx,
167182
&objectStore.Spec.Configuration,
168183
arch,
169184
configuration.ServerName,
185+
timeline,
170186
); err != nil {
171187
return nil, err
172188
}

internal/cnpgi/restore/restore.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -285,9 +285,11 @@ func (impl *JobHookImpl) checkBackupDestination(
285285
}
286286
}
287287

288-
// Check if we're ok to archive in the desired destination
288+
// Check if we're ok to archive in the desired destination.
289+
// During restore/bootstrap, timeline is 0 (omit --timeline) so the
290+
// check remains strict — the archive must be empty.
289291
if utils.IsEmptyWalArchiveCheckEnabled(&cluster.ObjectMeta) {
290-
return common.CheckBackupDestination(ctx, barmanConfiguration, walArchiver, serverName)
292+
return common.CheckBackupDestination(ctx, barmanConfiguration, walArchiver, serverName, 0)
291293
}
292294

293295
return nil

0 commit comments

Comments
 (0)