diff --git a/internal/cnpgi/common/check.go b/internal/cnpgi/common/check.go index 8cf9dd2a..719b0e5e 100644 --- a/internal/cnpgi/common/check.go +++ b/internal/cnpgi/common/check.go @@ -28,21 +28,33 @@ import ( ) // CheckBackupDestination checks if the backup destination is suitable -// to archive WALs +// to archive WALs. +// +// The timeline parameter, when > 0, is passed to +// barman-cloud-check-wal-archive via --timeline so that WAL from earlier +// timelines (expected after a failover) does not cause the check to fail. func CheckBackupDestination( ctx context.Context, barmanConfiguration *cnpgv1.BarmanObjectStoreConfiguration, barmanArchiver *archiver.WALArchiver, serverName string, + timeline int, ) error { contextLogger := log.FromContext(ctx) contextLogger.Info( - "Checking backup destination with barman-cloud-wal-archive", - "serverName", serverName) + "Checking backup destination with barman-cloud-check-wal-archive", + "serverName", serverName, + "timeline", timeline) + + // Build options, passing --timeline when available so that WAL from + // earlier timelines in the archive is accepted after a failover. + var opts []archiver.CheckWalArchiveOption + if timeline > 0 { + opts = append(opts, archiver.WithTimeline(timeline)) + } - // Get WAL archive options checkWalOptions, err := barmanArchiver.BarmanCloudCheckWalArchiveOptions( - ctx, barmanConfiguration, serverName) + ctx, barmanConfiguration, serverName, opts...) if err != nil { log.Error(err, "while getting barman-cloud-wal-archive options") return err diff --git a/internal/cnpgi/common/timeline.go b/internal/cnpgi/common/timeline.go new file mode 100644 index 00000000..7637874e --- /dev/null +++ b/internal/cnpgi/common/timeline.go @@ -0,0 +1,92 @@ +/* +Copyright © contributors to CloudNativePG, established as +CloudNativePG a Series of LF Projects, LLC. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +SPDX-License-Identifier: Apache-2.0 +*/ + +package common + +import ( + "context" + "fmt" + "os/exec" + "regexp" + "strconv" + "strings" + + "github.com/cloudnative-pg/machinery/pkg/log" +) + +var timelineRe = regexp.MustCompile(`Latest checkpoint's TimeLineID:\s+(\d+)`) + +// currentTimeline returns the server's current PostgreSQL timeline by +// parsing pg_controldata output. +// +// This is reliable for the promotion case because PostgreSQL performs a +// synchronous end-of-recovery checkpoint (which updates the control file) +// before the server starts accepting connections and before the WAL +// archiver is signaled. By the time this function is called during the +// first WAL archive attempt, the control file reflects the promoted +// timeline. +// +// Returns an error if the timeline cannot be determined. Callers must NOT +// silently fall back to omitting --timeline, as that reintroduces the +// original "Expected empty archive" bug after failover. +func currentTimeline(ctx context.Context, pgDataPath string) (int, error) { + contextLogger := log.FromContext(ctx) + + cmd := exec.CommandContext(ctx, "pg_controldata", pgDataPath) // #nosec G204 + cmd.Env = append(cmd.Environ(), "LC_ALL=C") + out, err := cmd.Output() + if err != nil { + return 0, fmt.Errorf( + "pg_controldata exec failed (PGDATA=%s): %w; "+ + "WAL archive check cannot run safely without a timeline — "+ + "set annotation cnpg.io/skipEmptyWalArchiveCheck=enabled "+ + "as a manual workaround", + pgDataPath, err) + } + + tl, err := parseTimelineIDFromPgControldataOutput(string(out), pgDataPath) + if err != nil { + return 0, err + } + + contextLogger.Info("Detected PostgreSQL timeline from pg_controldata", + "timeline", tl) + return tl, nil +} + +// parseTimelineIDFromPgControldataOutput extracts Latest checkpoint's TimeLineID +// from pg_controldata stdout. pgDataPath is used only in error messages. +func parseTimelineIDFromPgControldataOutput(out string, pgDataPath string) (int, error) { + matches := timelineRe.FindStringSubmatch(out) + if len(matches) < 2 { + return 0, fmt.Errorf( + "could not parse TimeLineID from pg_controldata output "+ + "(PGDATA=%s); set annotation "+ + "cnpg.io/skipEmptyWalArchiveCheck=enabled as a manual "+ + "workaround", + pgDataPath) + } + + tl, err := strconv.Atoi(strings.TrimSpace(matches[1])) + if err != nil { + return 0, fmt.Errorf("parse timeline %q: %w", matches[1], err) + } + + return tl, nil +} diff --git a/internal/cnpgi/common/timeline_test.go b/internal/cnpgi/common/timeline_test.go new file mode 100644 index 00000000..8c0f4209 --- /dev/null +++ b/internal/cnpgi/common/timeline_test.go @@ -0,0 +1,99 @@ +/* +Copyright © contributors to CloudNativePG, established as +CloudNativePG a Series of LF Projects, LLC. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +SPDX-License-Identifier: Apache-2.0 +*/ + +package common + +import ( + "strings" + "testing" +) + +func TestParseTimelineIDFromPgControldataOutput(t *testing.T) { + pgData := "/var/lib/postgresql/data/pgdata" + + tests := []struct { + name string + out string + want int + wantErr bool + errHasPGData bool // if true, error must mention pgData path (parse-not-found cases) + }{ + { + name: "typical_pg_controldata_snippet", + out: ` +Database cluster state: in production +Latest checkpoint location: 0/3000028 +Latest checkpoint's REDO location: 0/3000028 +Latest checkpoint's TimeLineID: 2 +Latest checkpoint's REDO WAL file: 000000010000000000000003 +`, + want: 2, + wantErr: false, + }, + { + name: "timeline_one", + out: `Latest checkpoint's TimeLineID: 1 +`, + want: 1, + wantErr: false, + }, + { + name: "missing_timeline_line", + out: "Database cluster state: in production\n", + want: 0, + wantErr: true, + errHasPGData: true, + }, + { + name: "empty", + out: "", + want: 0, + wantErr: true, + errHasPGData: true, + }, + { + name: "overflow_timeline", + out: `Latest checkpoint's TimeLineID: 999999999999999999999999999999999999 +`, + want: 0, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := parseTimelineIDFromPgControldataOutput(tt.out, pgData) + if tt.wantErr { + if err == nil { + t.Fatal("expected error") + } + if tt.errHasPGData && !strings.Contains(err.Error(), pgData) { + t.Errorf("error should mention PGDATA path: %v", err) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tt.want { + t.Errorf("got %d, want %d", got, tt.want) + } + }) + } +} diff --git a/internal/cnpgi/common/wal.go b/internal/cnpgi/common/wal.go index 8e58cb42..fe3abeb1 100644 --- a/internal/cnpgi/common/wal.go +++ b/internal/cnpgi/common/wal.go @@ -155,18 +155,34 @@ func (w WALServiceImplementation) Archive( return nil, err } - // Step 2: Check if the archive location is safe to perform archiving + // Step 2: Check if the archive location is safe to perform archiving. + // This is a one-time check gated by the .check-empty-wal-archive flag + // file, which is deleted after the first successful WAL archive. + // Timeline detection only runs here — steady-state archiving is + // completely unaffected. checkFileExisting, err := fileutils.FileExists(emptyWalArchiveFile) if err != nil { return nil, fmt.Errorf("while checking for empty wal archive check file %q: %w", emptyWalArchiveFile, err) } if utils.IsEmptyWalArchiveCheckEnabled(&configuration.Cluster.ObjectMeta) && checkFileExisting { + // Detect the server's current timeline from pg_controldata so + // barman-cloud-check-wal-archive can tolerate WAL from earlier + // timelines in the archive (expected after a failover). + timeline, err := currentTimeline(ctx, w.PGDataPath) + if err != nil { + contextLogger.Error(err, + "Cannot determine PostgreSQL timeline for WAL archive check; "+ + "archive attempt will be retried by PostgreSQL") + return nil, err + } + if err := CheckBackupDestination( ctx, &objectStore.Spec.Configuration, arch, configuration.ServerName, + timeline, ); err != nil { return nil, err } diff --git a/internal/cnpgi/restore/restore.go b/internal/cnpgi/restore/restore.go index 0aad1e55..ff3405db 100644 --- a/internal/cnpgi/restore/restore.go +++ b/internal/cnpgi/restore/restore.go @@ -285,9 +285,11 @@ func (impl *JobHookImpl) checkBackupDestination( } } - // Check if we're ok to archive in the desired destination + // Check if we're ok to archive in the desired destination. + // During restore/bootstrap, timeline is 0 (omit --timeline) so the + // check remains strict — the archive must be empty. if utils.IsEmptyWalArchiveCheckEnabled(&cluster.ObjectMeta) { - return common.CheckBackupDestination(ctx, barmanConfiguration, walArchiver, serverName) + return common.CheckBackupDestination(ctx, barmanConfiguration, walArchiver, serverName, 0) } return nil