Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions internal/cnpgi/common/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,33 @@ import (
)

// CheckBackupDestination checks if the backup destination is suitable
// to archive WALs
// to archive WALs.
//
// The timeline parameter, when > 0, is passed to
// barman-cloud-check-wal-archive via --timeline so that WAL from earlier
// timelines (expected after a failover) does not cause the check to fail.
func CheckBackupDestination(
ctx context.Context,
barmanConfiguration *cnpgv1.BarmanObjectStoreConfiguration,
barmanArchiver *archiver.WALArchiver,
serverName string,
timeline int,
) error {
contextLogger := log.FromContext(ctx)
contextLogger.Info(
"Checking backup destination with barman-cloud-wal-archive",
"serverName", serverName)
"Checking backup destination with barman-cloud-check-wal-archive",
"serverName", serverName,
"timeline", timeline)

// Build options, passing --timeline when available so that WAL from
// earlier timelines in the archive is accepted after a failover.
var opts []archiver.CheckWalArchiveOption
if timeline > 0 {
opts = append(opts, archiver.WithTimeline(timeline))
}

// Get WAL archive options
checkWalOptions, err := barmanArchiver.BarmanCloudCheckWalArchiveOptions(
ctx, barmanConfiguration, serverName)
ctx, barmanConfiguration, serverName, opts...)
if err != nil {
log.Error(err, "while getting barman-cloud-wal-archive options")
return err
Expand Down
92 changes: 92 additions & 0 deletions internal/cnpgi/common/timeline.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
Copyright © contributors to CloudNativePG, established as
CloudNativePG a Series of LF Projects, LLC.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

SPDX-License-Identifier: Apache-2.0
*/

package common

import (
"context"
"fmt"
"os/exec"
"regexp"
"strconv"
"strings"

"github.com/cloudnative-pg/machinery/pkg/log"
)

var timelineRe = regexp.MustCompile(`Latest checkpoint's TimeLineID:\s+(\d+)`)

// currentTimeline returns the server's current PostgreSQL timeline by
// parsing pg_controldata output.
//
// This is reliable for the promotion case because PostgreSQL performs a
// synchronous end-of-recovery checkpoint (which updates the control file)
// before the server starts accepting connections and before the WAL
// archiver is signaled. By the time this function is called during the
// first WAL archive attempt, the control file reflects the promoted
// timeline.
//
// Returns an error if the timeline cannot be determined. Callers must NOT
// silently fall back to omitting --timeline, as that reintroduces the
// original "Expected empty archive" bug after failover.
func currentTimeline(ctx context.Context, pgDataPath string) (int, error) {
contextLogger := log.FromContext(ctx)

cmd := exec.CommandContext(ctx, "pg_controldata", pgDataPath) // #nosec G204
cmd.Env = append(cmd.Environ(), "LC_ALL=C")
out, err := cmd.Output()
if err != nil {
return 0, fmt.Errorf(
"pg_controldata exec failed (PGDATA=%s): %w; "+
"WAL archive check cannot run safely without a timeline — "+
"set annotation cnpg.io/skipEmptyWalArchiveCheck=enabled "+
"as a manual workaround",
pgDataPath, err)
}

tl, err := parseTimelineIDFromPgControldataOutput(string(out), pgDataPath)
if err != nil {
return 0, err
}

contextLogger.Info("Detected PostgreSQL timeline from pg_controldata",
"timeline", tl)
return tl, nil
}

// parseTimelineIDFromPgControldataOutput extracts Latest checkpoint's TimeLineID
// from pg_controldata stdout. pgDataPath is used only in error messages.
func parseTimelineIDFromPgControldataOutput(out string, pgDataPath string) (int, error) {
matches := timelineRe.FindStringSubmatch(out)
if len(matches) < 2 {
return 0, fmt.Errorf(
"could not parse TimeLineID from pg_controldata output "+
"(PGDATA=%s); set annotation "+
"cnpg.io/skipEmptyWalArchiveCheck=enabled as a manual "+
"workaround",
pgDataPath)
}

tl, err := strconv.Atoi(strings.TrimSpace(matches[1]))
if err != nil {
return 0, fmt.Errorf("parse timeline %q: %w", matches[1], err)
}

return tl, nil
}
99 changes: 99 additions & 0 deletions internal/cnpgi/common/timeline_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
Copyright © contributors to CloudNativePG, established as
CloudNativePG a Series of LF Projects, LLC.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

SPDX-License-Identifier: Apache-2.0
*/

package common

import (
"strings"
"testing"
)

func TestParseTimelineIDFromPgControldataOutput(t *testing.T) {
pgData := "/var/lib/postgresql/data/pgdata"

tests := []struct {
name string
out string
want int
wantErr bool
errHasPGData bool // if true, error must mention pgData path (parse-not-found cases)
}{
{
name: "typical_pg_controldata_snippet",
out: `
Database cluster state: in production
Latest checkpoint location: 0/3000028
Latest checkpoint's REDO location: 0/3000028
Latest checkpoint's TimeLineID: 2
Latest checkpoint's REDO WAL file: 000000010000000000000003
`,
want: 2,
wantErr: false,
},
{
name: "timeline_one",
out: `Latest checkpoint's TimeLineID: 1
`,
want: 1,
wantErr: false,
},
{
name: "missing_timeline_line",
out: "Database cluster state: in production\n",
want: 0,
wantErr: true,
errHasPGData: true,
},
{
name: "empty",
out: "",
want: 0,
wantErr: true,
errHasPGData: true,
},
{
name: "overflow_timeline",
out: `Latest checkpoint's TimeLineID: 999999999999999999999999999999999999
`,
want: 0,
wantErr: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := parseTimelineIDFromPgControldataOutput(tt.out, pgData)
if tt.wantErr {
if err == nil {
t.Fatal("expected error")
}
if tt.errHasPGData && !strings.Contains(err.Error(), pgData) {
t.Errorf("error should mention PGDATA path: %v", err)
}
return
}
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got != tt.want {
t.Errorf("got %d, want %d", got, tt.want)
}
})
}
}
18 changes: 17 additions & 1 deletion internal/cnpgi/common/wal.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,18 +155,34 @@ func (w WALServiceImplementation) Archive(
return nil, err
}

// Step 2: Check if the archive location is safe to perform archiving
// Step 2: Check if the archive location is safe to perform archiving.
// This is a one-time check gated by the .check-empty-wal-archive flag
// file, which is deleted after the first successful WAL archive.
// Timeline detection only runs here — steady-state archiving is
// completely unaffected.
checkFileExisting, err := fileutils.FileExists(emptyWalArchiveFile)
if err != nil {
return nil, fmt.Errorf("while checking for empty wal archive check file %q: %w", emptyWalArchiveFile, err)
}

if utils.IsEmptyWalArchiveCheckEnabled(&configuration.Cluster.ObjectMeta) && checkFileExisting {
// Detect the server's current timeline from pg_controldata so
// barman-cloud-check-wal-archive can tolerate WAL from earlier
// timelines in the archive (expected after a failover).
timeline, err := currentTimeline(ctx, w.PGDataPath)
if err != nil {
contextLogger.Error(err,
"Cannot determine PostgreSQL timeline for WAL archive check; "+
"archive attempt will be retried by PostgreSQL")
return nil, err
}

if err := CheckBackupDestination(
ctx,
&objectStore.Spec.Configuration,
arch,
configuration.ServerName,
timeline,
); err != nil {
return nil, err
}
Expand Down
6 changes: 4 additions & 2 deletions internal/cnpgi/restore/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,9 +285,11 @@ func (impl *JobHookImpl) checkBackupDestination(
}
}

// Check if we're ok to archive in the desired destination
// Check if we're ok to archive in the desired destination.
// During restore/bootstrap, timeline is 0 (omit --timeline) so the
// check remains strict — the archive must be empty.
if utils.IsEmptyWalArchiveCheckEnabled(&cluster.ObjectMeta) {
return common.CheckBackupDestination(ctx, barmanConfiguration, walArchiver, serverName)
return common.CheckBackupDestination(ctx, barmanConfiguration, walArchiver, serverName, 0)
}

return nil
Expand Down
Loading