diff --git a/contrib/pg_tde/meson.build b/contrib/pg_tde/meson.build index 42a6bbe6afff3..cbb656d410a26 100644 --- a/contrib/pg_tde/meson.build +++ b/contrib/pg_tde/meson.build @@ -120,6 +120,7 @@ tap_tests = [ 't/pg_waldump_fullpage.pl', 't/replication.pl', 't/rotate_key.pl', + 't/standby_source.pl', 't/tde_heap.pl', 't/unlogged_tables.pl', 't/wal_encrypt.pl', diff --git a/contrib/pg_tde/src/access/pg_tde_tdemap.c b/contrib/pg_tde/src/access/pg_tde_tdemap.c index e67d7169a51e0..5ac15b3f6d8d7 100644 --- a/contrib/pg_tde/src/access/pg_tde_tdemap.c +++ b/contrib/pg_tde/src/access/pg_tde_tdemap.c @@ -44,6 +44,7 @@ #define TDE_FILE_HEADER_SIZE sizeof(TDEFileHeader) #define MaxXLogRecPtr (~(XLogRecPtr)0) +#define MaxTimeLineID (~(TimeLineID)0) typedef struct TDEFileHeader { @@ -369,13 +370,19 @@ pg_tde_delete_principal_key(Oid dbOid) * needs keyfile_path */ void -pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, const char *keyfile_path) +pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, TimeLineID tli, const char *keyfile_path) { LWLock *lock_pk = tde_lwlock_enc_keys(); int fd; off_t read_pos, write_pos, last_key_idx; + struct + { + XLogRecPtr start_lsn; + TimeLineID tli; + } lsn_tli; + LWLockAcquire(lock_pk, LW_EXCLUSIVE); @@ -384,7 +391,10 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, const char *keyfile_path) last_key_idx = ((lseek(fd, 0, SEEK_END) - TDE_FILE_HEADER_SIZE) / MAP_ENTRY_SIZE) - 1; write_pos = TDE_FILE_HEADER_SIZE + (last_key_idx * MAP_ENTRY_SIZE) + offsetof(TDEMapEntry, enc_key) + offsetof(InternalKey, start_lsn); - if (pg_pwrite(fd, &lsn, sizeof(XLogRecPtr), write_pos) != sizeof(XLogRecPtr)) + lsn_tli.start_lsn = lsn; + lsn_tli.tli = tli; + + if (pg_pwrite(fd, &lsn_tli, sizeof(lsn_tli), write_pos) != sizeof(lsn_tli)) { ereport(ERROR, errcode_for_file_access(), @@ -408,7 +418,7 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, const char *keyfile_path) errmsg("could not read previous WAL key: %m")); } - if (prev_map_entry.enc_key.start_lsn >= lsn) + if (prev_map_entry.enc_key.start_lsn >= lsn && prev_map_entry.enc_key.tli >= tli) { prev_map_entry.enc_key.type = TDE_KEY_TYPE_WAL_INVALID; @@ -1071,6 +1081,7 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn) WALKeyCacheRec *wal_rec; InternalKey stub_key = { .start_lsn = InvalidXLogRecPtr, + .tli = 0, }; wal_rec = pg_tde_add_wal_key_to_cache(&stub_key, InvalidXLogRecPtr); @@ -1132,8 +1143,10 @@ pg_tde_add_wal_key_to_cache(InternalKey *key, XLogRecPtr start_lsn) MemoryContextSwitchTo(oldCtx); #endif + wal_rec->start_tli = key->tli; wal_rec->start_lsn = start_lsn; wal_rec->end_lsn = MaxXLogRecPtr; + wal_rec->end_tli = MaxTimeLineID; wal_rec->key = *key; wal_rec->crypt_ctx = NULL; if (!tde_wal_key_last_rec) @@ -1145,6 +1158,7 @@ pg_tde_add_wal_key_to_cache(InternalKey *key, XLogRecPtr start_lsn) { tde_wal_key_last_rec->next = wal_rec; tde_wal_key_last_rec->end_lsn = wal_rec->start_lsn; + tde_wal_key_last_rec->end_tli = wal_rec->start_tli; tde_wal_key_last_rec = wal_rec; } diff --git a/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c b/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c index 3edef134b790d..0ad750c7a5402 100644 --- a/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c +++ b/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c @@ -67,6 +67,7 @@ typedef struct EncryptionStateData { char db_map_path[MAXPGPATH]; pg_atomic_uint64 enc_key_lsn; /* to sync with readers */ + pg_atomic_uint64 enc_key_tli; /* to sync with readers */ } EncryptionStateData; static EncryptionStateData *EncryptionState = NULL; @@ -85,6 +86,18 @@ TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn) pg_atomic_write_u64(&EncryptionState->enc_key_lsn, start_lsn); } +static TimeLineID +TDEXLogGetEncKeyTli() +{ + return (TimeLineID) pg_atomic_read_u64(&EncryptionState->enc_key_tli); +} + +static void +TDEXLogSetEncKeyTli(TimeLineID tli) +{ + pg_atomic_write_u64(&EncryptionState->enc_key_tli, tli); +} + static Size TDEXLogEncryptBuffSize(void); static int XLOGChooseNumBuffers(void); @@ -159,6 +172,7 @@ TDEXLogShmemInit(void) } pg_atomic_init_u64(&EncryptionState->enc_key_lsn, 0); + pg_atomic_init_u64(&EncryptionState->enc_key_tli, 0); elog(DEBUG1, "pg_tde: initialized encryption buffer %lu bytes", TDEXLogEncryptStateSize()); } @@ -169,6 +183,7 @@ typedef struct EncryptionStateData { char db_map_path[MAXPGPATH]; XLogRecPtr enc_key_lsn; /* to sync with reader */ + XLogRecPtr enc_key_tli; /* to sync with reader */ } EncryptionStateData; static EncryptionStateData EncryptionStateD = {0}; @@ -186,7 +201,19 @@ TDEXLogGetEncKeyLsn() static void TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn) { - EncryptionState->enc_key_lsn = EncryptionKey.start_lsn; + EncryptionState->enc_key_lsn = start_lsn; +} + +static TimeLineID +TDEXLogGetEncKeyTli() +{ + return (TimeLineID) EncryptionState->enc_key_tli; +} + +static void +TDEXLogSetEncKeyTli(TimeLineID tli) +{ + EncryptionState->enc_key_lsn = tli; } #endif /* FRONTEND */ @@ -221,6 +248,7 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog) { EncryptionKey = *key; TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn); + TDEXLogSetEncKeyTli(EncryptionKey.tli); } if (key) @@ -245,8 +273,8 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset, #endif #ifdef TDE_XLOG_DEBUG - elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %X/%X", - count, offset, offset, LSN_FORMAT_ARGS(segno), LSN_FORMAT_ARGS(key->start_lsn)); + elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX] tli %u, seg: %X/%X, key_start: %u_%X/%X", + count, offset, offset, tli, LSN_FORMAT_ARGS(segno), key->tli, LSN_FORMAT_ARGS(key->start_lsn)); #endif CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix); @@ -272,9 +300,11 @@ tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset, XLogSegNoOffsetToRecPtr(segno, offset, segSize, lsn); - pg_tde_wal_last_key_set_lsn(lsn, EncryptionState->db_map_path); + pg_tde_wal_last_key_set_lsn(lsn, tli, EncryptionState->db_map_path); EncryptionKey.start_lsn = lsn; + EncryptionKey.tli = tli; TDEXLogSetEncKeyLsn(lsn); + TDEXLogSetEncKeyTli(tli); } if (EncryptionKey.type == TDE_KEY_TYPE_WAL_ENCRYPTED) @@ -291,14 +321,10 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, TimeLineID tli, XLogSegNo segno, int segSize) { ssize_t readsz; - WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys(); - XLogRecPtr write_key_lsn; - XLogRecPtr data_start; - XLogRecPtr data_end; #ifdef TDE_XLOG_DEBUG - elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %X/%X", - count, offset, offset, LSN_FORMAT_ARGS(segno)); + elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], tli: %u, seg: %X/%X", + count, offset, offset, tli, LSN_FORMAT_ARGS(segno)); #endif readsz = pg_pread(fd, buf, count, offset); @@ -306,6 +332,25 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, if (readsz <= 0) return readsz; + TDEXLogCryptBuffer(buf, count, offset, tli, segno, segSize); + + return readsz; +} + +/* + * [De]Crypt buffer if needed based on provided segment offset, number and TLI + */ +void +TDEXLogCryptBuffer(void *buf, size_t count, off_t offset, + TimeLineID tli, XLogSegNo segno, int segSize) +{ + WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys(); + XLogRecPtr write_key_lsn; + XLogRecPtr data_start; + XLogRecPtr data_end; + KeyTliLsn data_start_t = {.tli = tli}; + KeyTliLsn data_end_t = {.tli = tli}; + if (!keys) { /* cache is empty, try to read keys from disk */ @@ -317,11 +362,14 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, if (!XLogRecPtrIsInvalid(write_key_lsn)) { WALKeyCacheRec *last_key = pg_tde_get_last_wal_key(); + KeyTliLsn last_key_time = {.tli = last_key->start_tli, .lsn = last_key->start_lsn}; + KeyTliLsn write_key_time = {.tli = TDEXLogGetEncKeyTli(), .lsn = write_key_lsn}; Assert(last_key); /* write has generated a new key, need to fetch it */ - if (last_key->start_lsn < write_key_lsn) + if (key_tli_lsn_cmp(last_key_time, write_key_time) == -1) + // if (last_key->start_lsn < write_key_lsn) { pg_tde_fetch_wal_keys(write_key_lsn); @@ -331,7 +379,10 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, } XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start); - XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end); + XLogSegNoOffsetToRecPtr(segno, offset + count, segSize, data_end); + + data_start_t.lsn = data_start; + data_end_t.lsn = data_end; /* * TODO: this is higly ineffective. We should get rid of linked list and @@ -339,10 +390,13 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, */ for (WALKeyCacheRec *curr_key = keys; curr_key != NULL; curr_key = curr_key->next) { + KeyTliLsn key_start_t = {.lsn = curr_key->start_lsn, .tli = curr_key->start_tli}; + KeyTliLsn key_end_t = {.lsn = curr_key->end_lsn, .tli = curr_key->end_tli}; + #ifdef TDE_XLOG_DEBUG - elog(DEBUG1, "WAL key %X/%X-%X/%X, encrypted: %s", - LSN_FORMAT_ARGS(curr_key->start_lsn), - LSN_FORMAT_ARGS(curr_key->end_lsn), + elog(DEBUG1, "WAL key %u_%X/%X - %u_%X/%X, encrypted: %s", + curr_key->start_tli, LSN_FORMAT_ARGS(curr_key->start_lsn), + curr_key->end_tli, LSN_FORMAT_ARGS(curr_key->end_lsn), curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED ? "yes" : "no"); #endif @@ -353,7 +407,7 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, * Check if the key's range overlaps with the buffer's and decypt * the part that does. */ - if (data_start < curr_key->end_lsn && data_end > curr_key->start_lsn) + if (key_tli_lsn_cmp(data_start_t, key_end_t) == -1 && key_tli_lsn_cmp(data_end_t, key_start_t) == 1) { char iv_prefix[16]; off_t dec_off = XLogSegmentOffset(Max(data_start, curr_key->start_lsn), segSize); @@ -368,22 +422,20 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, /* We have reached the end of the segment */ if (dec_end == 0) { - dec_end = offset + readsz; + dec_end = offset + count; } dec_sz = dec_end - dec_off; #ifdef TDE_XLOG_DEBUG - elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %X/%X", - dec_off, dec_off - offset, dec_sz, LSN_FORMAT_ARGS(curr_key->key->start_lsn)); + elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu] tli %u, sz: %lu | key %u_%X/%X", + dec_off, dec_off - offset, tli, dec_sz, curr_key->key.tli, LSN_FORMAT_ARGS(curr_key->start_lsn)); #endif pg_tde_stream_crypt(iv_prefix, dec_off, dec_buf, dec_sz, dec_buf, &curr_key->key, &curr_key->crypt_ctx); } } } - - return readsz; } union u128cast diff --git a/contrib/pg_tde/src/include/access/pg_tde_tdemap.h b/contrib/pg_tde/src/include/access/pg_tde_tdemap.h index e6d6d982404ec..e02b1c03a18a8 100644 --- a/contrib/pg_tde/src/include/access/pg_tde_tdemap.h +++ b/contrib/pg_tde/src/include/access/pg_tde_tdemap.h @@ -25,8 +25,33 @@ typedef struct InternalKey uint32 type; XLogRecPtr start_lsn; + TimeLineID tli; } InternalKey; +typedef struct KeyTliLsn +{ + TimeLineID tli; + XLogRecPtr lsn; +} KeyTliLsn; + +static inline int +key_tli_lsn_cmp(KeyTliLsn t1, KeyTliLsn t2) +{ + if (t1.tli < t2.tli) + return -1; + + if (t1.tli > t2.tli) + return 1; + + if (t1.lsn < t2.lsn) + return -1; + + if (t1.lsn > t2.lsn) + return 1; + + return 0; +} + #define MAP_ENTRY_IV_SIZE 16 #define MAP_ENTRY_AEAD_TAG_SIZE 16 @@ -62,6 +87,8 @@ typedef struct WALKeyCacheRec { XLogRecPtr start_lsn; XLogRecPtr end_lsn; + TimeLineID start_tli; + TimeLineID end_tli; InternalKey key; void *crypt_ctx; @@ -73,7 +100,7 @@ extern InternalKey *pg_tde_read_last_wal_key(void); extern WALKeyCacheRec *pg_tde_get_last_wal_key(void); extern WALKeyCacheRec *pg_tde_fetch_wal_keys(XLogRecPtr start_lsn); extern WALKeyCacheRec *pg_tde_get_wal_cache_keys(void); -extern void pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, const char *keyfile_path); +extern void pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, TimeLineID tli, const char *keyfile_path); extern void pg_tde_create_wal_key(InternalKey *rel_key_data, const RelFileLocator *newrlocator, TDEMapEntryType entry_type); #define PG_TDE_MAP_FILENAME "%d_keys" diff --git a/contrib/pg_tde/src/include/access/pg_tde_xlog_smgr.h b/contrib/pg_tde/src/include/access/pg_tde_xlog_smgr.h index 5956c4b9d12fd..6b5a95d3d5b45 100644 --- a/contrib/pg_tde/src/include/access/pg_tde_xlog_smgr.h +++ b/contrib/pg_tde/src/include/access/pg_tde_xlog_smgr.h @@ -12,4 +12,7 @@ extern void TDEXLogShmemInit(void); extern void TDEXLogSmgrInit(void); extern void TDEXLogSmgrInitWrite(bool encrypt_xlog); +extern void TDEXLogCryptBuffer(void *buf, size_t count, off_t offset, + TimeLineID tli, XLogSegNo segno, int segSize); + #endif /* PG_TDE_XLOGSMGR_H */ diff --git a/contrib/pg_tde/t/RewindTest.pm b/contrib/pg_tde/t/RewindTest.pm index 0fa74f2b3150f..2ddc013e9ed95 100644 --- a/contrib/pg_tde/t/RewindTest.pm +++ b/contrib/pg_tde/t/RewindTest.pm @@ -44,6 +44,7 @@ use PostgreSQL::Test::Cluster; use PostgreSQL::Test::RecursiveCopy; use PostgreSQL::Test::Utils; use Test::More; +use pgtde; our @EXPORT = qw( $node_primary @@ -199,7 +200,7 @@ sub create_standby $node_standby = PostgreSQL::Test::Cluster->new( 'standby' . ($extra_name ? "_${extra_name}" : '')); - $node_primary->backup('my_backup'); + PGTDE::backup($node_primary, 'my_backup'); $node_standby->init_from_backup($node_primary, 'my_backup'); my $connstr_primary = $node_primary->connstr(); diff --git a/contrib/pg_tde/t/pgtde.pm b/contrib/pg_tde/t/pgtde.pm index eb5c02b24ec17..5666f52cc3892 100644 --- a/contrib/pg_tde/t/pgtde.pm +++ b/contrib/pg_tde/t/pgtde.pm @@ -108,4 +108,17 @@ sub compare_results return compare($expected_filename_with_path, $out_filename_with_path); } +sub backup +{ + my ($node, $backup_name, %params) = @_; + my $backup_dir = $node->backup_dir . '/'. $backup_name; + + mkdir $backup_dir; + + PostgreSQL::Test::RecursiveCopy::copypath($node->data_dir . '/pg_tde', + $backup_dir . '/pg_tde'); + + $node->backup($backup_name, %params); +} + 1; diff --git a/contrib/pg_tde/t/standby_source.pl b/contrib/pg_tde/t/standby_source.pl new file mode 100644 index 0000000000000..002f99b401d69 --- /dev/null +++ b/contrib/pg_tde/t/standby_source.pl @@ -0,0 +1,183 @@ + +# Copyright (c) 2021-2024, PostgreSQL Global Development Group + +# +# Test using a standby server as the source. +# +# This sets up three nodes: A, B and C. First, A is the primary, +# B follows A, and C follows B: +# +# A (primary) <--- B (standby) <--- C (standby) +# +# +# Then we promote C, and insert some divergent rows in A and C: +# +# A (primary) <--- B (standby) C (primary) +# +# +# Finally, we run pg_rewind on C, to re-point it at B again: +# +# A (primary) <--- B (standby) <--- C (standby) +# +# +# The test is similar to the basic tests, but since we're dealing with +# three nodes, not two, we cannot use most of the RewindTest functions +# as is. + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; +use File::Copy; +use PostgreSQL::Test::Cluster; +use RewindTest; +use pgtde; + +my $tmp_folder = PostgreSQL::Test::Utils::tempdir; + +my $node_a; +my $node_b; +my $node_c; + +# Set up node A, as primary +# +# A (primary) + +setup_cluster('a'); +start_primary(); +$node_a = $node_primary; + +# Create a test table and insert a row in primary. +$node_a->safe_psql('postgres', "CREATE TABLE tbl1 (d text)"); +$node_a->safe_psql('postgres', "INSERT INTO tbl1 VALUES ('in A')"); +primary_psql("CHECKPOINT"); + +# Set up node B and C, as cascaded standbys +# +# A (primary) <--- B (standby) <--- C (standby) +my $backup_name = 'my_backup'; + +PGTDE::backup(node_a, $backup_name); +$node_b = PostgreSQL::Test::Cluster->new('node_b'); +$node_b->init_from_backup($node_a, $backup_name, has_streaming => 1); +$node_b->set_standby_mode(); +$node_b->start; + +PostgreSQL::Test::RecursiveCopy::copypath($node_b->data_dir . '/pg_tde', + $node_b->backup_dir . '/'. $backup_name . '/pg_tde'); +PGTDE::backup(node_b, $backup_name); +$node_c = PostgreSQL::Test::Cluster->new('node_c'); +$node_c->init_from_backup($node_b, $backup_name, has_streaming => 1); +$node_c->set_standby_mode(); +$node_c->start; + +# Insert additional data on A, and wait for both standbys to catch up. +$node_a->safe_psql('postgres', + "INSERT INTO tbl1 values ('in A, before promotion')"); +$node_a->safe_psql('postgres', 'CHECKPOINT'); + +my $lsn = $node_a->lsn('write'); +$node_a->wait_for_catchup('node_b', 'write', $lsn); +$node_b->wait_for_catchup('node_c', 'write', $lsn); + +# Promote C +# +# A (primary) <--- B (standby) C (primary) + +$node_c->promote; + + +# Insert a row in A. This causes A/B and C to have "diverged", so that it's +# no longer possible to just apply the standby's logs over primary directory +# - you need to rewind. +$node_a->safe_psql('postgres', + "INSERT INTO tbl1 VALUES ('in A, after C was promoted')"); + +# make sure it's replicated to B before we continue +$node_a->wait_for_catchup('node_b'); + +# Also insert a new row in the standby, which won't be present in the +# old primary. +$node_c->safe_psql('postgres', + "INSERT INTO tbl1 VALUES ('in C, after C was promoted')"); + + +# +# All set up. We're ready to run pg_rewind. +# +my $node_c_pgdata = $node_c->data_dir; + +# Stop the node and be ready to perform the rewind. +$node_c->stop('fast'); + +# Keep a temporary postgresql.conf or it would be overwritten during the rewind. +copy( + "$node_c_pgdata/postgresql.conf", + "$tmp_folder/node_c-postgresql.conf.tmp"); + +{ + # Temporarily unset PGAPPNAME so that the server doesn't + # inherit it. Otherwise this could affect libpqwalreceiver + # connections in confusing ways. + local %ENV = %ENV; + delete $ENV{PGAPPNAME}; + + # Do rewind using a remote connection as source, generating + # recovery configuration automatically. + command_ok( + [ + 'pg_rewind', "--debug", + "--source-server", $node_b->connstr('postgres'), + "--target-pgdata=$node_c_pgdata", "--no-sync", + "--write-recovery-conf" + ], + 'pg_rewind remote'); +} + +# Now move back postgresql.conf with old settings +move( + "$tmp_folder/node_c-postgresql.conf.tmp", + "$node_c_pgdata/postgresql.conf"); + +# Restart the node. +$node_c->start; + +# set RewindTest::node_primary to point to the rewound node, so that we can +# use check_query() +$node_primary = $node_c; + +# Run some checks to verify that C has been successfully rewound, +# and connected back to follow B. + +check_query( + 'SELECT * FROM tbl1', + qq(in A +in A, before promotion +in A, after C was promoted +), + 'table content after rewind'); + +# Insert another row, and observe that it's cascaded from A to B to C. +$node_a->safe_psql('postgres', + "INSERT INTO tbl1 values ('in A, after rewind')"); + +$node_b->wait_for_replay_catchup('node_c', $node_a); + +check_query( + 'SELECT * FROM tbl1', + qq(in A +in A, before promotion +in A, after C was promoted +in A, after rewind +), + 'table content after rewind and insert'); + +# clean up +$node_a->teardown_node; +$node_b->teardown_node; +$node_c->teardown_node; + +done_testing(); diff --git a/src/bin/pg_basebackup/Makefile b/src/bin/pg_basebackup/Makefile index 26c53e473f560..64147cde3e1ba 100644 --- a/src/bin/pg_basebackup/Makefile +++ b/src/bin/pg_basebackup/Makefile @@ -44,6 +44,17 @@ BBOBJS = \ bbstreamer_tar.o \ bbstreamer_zstd.o +ifeq ($(enable_percona_ext),yes) + +OBJS += \ + xlogreader.o \ + $(top_srcdir)/src/fe_utils/simple_list.o \ + $(top_builddir)/src/libtde/libtdexlog.a \ + $(top_builddir)/src/libtde/libtde.a + +override CPPFLAGS := -I$(top_srcdir)/contrib/pg_tde/src/include -I$(top_srcdir)/contrib/pg_tde/src/libkmip/libkmip/include -DFRONTEND $(CPPFLAGS) +endif + all: pg_basebackup pg_createsubscriber pg_receivewal pg_recvlogical pg_basebackup: $(BBOBJS) $(OBJS) | submake-libpq submake-libpgport submake-libpgfeutils @@ -58,6 +69,9 @@ pg_receivewal: pg_receivewal.o $(OBJS) | submake-libpq submake-libpgport submake pg_recvlogical: pg_recvlogical.o $(OBJS) | submake-libpq submake-libpgport submake-libpgfeutils $(CC) $(CFLAGS) pg_recvlogical.o $(OBJS) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) +xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/% + rm -f $@ && $(LN_S) $< . + install: all installdirs $(INSTALL_PROGRAM) pg_basebackup$(X) '$(DESTDIR)$(bindir)/pg_basebackup$(X)' $(INSTALL_PROGRAM) pg_createsubscriber$(X) '$(DESTDIR)$(bindir)/pg_createsubscriber$(X)' @@ -76,7 +90,7 @@ uninstall: clean distclean: rm -f pg_basebackup$(X) pg_createsubscriber$(X) pg_receivewal$(X) pg_recvlogical$(X) \ $(BBOBJS) pg_createsubscriber.o pg_receivewal.o pg_recvlogical.o \ - $(OBJS) + $(OBJS) xlogreader.c rm -rf tmp_check check: diff --git a/src/bin/pg_basebackup/bbstreamer_file.c b/src/bin/pg_basebackup/bbstreamer_file.c index 0be39dddc977a..ef310a770ec1c 100644 --- a/src/bin/pg_basebackup/bbstreamer_file.c +++ b/src/bin/pg_basebackup/bbstreamer_file.c @@ -18,6 +18,10 @@ #include "common/logging.h" #include "common/string.h" +#ifdef PERCONA_EXT +#include "pg_tde.h" +#endif + typedef struct bbstreamer_plain_writer { bbstreamer base; @@ -226,7 +230,9 @@ bbstreamer_extractor_content(bbstreamer *streamer, bbstreamer_member *member, /* Dispatch based on file type. */ if (member->is_directory) + { extract_directory(mystreamer->filename, member->mode); + } else if (member->is_link) { const char *linktarget = member->linktarget; @@ -236,9 +242,19 @@ bbstreamer_extractor_content(bbstreamer *streamer, bbstreamer_member *member, extract_link(mystreamer->filename, linktarget); } else + { +#ifdef PERCONA_EXT + /* + * Don't rewrite WAL keys and providers. User may have different + * one on source and target. + */ + if (strncmp(member->pathname, "pg_tde/1664_", 12) == 0) + break; +#endif mystreamer->file = create_file_for_extract(mystreamer->filename, member->mode); + } /* Report output file change. */ if (mystreamer->report_output_file) @@ -297,7 +313,8 @@ should_allow_existing_directory(const char *pathname) strcmp(filename, "pg_xlog") == 0 || strcmp(filename, "archive_status") == 0 || strcmp(filename, "summaries") == 0 || - strcmp(filename, "pg_tblspc") == 0) + strcmp(filename, "pg_tblspc") == 0 || + strcmp(filename, PG_TDE_DATA_DIR) == 0) return true; if (strspn(filename, "0123456789") == strlen(filename)) diff --git a/src/bin/pg_basebackup/meson.build b/src/bin/pg_basebackup/meson.build index c00acd5e11828..8e15c24e42725 100644 --- a/src/bin/pg_basebackup/meson.build +++ b/src/bin/pg_basebackup/meson.build @@ -12,10 +12,15 @@ common_sources = files( 'walmethods.c', ) +common_sources += xlogreader_sources + pg_basebackup_deps = [frontend_code, libpq, lz4, zlib, zstd] pg_basebackup_common = static_library('libpg_basebackup_common', common_sources, + c_args: ['-DFRONTEND'], # needed for xlogreader et al + link_with: pg_tde_frontend, dependencies: pg_basebackup_deps, + include_directories: pg_tde_inc, kwargs: internal_lib_args, ) @@ -34,6 +39,7 @@ pg_basebackup = executable('pg_basebackup', link_with: [pg_basebackup_common], dependencies: pg_basebackup_deps, kwargs: default_bin_args, + include_directories: pg_tde_inc, ) bin_targets += pg_basebackup @@ -71,6 +77,7 @@ pg_receivewal = executable('pg_receivewal', link_with: [pg_basebackup_common], dependencies: pg_basebackup_deps, kwargs: default_bin_args, + include_directories: pg_tde_inc, ) bin_targets += pg_receivewal diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 8f3dd04fd2226..2cc6551d35d53 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -38,7 +38,14 @@ #include "receivelog.h" #include "streamutil.h" -#define ERRCODE_DATA_CORRUPTED "XX001" +#ifdef PERCONA_EXT +#include "access/pg_tde_fe_init.h" +#include "access/pg_tde_xlog_smgr.h" +#include "access/xlog_smgr.h" +#include "pg_tde.h" +#endif + +#define ERRCODE_DATA_CORRUPTED_BCP "XX001" typedef struct TablespaceListCell { @@ -389,6 +396,12 @@ tablespace_list_append(const char *arg) static void usage(void) { +#ifdef PERCONA_EXT +#define WAL_METHODS "none|stream" +#else +#define WAL_METHODS "none|fetch|stream" +#endif + printf(_("%s takes a base backup of a running PostgreSQL server.\n\n"), progname); printf(_("Usage:\n")); @@ -407,7 +420,7 @@ usage(void) printf(_(" -T, --tablespace-mapping=OLDDIR=NEWDIR\n" " relocate tablespace in OLDDIR to NEWDIR\n")); printf(_(" --waldir=WALDIR location for the write-ahead log directory\n")); - printf(_(" -X, --wal-method=none|fetch|stream\n" + printf(_(" -X, --wal-method=" WAL_METHODS "\n" " include required WAL files with specified method\n")); printf(_(" -z, --gzip compress tar output\n")); printf(_(" -Z, --compress=[{client|server}-]METHOD[:DETAIL]\n" @@ -654,6 +667,16 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier, PQserverVersion(conn) < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal"); +#ifdef PERCONA_EXT +{ + char tdedir[MAXPGPATH]; + + snprintf(tdedir, sizeof(tdedir), "%s/%s", basedir, PG_TDE_DATA_DIR); + pg_tde_fe_init(tdedir); + TDEXLogSmgrInit(); +} +#endif + /* Temporary replication slots are only supported in 10 and newer */ if (PQserverVersion(conn) < MINIMUM_VERSION_FOR_TEMP_SLOTS) temp_replication_slot = false; @@ -770,6 +793,14 @@ verify_dir_is_empty_or_create(char *dirname, bool *created, bool *found) case 3: case 4: +#ifdef PERCONA_EXT + /* + * `pg_tde` may exists and contain keys and providers for the WAL + * encryption + */ + if (strcmp(dirname, PG_TDE_DATA_DIR)) + return; +#endif /* * Exists, not empty */ @@ -2201,7 +2232,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, const char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); if (sqlstate && - strcmp(sqlstate, ERRCODE_DATA_CORRUPTED) == 0) + strcmp(sqlstate, ERRCODE_DATA_CORRUPTED_BCP) == 0) { pg_log_error("checksum error occurred"); checksum_failure = true; @@ -2524,6 +2555,9 @@ main(int argc, char **argv) else if (strcmp(optarg, "f") == 0 || strcmp(optarg, "fetch") == 0) { +#ifdef PERCONA_EXT + pg_fatal("\"fetch\" wal-method is not supported with Percona features, must be \"stream\" or \"none\""); +#endif includewal = FETCH_WAL; } else if (strcmp(optarg, "s") == 0 || diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c index 8543f3576a85d..5f40193e5f14a 100644 --- a/src/bin/pg_basebackup/receivelog.c +++ b/src/bin/pg_basebackup/receivelog.c @@ -25,6 +25,12 @@ #include "receivelog.h" #include "streamutil.h" +#ifdef PERCONA_EXT +#include "access/pg_tde_fe_init.h" +#include "access/pg_tde_xlog_smgr.h" +#include "catalog/tde_global_space.h" +#endif + /* currently open WAL file */ static Walfile *walfile = NULL; static bool reportFlushPosition = false; @@ -1044,6 +1050,9 @@ ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, int bytes_left; int bytes_written; int hdr_len; +#ifdef PERCONA_EXT + XLogSegNo segno; +#endif /* * Once we've decided we don't want to receive any more, just ignore any @@ -1071,6 +1080,10 @@ ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, /* Extract WAL location for this block */ xlogoff = XLogSegmentOffset(*blockpos, WalSegSz); +#ifdef PERCONA_EXT + XLByteToSeg(*blockpos, segno, WalSegSz); +#endif + /* * Verify that the initial location in the stream matches where we think * we are. @@ -1121,6 +1134,11 @@ ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, } } +#ifdef PERCONA_EXT + TDEXLogCryptBuffer(copybuf + hdr_len + bytes_written, bytes_to_write, + xlogoff, stream->timeline, segno, WalSegSz); +#endif + if (stream->walmethod->ops->write(walfile, copybuf + hdr_len + bytes_written, bytes_to_write) != bytes_to_write)