Skip to content

Commit e146225

Browse files
authored
Merge pull request #309 from postgrespro/issue_308
[Issue #308] Wait on empty exclusive lock file
2 parents 53be624 + 94ada4c commit e146225

File tree

4 files changed

+105
-59
lines changed

4 files changed

+105
-59
lines changed

src/catalog.c

Lines changed: 49 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ write_backup_status(pgBackup *backup, BackupStatus status,
166166
*
167167
* TODO: lock-timeout as parameter
168168
* TODO: we must think about more fine grain unlock mechanism - separate unlock_backup() function.
169+
* TODO: more accurate naming
170+
* -> exclusive lock -> acquire HW_LATCH and wait until all LW_LATCH`es are clear
171+
* -> shared lock -> acquire HW_LATCH, acquire LW_LATCH, release HW_LATCH
169172
*/
170173
bool
171174
lock_backup(pgBackup *backup, bool strict, bool exclusive)
@@ -205,7 +208,7 @@ lock_backup(pgBackup *backup, bool strict, bool exclusive)
205208
{
206209
/* release exclusive lock */
207210
if (fio_unlink(lock_file, FIO_BACKUP_HOST) < 0)
208-
elog(ERROR, "Could not remove old lock file \"%s\": %s",
211+
elog(ERROR, "Could not remove exclusive lock file \"%s\": %s",
209212
lock_file, strerror(errno));
210213

211214
/* we are done */
@@ -261,48 +264,16 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
261264
int fd = 0;
262265
char buffer[MAXPGPATH * 2 + 256];
263266
int ntries = LOCK_TIMEOUT;
264-
int log_freq = ntries / 5;
267+
int empty_tries = LOCK_STALE_TIMEOUT;
265268
int len;
266269
int encoded_pid;
267-
pid_t my_p_pid;
268270

269271
join_path_components(lock_file, backup->root_dir, BACKUP_LOCK_FILE);
270272

271-
/*
272-
* TODO: is this stuff with ppid below is relevant for us ?
273-
*
274-
* If the PID in the lockfile is our own PID or our parent's or
275-
* grandparent's PID, then the file must be stale (probably left over from
276-
* a previous system boot cycle). We need to check this because of the
277-
* likelihood that a reboot will assign exactly the same PID as we had in
278-
* the previous reboot, or one that's only one or two counts larger and
279-
* hence the lockfile's PID now refers to an ancestor shell process. We
280-
* allow pg_ctl to pass down its parent shell PID (our grandparent PID)
281-
* via the environment variable PG_GRANDPARENT_PID; this is so that
282-
* launching the postmaster via pg_ctl can be just as reliable as
283-
* launching it directly. There is no provision for detecting
284-
* further-removed ancestor processes, but if the init script is written
285-
* carefully then all but the immediate parent shell will be root-owned
286-
* processes and so the kill test will fail with EPERM. Note that we
287-
* cannot get a false negative this way, because an existing postmaster
288-
* would surely never launch a competing postmaster or pg_ctl process
289-
* directly.
290-
*/
291-
#ifndef WIN32
292-
my_p_pid = getppid();
293-
#else
294-
295-
/*
296-
* Windows hasn't got getppid(), but doesn't need it since it's not using
297-
* real kill() either...
298-
*/
299-
my_p_pid = 0;
300-
#endif
301-
302273
/*
303274
* We need a loop here because of race conditions. But don't loop forever
304275
* (for example, a non-writable $backup_instance_path directory might cause a failure
305-
* that won't go away). 100 tries seems like plenty.
276+
* that won't go away).
306277
*/
307278
do
308279
{
@@ -351,13 +322,38 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
351322
fclose(fp_out);
352323

353324
/*
354-
* It should be possible only as a result of system crash,
355-
* so its hypothetical owner should be dead by now
325+
* There are several possible reasons for lock file
326+
* to be empty:
327+
* - system crash
328+
* - process crash
329+
* - race between writer and reader
330+
*
331+
* Consider empty file to be stale after LOCK_STALE_TIMEOUT attempts.
332+
*
333+
* TODO: alternatively we can write into temp file (lock_file_%pid),
334+
* rename it and then re-read lock file to make sure,
335+
* that we are successfully acquired the lock.
356336
*/
357337
if (len == 0)
358338
{
359-
elog(WARNING, "Lock file \"%s\" is empty", lock_file);
360-
goto grab_lock;
339+
if (empty_tries == 0)
340+
{
341+
elog(WARNING, "Lock file \"%s\" is empty", lock_file);
342+
goto grab_lock;
343+
}
344+
345+
if ((empty_tries % LOG_FREQ) == 0)
346+
elog(WARNING, "Waiting %u seconds on empty exclusive lock for backup %s",
347+
empty_tries, base36enc(backup->start_time));
348+
349+
sleep(1);
350+
/*
351+
* waiting on empty lock file should not affect
352+
* the timer for concurrent lockers (ntries).
353+
*/
354+
empty_tries--;
355+
ntries++;
356+
continue;
361357
}
362358

363359
encoded_pid = atoi(buffer);
@@ -371,24 +367,23 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
371367

372368
/*
373369
* Check to see if the other process still exists
374-
*
375-
* Per discussion above, my_pid, my_p_pid can be
376-
* ignored as false matches.
377-
*
378370
* Normally kill() will fail with ESRCH if the given PID doesn't
379371
* exist.
380372
*/
381-
if (encoded_pid != my_pid && encoded_pid != my_p_pid)
373+
if (encoded_pid == my_pid)
374+
return 0;
375+
else
382376
{
383377
if (kill(encoded_pid, 0) == 0)
384378
{
385379
/* complain every fifth interval */
386-
if ((ntries % log_freq) == 0)
380+
if ((ntries % LOG_FREQ) == 0)
387381
{
388382
elog(WARNING, "Process %d is using backup %s, and is still running",
389383
encoded_pid, base36enc(backup->start_time));
390384

391-
elog(WARNING, "Waiting %u seconds on lock for backup %s", ntries, base36enc(backup->start_time));
385+
elog(WARNING, "Waiting %u seconds on exclusive lock for backup %s",
386+
ntries, base36enc(backup->start_time));
392387
}
393388

394389
sleep(1);
@@ -435,7 +430,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
435430
errno = 0;
436431
if (fio_write(fd, buffer, strlen(buffer)) != strlen(buffer))
437432
{
438-
int save_errno = errno;
433+
int save_errno = errno;
439434

440435
fio_close(fd);
441436
fio_unlink(lock_file, FIO_BACKUP_HOST);
@@ -453,7 +448,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
453448

454449
if (fio_flush(fd) != 0)
455450
{
456-
int save_errno = errno;
451+
int save_errno = errno;
457452

458453
fio_close(fd);
459454
fio_unlink(lock_file, FIO_BACKUP_HOST);
@@ -471,7 +466,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
471466

472467
if (fio_close(fd) != 0)
473468
{
474-
int save_errno = errno;
469+
int save_errno = errno;
475470

476471
fio_unlink(lock_file, FIO_BACKUP_HOST);
477472

@@ -482,6 +477,10 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
482477
lock_file, strerror(save_errno));
483478
}
484479

480+
// elog(LOG, "Acquired exclusive lock for backup %s after %ds",
481+
// base36enc(backup->start_time),
482+
// LOCK_TIMEOUT - ntries + LOCK_STALE_TIMEOUT - empty_tries);
483+
485484
return 0;
486485
}
487486

@@ -493,7 +492,6 @@ wait_read_only_owners(pgBackup *backup)
493492
char buffer[256];
494493
pid_t encoded_pid;
495494
int ntries = LOCK_TIMEOUT;
496-
int log_freq = ntries / 5;
497495
char lock_file[MAXPGPATH];
498496

499497
join_path_components(lock_file, backup->root_dir, BACKUP_RO_LOCK_FILE);
@@ -523,7 +521,7 @@ wait_read_only_owners(pgBackup *backup)
523521
{
524522
if (kill(encoded_pid, 0) == 0)
525523
{
526-
if ((ntries % log_freq) == 0)
524+
if ((ntries % LOG_FREQ) == 0)
527525
{
528526
elog(WARNING, "Process %d is using backup %s in read only mode, and is still running",
529527
encoded_pid, base36enc(backup->start_time));

src/pg_probackup.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,9 @@ extern const char *PROGRAM_EMAIL;
7979
/* Timeout defaults */
8080
#define ARCHIVE_TIMEOUT_DEFAULT 300
8181
#define REPLICA_TIMEOUT_DEFAULT 300
82-
#define LOCK_TIMEOUT 30
82+
#define LOCK_TIMEOUT 60
83+
#define LOCK_STALE_TIMEOUT 30
84+
#define LOG_FREQ 10
8385

8486
/* Directory/File permission */
8587
#define DIR_PERMISSION (0700)

tests/helpers/ptrack_helpers.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -757,7 +757,7 @@ def run_pb(self, command, asynchronous=False, gdb=False, old_binary=False, retur
757757
return GDBobj([binary_path] + command, self.verbose)
758758
if asynchronous:
759759
return subprocess.Popen(
760-
self.cmd,
760+
[binary_path] + command,
761761
stdout=subprocess.PIPE,
762762
stderr=subprocess.PIPE,
763763
env=env
@@ -1133,8 +1133,8 @@ def show_archive(
11331133
exit(1)
11341134

11351135
def validate_pb(
1136-
self, backup_dir, instance=None,
1137-
backup_id=None, options=[], old_binary=False, gdb=False
1136+
self, backup_dir, instance=None, backup_id=None,
1137+
options=[], old_binary=False, gdb=False, asynchronous=False
11381138
):
11391139

11401140
cmd_list = [
@@ -1146,11 +1146,11 @@ def validate_pb(
11461146
if backup_id:
11471147
cmd_list += ['-i', backup_id]
11481148

1149-
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
1149+
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb, asynchronous=asynchronous)
11501150

11511151
def delete_pb(
1152-
self, backup_dir, instance,
1153-
backup_id=None, options=[], old_binary=False, gdb=False):
1152+
self, backup_dir, instance, backup_id=None,
1153+
options=[], old_binary=False, gdb=False, asynchronous=False):
11541154
cmd_list = [
11551155
'delete',
11561156
'-B', backup_dir
@@ -1160,7 +1160,7 @@ def delete_pb(
11601160
if backup_id:
11611161
cmd_list += ['-i', backup_id]
11621162

1163-
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
1163+
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb, asynchronous=asynchronous)
11641164

11651165
def delete_expired(
11661166
self, backup_dir, instance, options=[], old_binary=False):

tests/locking.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,52 @@ def test_backup_directory_name(self):
535535
# Clean after yourself
536536
self.del_test_dir(module_name, fname)
537537

538+
def test_empty_lock_file(self):
539+
"""
540+
https://github.com/postgrespro/pg_probackup/issues/308
541+
"""
542+
fname = self.id().split('.')[3]
543+
node = self.make_simple_node(
544+
base_dir=os.path.join(module_name, fname, 'node'),
545+
initdb_params=['--data-checksums'])
546+
547+
backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
548+
self.init_pb(backup_dir)
549+
self.add_instance(backup_dir, 'node', node)
550+
self.set_archiving(backup_dir, 'node', node)
551+
node.slow_start()
552+
553+
# Fill with data
554+
node.pgbench_init(scale=100)
555+
556+
# FULL
557+
backup_id = self.backup_node(backup_dir, 'node', node)
558+
559+
lockfile = os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid')
560+
with open(lockfile, "w+") as f:
561+
f.truncate()
562+
563+
out = self.validate_pb(backup_dir, 'node', backup_id)
564+
565+
self.assertIn(
566+
"Waiting 30 seconds on empty exclusive lock for backup", out)
567+
568+
# lockfile = os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid')
569+
# with open(lockfile, "w+") as f:
570+
# f.truncate()
571+
#
572+
# p1 = self.validate_pb(backup_dir, 'node', backup_id, asynchronous=True,
573+
# options=['--log-level-file=LOG', '--log-filename=validate.log'])
574+
# sleep(3)
575+
# p2 = self.delete_pb(backup_dir, 'node', backup_id, asynchronous=True,
576+
# options=['--log-level-file=LOG', '--log-filename=delete.log'])
577+
#
578+
# p1.wait()
579+
# p2.wait()
580+
581+
# Clean after yourself
582+
self.del_test_dir(module_name, fname)
583+
538584
# TODO:
539585
# test that concurrent validation and restore are not locking each other
540586
# check that quick exclusive lock, when taking RO-lock, is really quick

0 commit comments

Comments
 (0)