@@ -166,6 +166,9 @@ write_backup_status(pgBackup *backup, BackupStatus status,
166
166
*
167
167
* TODO: lock-timeout as parameter
168
168
* TODO: we must think about a more fine-grained unlock mechanism - a separate unlock_backup() function.
169
+ * TODO: more accurate naming
170
+ * -> exclusive lock -> acquire HW_LATCH and wait until all LW_LATCH`es are clear
171
+ * -> shared lock -> acquire HW_LATCH, acquire LW_LATCH, release HW_LATCH
169
172
*/
170
173
bool
171
174
lock_backup (pgBackup * backup , bool strict , bool exclusive )
@@ -205,7 +208,7 @@ lock_backup(pgBackup *backup, bool strict, bool exclusive)
205
208
{
206
209
/* release exclusive lock */
207
210
if (fio_unlink (lock_file , FIO_BACKUP_HOST ) < 0 )
208
- elog (ERROR , "Could not remove old lock file \"%s\": %s" ,
211
+ elog (ERROR , "Could not remove exclusive lock file \"%s\": %s" ,
209
212
lock_file , strerror (errno ));
210
213
211
214
/* we are done */
@@ -261,48 +264,16 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
261
264
int fd = 0 ;
262
265
char buffer [MAXPGPATH * 2 + 256 ];
263
266
int ntries = LOCK_TIMEOUT ;
264
- int log_freq = ntries / 5 ;
267
+ int empty_tries = LOCK_STALE_TIMEOUT ;
265
268
int len ;
266
269
int encoded_pid ;
267
- pid_t my_p_pid ;
268
270
269
271
join_path_components (lock_file , backup -> root_dir , BACKUP_LOCK_FILE );
270
272
271
- /*
272
- * TODO: is this stuff with ppid below is relevant for us ?
273
- *
274
- * If the PID in the lockfile is our own PID or our parent's or
275
- * grandparent's PID, then the file must be stale (probably left over from
276
- * a previous system boot cycle). We need to check this because of the
277
- * likelihood that a reboot will assign exactly the same PID as we had in
278
- * the previous reboot, or one that's only one or two counts larger and
279
- * hence the lockfile's PID now refers to an ancestor shell process. We
280
- * allow pg_ctl to pass down its parent shell PID (our grandparent PID)
281
- * via the environment variable PG_GRANDPARENT_PID; this is so that
282
- * launching the postmaster via pg_ctl can be just as reliable as
283
- * launching it directly. There is no provision for detecting
284
- * further-removed ancestor processes, but if the init script is written
285
- * carefully then all but the immediate parent shell will be root-owned
286
- * processes and so the kill test will fail with EPERM. Note that we
287
- * cannot get a false negative this way, because an existing postmaster
288
- * would surely never launch a competing postmaster or pg_ctl process
289
- * directly.
290
- */
291
- #ifndef WIN32
292
- my_p_pid = getppid ();
293
- #else
294
-
295
- /*
296
- * Windows hasn't got getppid(), but doesn't need it since it's not using
297
- * real kill() either...
298
- */
299
- my_p_pid = 0 ;
300
- #endif
301
-
302
273
/*
303
274
* We need a loop here because of race conditions. But don't loop forever
304
275
* (for example, a non-writable $backup_instance_path directory might cause a failure
305
- * that won't go away). 100 tries seems like plenty.
276
+ * that won't go away).
306
277
*/
307
278
do
308
279
{
@@ -351,13 +322,38 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
351
322
fclose (fp_out );
352
323
353
324
/*
354
- * It should be possible only as a result of system crash,
355
- * so its hypothetical owner should be dead by now
325
+ * There are several possible reasons for lock file
326
+ * to be empty:
327
+ * - system crash
328
+ * - process crash
329
+ * - race between writer and reader
330
+ *
331
+ * Consider empty file to be stale after LOCK_STALE_TIMEOUT attempts.
332
+ *
333
+ * TODO: alternatively we can write into temp file (lock_file_%pid),
334
+ * rename it and then re-read lock file to make sure,
335
+ * that we have successfully acquired the lock.
356
336
*/
357
337
if (len == 0 )
358
338
{
359
- elog (WARNING , "Lock file \"%s\" is empty" , lock_file );
360
- goto grab_lock ;
339
+ if (empty_tries == 0 )
340
+ {
341
+ elog (WARNING , "Lock file \"%s\" is empty" , lock_file );
342
+ goto grab_lock ;
343
+ }
344
+
345
+ if ((empty_tries % LOG_FREQ ) == 0 )
346
+ elog (WARNING , "Waiting %u seconds on empty exclusive lock for backup %s" ,
347
+ empty_tries , base36enc (backup -> start_time ));
348
+
349
+ sleep (1 );
350
+ /*
351
+ * waiting on empty lock file should not affect
352
+ * the timer for concurrent lockers (ntries).
353
+ */
354
+ empty_tries -- ;
355
+ ntries ++ ;
356
+ continue ;
361
357
}
362
358
363
359
encoded_pid = atoi (buffer );
@@ -371,24 +367,23 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
371
367
372
368
/*
373
369
* Check to see if the other process still exists
374
- *
375
- * Per discussion above, my_pid, my_p_pid can be
376
- * ignored as false matches.
377
- *
378
370
* Normally kill() will fail with ESRCH if the given PID doesn't
379
371
* exist.
380
372
*/
381
- if (encoded_pid != my_pid && encoded_pid != my_p_pid )
373
+ if (encoded_pid == my_pid )
374
+ return 0 ;
375
+ else
382
376
{
383
377
if (kill (encoded_pid , 0 ) == 0 )
384
378
{
385
379
/* complain once every LOG_FREQ tries */
386
- if ((ntries % log_freq ) == 0 )
380
+ if ((ntries % LOG_FREQ ) == 0 )
387
381
{
388
382
elog (WARNING , "Process %d is using backup %s, and is still running" ,
389
383
encoded_pid , base36enc (backup -> start_time ));
390
384
391
- elog (WARNING , "Waiting %u seconds on lock for backup %s" , ntries , base36enc (backup -> start_time ));
385
+ elog (WARNING , "Waiting %u seconds on exclusive lock for backup %s" ,
386
+ ntries , base36enc (backup -> start_time ));
392
387
}
393
388
394
389
sleep (1 );
@@ -435,7 +430,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
435
430
errno = 0 ;
436
431
if (fio_write (fd , buffer , strlen (buffer )) != strlen (buffer ))
437
432
{
438
- int save_errno = errno ;
433
+ int save_errno = errno ;
439
434
440
435
fio_close (fd );
441
436
fio_unlink (lock_file , FIO_BACKUP_HOST );
@@ -453,7 +448,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
453
448
454
449
if (fio_flush (fd ) != 0 )
455
450
{
456
- int save_errno = errno ;
451
+ int save_errno = errno ;
457
452
458
453
fio_close (fd );
459
454
fio_unlink (lock_file , FIO_BACKUP_HOST );
@@ -471,7 +466,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
471
466
472
467
if (fio_close (fd ) != 0 )
473
468
{
474
- int save_errno = errno ;
469
+ int save_errno = errno ;
475
470
476
471
fio_unlink (lock_file , FIO_BACKUP_HOST );
477
472
@@ -482,6 +477,10 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
482
477
lock_file , strerror (save_errno ));
483
478
}
484
479
480
+ // elog(LOG, "Acquired exclusive lock for backup %s after %ds",
481
+ // base36enc(backup->start_time),
482
+ // LOCK_TIMEOUT - ntries + LOCK_STALE_TIMEOUT - empty_tries);
483
+
485
484
return 0 ;
486
485
}
487
486
@@ -493,7 +492,6 @@ wait_read_only_owners(pgBackup *backup)
493
492
char buffer [256 ];
494
493
pid_t encoded_pid ;
495
494
int ntries = LOCK_TIMEOUT ;
496
- int log_freq = ntries / 5 ;
497
495
char lock_file [MAXPGPATH ];
498
496
499
497
join_path_components (lock_file , backup -> root_dir , BACKUP_RO_LOCK_FILE );
@@ -523,7 +521,7 @@ wait_read_only_owners(pgBackup *backup)
523
521
{
524
522
if (kill (encoded_pid , 0 ) == 0 )
525
523
{
526
- if ((ntries % log_freq ) == 0 )
524
+ if ((ntries % LOG_FREQ ) == 0 )
527
525
{
528
526
elog (WARNING , "Process %d is using backup %s in read only mode, and is still running" ,
529
527
encoded_pid , base36enc (backup -> start_time ));
0 commit comments