pgxpool: health check should avoid going below minConns

pull/1232/head
James Hartig 2022-05-26 10:49:57 -04:00 committed by Jack Christensen
parent 37c3f157bc
commit a814153aeb
4 changed files with 212 additions and 45 deletions

View File

@ -2,7 +2,7 @@ package pgxpool
import (
"context"
"time"
"sync/atomic"
"github.com/jackc/pgconn"
"github.com/jackc/pgx/v4"
@ -26,9 +26,23 @@ func (c *Conn) Release() {
res := c.res
c.res = nil
now := time.Now()
if conn.IsClosed() || conn.PgConn().IsBusy() || conn.PgConn().TxStatus() != 'I' || (now.Sub(res.CreationTime()) > c.p.maxConnLifetime) {
if conn.IsClosed() || conn.PgConn().IsBusy() || conn.PgConn().TxStatus() != 'I' {
res.Destroy()
// Signal to the health check to run since we just destroyed a connections
// and we might be below minConns now
c.p.triggerHealthCheck()
return
}
// If the pool is consistently being used, we might never get to check the
// lifetime of a connection since we only check idle connections in checkConnsHealth
// so we also check the lifetime here and force a health check
if c.p.isExpired(res) {
atomic.AddInt64(&c.p.lifetimeDestroyCount, 1)
res.Destroy()
// Signal to the health check to run since we just destroyed a connections
// and we might be below minConns now
c.p.triggerHealthCheck()
return
}
@ -42,6 +56,9 @@ func (c *Conn) Release() {
res.Release()
} else {
res.Destroy()
// Signal to the health check to run since we just destroyed a connections
// and we might be below minConns now
c.p.triggerHealthCheck()
}
}()
}

View File

@ -3,9 +3,11 @@ package pgxpool
import (
"context"
"fmt"
"math/rand"
"runtime"
"strconv"
"sync"
"sync/atomic"
"time"
"github.com/jackc/pgconn"
@ -70,16 +72,23 @@ func (cr *connResource) getPoolRows(c *Conn, r pgx.Rows) *poolRows {
// Pool allows for connection reuse.
type Pool struct {
p *puddle.Pool
config *Config
beforeConnect func(context.Context, *pgx.ConnConfig) error
afterConnect func(context.Context, *pgx.Conn) error
beforeAcquire func(context.Context, *pgx.Conn) bool
afterRelease func(*pgx.Conn) bool
minConns int32
maxConnLifetime time.Duration
maxConnIdleTime time.Duration
healthCheckPeriod time.Duration
p *puddle.Pool
config *Config
beforeConnect func(context.Context, *pgx.ConnConfig) error
afterConnect func(context.Context, *pgx.Conn) error
beforeAcquire func(context.Context, *pgx.Conn) bool
afterRelease func(*pgx.Conn) bool
minConns int32
maxConns int32
maxConnLifetime time.Duration
maxConnLifetimeJitter time.Duration
maxConnIdleTime time.Duration
healthCheckPeriod time.Duration
healthCheckChan chan struct{}
newConnsCount int64
lifetimeDestroyCount int64
idleDestroyCount int64
closeOnce sync.Once
closeChan chan struct{}
@ -109,14 +118,19 @@ type Config struct {
// MaxConnLifetime is the duration since creation after which a connection will be automatically closed.
MaxConnLifetime time.Duration
// MaxConnLifetimeJitter is the duration after MaxConnLifetime to randomly decide to close a connection.
// This helps prevent all connections from being closed at the exact same time, starving the pool.
MaxConnLifetimeJitter time.Duration
// MaxConnIdleTime is the duration after which an idle connection will be automatically closed by the health check.
MaxConnIdleTime time.Duration
// MaxConns is the maximum size of the pool. The default is the greater of 4 or runtime.NumCPU().
MaxConns int32
// MinConns is the minimum size of the pool. The health check will increase the number of connections to this
// amount if it had dropped below.
// MinConns is the minimum size of the pool. After connection closes, the pool might dip below MinConns. A low
// number of MinConns might mean the pool is empty after MaxConnLifetime until the health check has a chance
// to create new connections.
MinConns int32
// HealthCheckPeriod is the duration between checks of the health of idle connections.
@ -164,16 +178,19 @@ func ConnectConfig(ctx context.Context, config *Config) (*Pool, error) {
}
p := &Pool{
config: config,
beforeConnect: config.BeforeConnect,
afterConnect: config.AfterConnect,
beforeAcquire: config.BeforeAcquire,
afterRelease: config.AfterRelease,
minConns: config.MinConns,
maxConnLifetime: config.MaxConnLifetime,
maxConnIdleTime: config.MaxConnIdleTime,
healthCheckPeriod: config.HealthCheckPeriod,
closeChan: make(chan struct{}),
config: config,
beforeConnect: config.BeforeConnect,
afterConnect: config.AfterConnect,
beforeAcquire: config.BeforeAcquire,
afterRelease: config.AfterRelease,
minConns: config.MinConns,
maxConns: config.MaxConns,
maxConnLifetime: config.MaxConnLifetime,
maxConnLifetimeJitter: config.MaxConnLifetimeJitter,
maxConnIdleTime: config.MaxConnIdleTime,
healthCheckPeriod: config.HealthCheckPeriod,
healthCheckChan: make(chan struct{}, 1),
closeChan: make(chan struct{}),
}
p.p = puddle.NewPool(
@ -223,7 +240,7 @@ func ConnectConfig(ctx context.Context, config *Config) (*Pool, error) {
)
if !config.LazyConnect {
if err := p.createIdleResources(ctx, int(p.minConns)); err != nil {
if err := p.checkMinConns(); err != nil {
// Couldn't create resources for minpool size. Close unhealthy pool.
p.Close()
return nil, err
@ -251,6 +268,7 @@ func ConnectConfig(ctx context.Context, config *Config) (*Pool, error) {
// pool_max_conn_lifetime: duration string
// pool_max_conn_idle_time: duration string
// pool_health_check_period: duration string
// pool_max_conn_lifetime_jitter: duration string
//
// See Config for definitions of these arguments.
//
@ -331,6 +349,15 @@ func ParseConfig(connString string) (*Config, error) {
config.HealthCheckPeriod = defaultHealthCheckPeriod
}
if s, ok := config.ConnConfig.Config.RuntimeParams["pool_max_conn_lifetime_jitter"]; ok {
delete(connConfig.Config.RuntimeParams, "pool_max_conn_lifetime_jitter")
d, err := time.ParseDuration(s)
if err != nil {
return nil, fmt.Errorf("invalid pool_max_conn_lifetime_jitter: %w", err)
}
config.MaxConnLifetimeJitter = d
}
return config, nil
}
@ -343,44 +370,105 @@ func (p *Pool) Close() {
})
}
func (p *Pool) isExpired(res *puddle.Resource) bool {
now := time.Now()
// Small optimization to avoid rand. If it's over lifetime AND jitter, immediately
// return true.
if now.Sub(res.CreationTime()) > p.maxConnLifetime+p.maxConnLifetimeJitter {
return true
}
if p.maxConnLifetimeJitter == 0 {
return false
}
jitterSecs := rand.Float64() * p.maxConnLifetimeJitter.Seconds()
return now.Sub(res.CreationTime()) > p.maxConnLifetime+(time.Duration(jitterSecs)*time.Second)
}
func (p *Pool) triggerHealthCheck() {
go func() {
// Destroy is asynchronous so we give it time to actually remove itself from
// the pool otherwise we might try to check the pool size too soon
time.Sleep(500 * time.Millisecond)
select {
case p.healthCheckChan <- struct{}{}:
default:
}
}()
}
func (p *Pool) backgroundHealthCheck() {
ticker := time.NewTicker(p.healthCheckPeriod)
defer ticker.Stop()
for {
select {
case <-p.closeChan:
ticker.Stop()
return
case <-p.healthCheckChan:
p.checkHealth()
case <-ticker.C:
p.checkIdleConnsHealth()
p.checkMinConns()
p.checkHealth()
}
}
}
func (p *Pool) checkIdleConnsHealth() {
resources := p.p.AcquireAllIdle()
func (p *Pool) checkHealth() {
for {
// If checkMinConns failed we don't destroy any connections since we couldn't
// even get to minConns
if err := p.checkMinConns(); err != nil {
// Should we log this error somewhere?
break
}
if !p.checkConnsHealth() {
// Since we didn't destroy any connections we can stop looping
break
}
// Technically Destroy is asynchronous but 500ms should be enough for it to
// remove it from the underlying pool
select {
case <-p.closeChan:
return
case <-time.After(500 * time.Millisecond):
}
}
}
now := time.Now()
// checkConnsHealth will check all idle connections, destroy a connection if
// it's idle or too old, and returns true if any were destroyed
func (p *Pool) checkConnsHealth() bool {
var destroyed bool
totalConns := p.Stat().TotalConns()
resources := p.p.AcquireAllIdle()
for _, res := range resources {
if now.Sub(res.CreationTime()) > p.maxConnLifetime {
// We're okay going under minConns if the lifetime is up
if p.isExpired(res) && totalConns >= p.minConns {
atomic.AddInt64(&p.lifetimeDestroyCount, 1)
res.Destroy()
} else if res.IdleDuration() > p.maxConnIdleTime {
destroyed = true
// Since Destroy is async we manually decrement totalConns.
totalConns--
} else if res.IdleDuration() > p.maxConnIdleTime && totalConns > p.minConns {
atomic.AddInt64(&p.idleDestroyCount, 1)
res.Destroy()
destroyed = true
// Since Destroy is async we manually decrement totalConns.
totalConns--
} else {
res.ReleaseUnused()
}
}
return destroyed
}
func (p *Pool) checkMinConns() {
for i := p.minConns - p.Stat().TotalConns(); i > 0; i-- {
go func() {
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
defer cancel()
p.p.CreateResource(ctx)
}()
func (p *Pool) checkMinConns() error {
// TotalConns can include ones that are being destroyed but we should have
// sleep(500ms) around all of the destroys to help prevent that from throwing
// off this check
toCreate := p.minConns - p.Stat().TotalConns()
if toCreate > 0 {
return p.createIdleResources(context.Background(), int(toCreate))
}
return nil
}
func (p *Pool) createIdleResources(parentCtx context.Context, targetResources int) error {
@ -391,6 +479,7 @@ func (p *Pool) createIdleResources(parentCtx context.Context, targetResources in
for i := 0; i < targetResources; i++ {
go func() {
atomic.AddInt64(&p.newConnsCount, 1)
err := p.p.CreateResource(ctx)
errs <- err
}()
@ -460,7 +549,12 @@ func (p *Pool) Config() *Config { return p.config.Copy() }
// Stat returns a pgxpool.Stat struct with a snapshot of Pool statistics.
func (p *Pool) Stat() *Stat {
return &Stat{s: p.p.Stat()}
return &Stat{
s: p.p.Stat(),
newConnsCount: atomic.LoadInt64(&p.newConnsCount),
lifetimeDestroyCount: atomic.LoadInt64(&p.lifetimeDestroyCount),
idleDestroyCount: atomic.LoadInt64(&p.idleDestroyCount),
}
}
// Exec acquires a connection from the Pool and executes the given SQL.

View File

@ -372,6 +372,14 @@ func TestConnReleaseClosesBusyConn(t *testing.T) {
c.Release()
waitForReleaseToComplete()
// wait for the connection to actually be destroyed
for i := 0; i < 1000; i++ {
if db.Stat().TotalConns() == 0 {
break
}
time.Sleep(time.Millisecond)
}
stats := db.Stat()
assert.EqualValues(t, 0, stats.TotalConns())
}
@ -396,6 +404,8 @@ func TestPoolBackgroundChecksMaxConnLifetime(t *testing.T) {
stats := db.Stat()
assert.EqualValues(t, 0, stats.TotalConns())
assert.EqualValues(t, 0, stats.MaxIdleDestroyCount())
assert.EqualValues(t, 1, stats.MaxLifetimeDestroyCount())
}
func TestPoolBackgroundChecksMaxConnIdleTime(t *testing.T) {
@ -426,6 +436,8 @@ func TestPoolBackgroundChecksMaxConnIdleTime(t *testing.T) {
stats := db.Stat()
assert.EqualValues(t, 0, stats.TotalConns())
assert.EqualValues(t, 1, stats.MaxIdleDestroyCount())
assert.EqualValues(t, 0, stats.MaxLifetimeDestroyCount())
}
func TestPoolBackgroundChecksMinConns(t *testing.T) {
@ -443,6 +455,21 @@ func TestPoolBackgroundChecksMinConns(t *testing.T) {
stats := db.Stat()
assert.EqualValues(t, 2, stats.TotalConns())
assert.EqualValues(t, 0, stats.MaxLifetimeDestroyCount())
assert.EqualValues(t, 2, stats.NewConnsCount())
c, err := db.Acquire(context.Background())
require.NoError(t, err)
err = c.Conn().Close(context.Background())
require.NoError(t, err)
c.Release()
time.Sleep(config.HealthCheckPeriod + 500*time.Millisecond)
stats = db.Stat()
assert.EqualValues(t, 2, stats.TotalConns())
assert.EqualValues(t, 0, stats.MaxIdleDestroyCount())
assert.EqualValues(t, 3, stats.NewConnsCount())
}
func TestPoolExec(t *testing.T) {
@ -679,6 +706,14 @@ func TestConnReleaseDestroysClosedConn(t *testing.T) {
c.Release()
waitForReleaseToComplete()
// wait for the connection to actually be destroyed
for i := 0; i < 1000; i++ {
if pool.Stat().TotalConns() == 0 {
break
}
time.Sleep(time.Millisecond)
}
assert.EqualValues(t, 0, pool.Stat().TotalConns())
}
@ -767,7 +802,7 @@ func TestTxBeginFuncNestedTransactionCommit(t *testing.T) {
require.NoError(t, err)
return nil
})
require.NoError(t, err)
return nil
})
require.NoError(t, err)
@ -817,6 +852,7 @@ func TestTxBeginFuncNestedTransactionRollback(t *testing.T) {
return nil
})
require.NoError(t, err)
var n int64
err = db.QueryRow(context.Background(), "select count(*) from pgxpooltx").Scan(&n)

View File

@ -8,7 +8,10 @@ import (
// Stat is a snapshot of Pool statistics.
type Stat struct {
s *puddle.Stat
s *puddle.Stat
newConnsCount int64
lifetimeDestroyCount int64
idleDestroyCount int64
}
// AcquireCount returns the cumulative count of successful acquires from the pool.
@ -62,3 +65,20 @@ func (s *Stat) MaxConns() int32 {
func (s *Stat) TotalConns() int32 {
return s.s.TotalResources()
}
// NewConnsCount returns the cumulative count of new connections opened.
func (s *Stat) NewConnsCount() int64 {
return s.newConnsCount
}
// MaxLifetimeDestroyCount returns the cumulative count of connections destroyed
// because they exceeded MaxConnLifetime.
func (s *Stat) MaxLifetimeDestroyCount() int64 {
return s.lifetimeDestroyCount
}
// MaxIdleDestroyCount returns the cumulative count of connections destroyed because
// they exceeded MaxConnIdleTime.
func (s *Stat) MaxIdleDestroyCount() int64 {
return s.idleDestroyCount
}