From dedcfd4e3da1297cda2d09776e7c8135e0d960ce Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Wed, 9 Oct 2019 19:38:26 -0600 Subject: tls: Add distributed_stek module This migrates a feature that was previously reserved for enterprise users, according to https://github.com/caddyserver/caddy/issues/2786. TLS session ticket keys are sensitive, so they should be rotated on a regular basis. Only Caddy does this by default. However, a cluster of servers that rotate keys without synchronization will lose the benefits of having sessions in the first place if the client is routed to a different backend. This module coordinates STEK rotation in a fleet so the same keys are used, and rotated, across the whole cluster. No other server does this, but Twitter wrote about how they hacked together a solution a few years ago: https://blog.twitter.com/engineering/en_us/a/2013/forward-secrecy-at-twitter.html --- .../caddytls/distributedstek/distributedstek.go | 228 +++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 modules/caddytls/distributedstek/distributedstek.go (limited to 'modules/caddytls/distributedstek') diff --git a/modules/caddytls/distributedstek/distributedstek.go b/modules/caddytls/distributedstek/distributedstek.go new file mode 100644 index 0000000..a0c4cd2 --- /dev/null +++ b/modules/caddytls/distributedstek/distributedstek.go @@ -0,0 +1,228 @@ +// Copyright 2015 Matthew Holt and The Caddy Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package distributedstek provides TLS session ticket ephemeral +// keys (STEKs) in a distributed fashion by utilizing configured +// storage for locking and key sharing. This allows a cluster of +// machines to optimally resume TLS sessions in a load-balanced +// environment without any hassle. This is similar to what +// Twitter does, but without needing to rely on SSH, as it is +// built into the web server this way: +// https://blog.twitter.com/engineering/en_us/a/2013/forward-secrecy-at-twitter.html +package distributedstek + +import ( + "bytes" + "encoding/gob" + "encoding/json" + "fmt" + "log" + "time" + + "github.com/caddyserver/caddy/v2" + "github.com/caddyserver/caddy/v2/modules/caddytls" + "github.com/mholt/certmagic" +) + +func init() { + caddy.RegisterModule(Provider{}) +} + +// Provider implements a distributed STEK provider. +type Provider struct { + Storage json.RawMessage `json:"storage,omitempty"` + + storage certmagic.Storage + stekConfig *caddytls.SessionTicketService + timer *time.Timer +} + +// CaddyModule returns the Caddy module information. +func (Provider) CaddyModule() caddy.ModuleInfo { + return caddy.ModuleInfo{ + Name: "tls.stek.distributed", + New: func() caddy.Module { return new(Provider) }, + } +} + +// Provision provisions s. +func (s *Provider) Provision(ctx caddy.Context) error { + // unpack the storage module to use, if different from the default + if s.Storage != nil { + val, err := ctx.LoadModuleInline("module", "caddy.storage", s.Storage) + if err != nil { + return fmt.Errorf("loading TLS storage module: %s", err) + } + cmStorage, err := val.(caddy.StorageConverter).CertMagicStorage() + if err != nil { + return fmt.Errorf("creating TLS storage configuration: %v", err) + } + s.storage = cmStorage + s.Storage = nil // allow GC to deallocate + } + + // otherwise, use default storage + if s.storage == nil { + s.storage = ctx.Storage() + } + + return nil +} + +// Initialize sets the configuration for s and returns the starting keys. +func (s *Provider) Initialize(config *caddytls.SessionTicketService) ([][32]byte, error) { + // keep a reference to the config; we'll need it when rotating keys + s.stekConfig = config + + dstek, err := s.getSTEK() + if err != nil { + return nil, err + } + + // create timer for the remaining time on the interval; + // this timer is cleaned up only when rotate() returns + s.timer = time.NewTimer(time.Until(dstek.NextRotation)) + + return dstek.Keys, nil +} + +// Next returns a channel which transmits the latest session ticket keys. +func (s *Provider) Next(doneChan <-chan struct{}) <-chan [][32]byte { + keysChan := make(chan [][32]byte) + go s.rotate(doneChan, keysChan) + return keysChan +} + +func (s *Provider) loadSTEK() (distributedSTEK, error) { + var sg distributedSTEK + gobBytes, err := s.storage.Load(stekFileName) + if err != nil { + return sg, err // don't wrap, in case error is certmagic.ErrNotExist + } + dec := gob.NewDecoder(bytes.NewReader(gobBytes)) + err = dec.Decode(&sg) + if err != nil { + return sg, fmt.Errorf("STEK gob corrupted: %v", err) + } + return sg, nil +} + +func (s *Provider) storeSTEK(dstek distributedSTEK) error { + var buf bytes.Buffer + err := gob.NewEncoder(&buf).Encode(dstek) + if err != nil { + return fmt.Errorf("encoding STEK gob: %v", err) + } + err = s.storage.Store(stekFileName, buf.Bytes()) + if err != nil { + return fmt.Errorf("storing STEK gob: %v", err) + } + return nil +} + +// getSTEK locks and loads the current STEK from storage. If none +// currently exists, a new STEK is created and persisted. If the +// current STEK is outdated (NextRotation time is in the past), +// then it is rotated and persisted. The resulting STEK is returned. +func (s *Provider) getSTEK() (distributedSTEK, error) { + s.storage.Lock(stekLockName) + defer s.storage.Unlock(stekLockName) + + // load the current STEKs from storage + dstek, err := s.loadSTEK() + if _, isNotExist := err.(certmagic.ErrNotExist); isNotExist { + // if there is none, then make some right away + dstek, err = s.rotateKeys(dstek) + if err != nil { + return dstek, fmt.Errorf("creating new STEK: %v", err) + } + } else if err != nil { + // some other error, that's a problem + return dstek, fmt.Errorf("loading STEK: %v", err) + } else if time.Now().After(dstek.NextRotation) { + // if current STEKs are outdated, rotate them + dstek, err = s.rotateKeys(dstek) + if err != nil { + return dstek, fmt.Errorf("rotating keys: %v", err) + } + } + + return dstek, nil +} + +// rotateKeys rotates the keys of oldSTEK and returns the new distributedSTEK +// with updated keys and timestamps. It stores the returned STEK in storage, +// so this function must only be called in a storage-provided lock. +func (s *Provider) rotateKeys(oldSTEK distributedSTEK) (distributedSTEK, error) { + var newSTEK distributedSTEK + var err error + + newSTEK.Keys, err = s.stekConfig.RotateSTEKs(oldSTEK.Keys) + if err != nil { + return newSTEK, err + } + + now := time.Now() + newSTEK.LastRotation = now + newSTEK.NextRotation = now.Add(time.Duration(s.stekConfig.RotationInterval)) + + err = s.storeSTEK(newSTEK) + if err != nil { + return newSTEK, err + } + + return newSTEK, nil +} + +// rotate rotates keys on a regular basis, sending each updated set of +// keys down keysChan, until doneChan is closed. +func (s *Provider) rotate(doneChan <-chan struct{}, keysChan chan<- [][32]byte) { + for { + select { + case <-s.timer.C: + dstek, err := s.getSTEK() + if err != nil { + // TODO: improve this handling + log.Printf("[ERROR] Loading STEK: %v", err) + continue + } + + // send the updated keys to the service + keysChan <- dstek.Keys + + // timer channel is already drained, so reset directly (see godoc) + s.timer.Reset(time.Until(dstek.NextRotation)) + + case <-doneChan: + // again, see godocs for why timer is stopped this way + if !s.timer.Stop() { + <-s.timer.C + } + return + } + } +} + +type distributedSTEK struct { + Keys [][32]byte + LastRotation, NextRotation time.Time +} + +const ( + stekLockName = "stek_check" + stekFileName = "stek/stek.bin" +) + +// Interface guard +var _ caddytls.STEKProvider = (*Provider)(nil) -- cgit v1.2.3