// Package tsm1 provides a TSDB in the Time Structured Merge tree format.
package tsm1 // import "github.com/influxdata/influxdb/tsdb/engine/tsm1"

import (
	"archive/tar"
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/influxdata/influxdb/influxql"
	"github.com/influxdata/influxdb/models"
	"github.com/influxdata/influxdb/pkg/bytesutil"
	"github.com/influxdata/influxdb/pkg/estimator"
	"github.com/influxdata/influxdb/pkg/limiter"
	"github.com/influxdata/influxdb/tsdb"
	_ "github.com/influxdata/influxdb/tsdb/index"
	"github.com/influxdata/influxdb/tsdb/index/inmem"
	"github.com/uber-go/zap"
)

//go:generate tmpl -data=@iterator.gen.go.tmpldata iterator.gen.go.tmpl
//go:generate tmpl -data=@file_store.gen.go.tmpldata file_store.gen.go.tmpl
//go:generate tmpl -data=@encoding.gen.go.tmpldata encoding.gen.go.tmpl
//go:generate tmpl -data=@compact.gen.go.tmpldata compact.gen.go.tmpl

func init() {
	tsdb.RegisterEngine("tsm1", NewEngine)
}

var (
	// Ensure Engine implements the interface.
	_ tsdb.Engine = &Engine{}

	// Static objects to prevent small allocs.
	timeBytes              = []byte("time")
	keyFieldSeparatorBytes = []byte(keyFieldSeparator)
)

const (
	// keyFieldSeparator separates the series key from the field name in the composite key
	// that identifies a specific field in a series.
	keyFieldSeparator = "#!~#"
)

// Statistics gathered by the engine.
const (
	statCacheCompactions        = "cacheCompactions"
	statCacheCompactionsActive  = "cacheCompactionsActive"
	statCacheCompactionError    = "cacheCompactionErr"
	statCacheCompactionDuration = "cacheCompactionDuration"

	statTSMLevel1Compactions        = "tsmLevel1Compactions"
	statTSMLevel1CompactionsActive  = "tsmLevel1CompactionsActive"
	statTSMLevel1CompactionError    = "tsmLevel1CompactionErr"
	statTSMLevel1CompactionDuration = "tsmLevel1CompactionDuration"

	statTSMLevel2Compactions        = "tsmLevel2Compactions"
	statTSMLevel2CompactionsActive  = "tsmLevel2CompactionsActive"
	statTSMLevel2CompactionError    = "tsmLevel2CompactionErr"
	statTSMLevel2CompactionDuration = "tsmLevel2CompactionDuration"

	statTSMLevel3Compactions        = "tsmLevel3Compactions"
	statTSMLevel3CompactionsActive  = "tsmLevel3CompactionsActive"
	statTSMLevel3CompactionError    = "tsmLevel3CompactionErr"
	statTSMLevel3CompactionDuration = "tsmLevel3CompactionDuration"

	statTSMOptimizeCompactions        = "tsmOptimizeCompactions"
	statTSMOptimizeCompactionsActive  = "tsmOptimizeCompactionsActive"
	statTSMOptimizeCompactionError    = "tsmOptimizeCompactionErr"
	statTSMOptimizeCompactionDuration = "tsmOptimizeCompactionDuration"

	statTSMFullCompactions        = "tsmFullCompactions"
	statTSMFullCompactionsActive  = "tsmFullCompactionsActive"
	statTSMFullCompactionError    = "tsmFullCompactionErr"
	statTSMFullCompactionDuration = "tsmFullCompactionDuration"
)

// Engine represents a storage engine with compressed blocks.
type Engine struct {
	mu sync.RWMutex

	// The following group of fields is used to track the state of level compactions within the
	// Engine. The WaitGroup is used to monitor the compaction goroutines, and the 'done' channel
	// is used to signal those goroutines to shut down. Every request to disable level compactions
	// will call 'Wait' on 'wg'; the first goroutine to arrive (levelWorkers == 0 while holding the
	// lock) will close the done channel and re-assign 'nil' to the variable. Re-enabling will
	// decrease 'levelWorkers', and when it decreases to zero, level compactions will be started
	// back up again.
	wg           sync.WaitGroup // waitgroup for active level compaction goroutines
	done         chan struct{}  // channel to signal level compactions to stop
	levelWorkers int            // Number of "workers" that expect compactions to be in a disabled state

	snapDone chan struct{}  // channel to signal snapshot compactions to stop
	snapWG   sync.WaitGroup // waitgroup for running snapshot compactions

	id           uint64
	database     string
	path         string
	logger       zap.Logger // Logger to be used for important messages
	traceLogger  zap.Logger // Logger to be used when trace-logging is on.
	traceLogging bool

	index    tsdb.Index
	fieldset *tsdb.MeasurementFieldSet

	WAL            *WAL
	Cache          *Cache
	Compactor      *Compactor
	CompactionPlan CompactionPlanner
	FileStore      *FileStore

	MaxPointsPerBlock int

	// CacheFlushMemorySizeThreshold specifies the minimum size threshold for
	// the cache when the engine should write a snapshot to a TSM file.
	CacheFlushMemorySizeThreshold uint64

	// CacheFlushWriteColdDuration specifies the length of time after which,
	// if no writes have been committed to the WAL, the engine will write
	// a snapshot of the cache to a TSM file.
	CacheFlushWriteColdDuration time.Duration

	// Controls whether to enable compactions when the engine is open.
	enableCompactionsOnOpen bool

	stats *EngineStatistics

	// The limiter for concurrent compactions.
	compactionLimiter limiter.Fixed
}

// NewEngine returns a new instance of Engine.
func NewEngine(id uint64, idx tsdb.Index, database, path string, walPath string, opt tsdb.EngineOptions) tsdb.Engine {
	w := NewWAL(walPath)
	w.syncDelay = time.Duration(opt.Config.WALFsyncDelay)

	fs := NewFileStore(path)
	cache := NewCache(uint64(opt.Config.CacheMaxMemorySize), path)

	c := &Compactor{
		Dir:       path,
		FileStore: fs,
	}

	logger := zap.New(zap.NullEncoder())
	e := &Engine{
		id:           id,
		database:     database,
		path:         path,
		index:        idx,
		logger:       logger,
		traceLogger:  logger,
		traceLogging: opt.Config.TraceLoggingEnabled,

		fieldset: tsdb.NewMeasurementFieldSet(),

		WAL:   w,
		Cache: cache,

		FileStore:      fs,
		Compactor:      c,
		CompactionPlan: NewDefaultPlanner(fs, time.Duration(opt.Config.CompactFullWriteColdDuration)),

		CacheFlushMemorySizeThreshold: opt.Config.CacheSnapshotMemorySize,
		CacheFlushWriteColdDuration:   time.Duration(opt.Config.CacheSnapshotWriteColdDuration),
		enableCompactionsOnOpen:       true,
		stats:                         &EngineStatistics{},
		compactionLimiter:             opt.CompactionLimiter,
	}

	// Attach fieldset to index.
	e.index.SetFieldSet(e.fieldset)

	if e.traceLogging {
		fs.enableTraceLogging(true)
		w.enableTraceLogging(true)
	}

	return e
}

// SetEnabled sets whether the engine is enabled.
func (e *Engine) SetEnabled(enabled bool) {
	e.enableCompactionsOnOpen = enabled
	e.SetCompactionsEnabled(enabled)
}
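// exampleQuiesce is a hedged usage sketch, not part of the original file: the
// level-compaction disable/enable protocol below is reference counted. A
// caller that needs compactions quiesced (e.g. while writing tombstones)
// pairs a disableLevelCompactions(true) with a deferred
// enableLevelCompactions(true), as DeleteSeriesRange does further down in
// this file. The helper name and the work callback are hypothetical.
func exampleQuiesce(e *Engine, work func() error) error {
	e.disableLevelCompactions(true)      // levelWorkers++; level compactions stop
	defer e.enableLevelCompactions(true) // levelWorkers--; restart once it reaches zero
	return work()                        // runs while level compactions are disabled
}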
// SetCompactionsEnabled enables compactions on the engine. When disabled,
// all running compactions are aborted and new compactions stop running.
func (e *Engine) SetCompactionsEnabled(enabled bool) {
	if enabled {
		e.enableSnapshotCompactions()
		e.enableLevelCompactions(false)
	} else {
		e.disableSnapshotCompactions()
		e.disableLevelCompactions(false)
	}
}

// enableLevelCompactions will request that level compactions start back up again.
//
// 'wait' signifies that a corresponding call to disableLevelCompactions(true) was made at some
// point, and the associated task that required disabled compactions is now complete.
func (e *Engine) enableLevelCompactions(wait bool) {
	// If we don't need to wait, see if we're already enabled
	if !wait {
		e.mu.RLock()
		if e.done != nil {
			e.mu.RUnlock()
			return
		}
		e.mu.RUnlock()
	}

	e.mu.Lock()
	if wait {
		e.levelWorkers -= 1
	}
	if e.levelWorkers != 0 || e.done != nil {
		// still waiting on more workers or already enabled
		e.mu.Unlock()
		return
	}

	// last one to enable, start things back up
	e.Compactor.EnableCompactions()
	quit := make(chan struct{})
	e.done = quit

	e.wg.Add(4)
	e.mu.Unlock()

	go func() { defer e.wg.Done(); e.compactTSMFull(quit) }()
	go func() { defer e.wg.Done(); e.compactTSMLevel(true, 1, quit) }()
	go func() { defer e.wg.Done(); e.compactTSMLevel(true, 2, quit) }()
	go func() { defer e.wg.Done(); e.compactTSMLevel(false, 3, quit) }()
}

// disableLevelCompactions will stop level compactions before returning.
//
// If 'wait' is set to true, then a corresponding call to enableLevelCompactions(true) will be
// required before level compactions will start back up again.
func (e *Engine) disableLevelCompactions(wait bool) {
	e.mu.Lock()
	old := e.levelWorkers
	if wait {
		e.levelWorkers += 1
	}

	if old == 0 && e.done != nil {
		// Prevent new compactions from starting
		e.Compactor.DisableCompactions()

		// Stop all background compaction goroutines
		close(e.done)
		e.done = nil
	}

	e.mu.Unlock()
	e.wg.Wait()
}

func (e *Engine) enableSnapshotCompactions() {
	// Check if already enabled under read lock
	e.mu.RLock()
	if e.snapDone != nil {
		e.mu.RUnlock()
		return
	}
	e.mu.RUnlock()

	// Check again under write lock
	e.mu.Lock()
	if e.snapDone != nil {
		e.mu.Unlock()
		return
	}

	e.Compactor.EnableSnapshots()
	quit := make(chan struct{})
	e.snapDone = quit
	e.snapWG.Add(1)
	e.mu.Unlock()

	go func() { defer e.snapWG.Done(); e.compactCache(quit) }()
}

func (e *Engine) disableSnapshotCompactions() {
	e.mu.Lock()
	if e.snapDone != nil {
		close(e.snapDone)
		e.snapDone = nil
		e.Compactor.DisableSnapshots()
	}
	e.mu.Unlock()
	e.snapWG.Wait()

	// If the cache is empty, free up its resources as well.
	if e.Cache.Size() == 0 {
		e.Cache.Free()
	}
}

// Path returns the path the engine was opened with.
func (e *Engine) Path() string { return e.path }

func (e *Engine) SetFieldName(measurement []byte, name string) {
	e.index.SetFieldName(measurement, name)
}

func (e *Engine) MeasurementExists(name []byte) (bool, error) {
	return e.index.MeasurementExists(name)
}

func (e *Engine) MeasurementNamesByExpr(expr influxql.Expr) ([][]byte, error) {
	return e.index.MeasurementNamesByExpr(expr)
}

func (e *Engine) MeasurementNamesByRegex(re *regexp.Regexp) ([][]byte, error) {
	return e.index.MeasurementNamesByRegex(re)
}
// MeasurementFields returns the measurement fields for a measurement.
func (e *Engine) MeasurementFields(measurement []byte) *tsdb.MeasurementFields {
	return e.fieldset.CreateFieldsIfNotExists(measurement)
}

func (e *Engine) ForEachMeasurementSeriesByExpr(name []byte, condition influxql.Expr, fn func(tags models.Tags) error) error {
	return e.index.ForEachMeasurementSeriesByExpr(name, condition, fn)
}

func (e *Engine) HasTagKey(name, key []byte) (bool, error) {
	return e.index.HasTagKey(name, key)
}

func (e *Engine) MeasurementTagKeysByExpr(name []byte, expr influxql.Expr) (map[string]struct{}, error) {
	return e.index.MeasurementTagKeysByExpr(name, expr)
}

// MeasurementTagKeyValuesByExpr returns a set of tag values filtered by an expression.
//
// MeasurementTagKeyValuesByExpr relies on the provided tag keys being sorted.
// The caller can indicate the tag keys have been sorted by setting the
// keysSorted argument appropriately. Tag values are returned in a slice that
// is indexable according to the sorted order of the tag keys, e.g., the values
// for the earliest tag key will be available in index 0 of the returned values
// slice.
func (e *Engine) MeasurementTagKeyValuesByExpr(name []byte, keys []string, expr influxql.Expr, keysSorted bool) ([][]string, error) {
	return e.index.MeasurementTagKeyValuesByExpr(name, keys, expr, keysSorted)
}

func (e *Engine) ForEachMeasurementTagKey(name []byte, fn func(key []byte) error) error {
	return e.index.ForEachMeasurementTagKey(name, fn)
}

func (e *Engine) TagKeyCardinality(name, key []byte) int {
	return e.index.TagKeyCardinality(name, key)
}

// SeriesN returns the unique number of series in the index.
func (e *Engine) SeriesN() int64 {
	return e.index.SeriesN()
}

func (e *Engine) SeriesSketches() (estimator.Sketch, estimator.Sketch, error) {
	return e.index.SeriesSketches()
}

func (e *Engine) MeasurementsSketches() (estimator.Sketch, estimator.Sketch, error) {
	return e.index.MeasurementsSketches()
}

// LastModified returns the time when this shard was last modified.
func (e *Engine) LastModified() time.Time {
	walTime := e.WAL.LastWriteTime()
	fsTime := e.FileStore.LastModified()

	if walTime.After(fsTime) {
		return walTime
	}

	return fsTime
}

// EngineStatistics maintains statistics for the engine.
type EngineStatistics struct {
	CacheCompactions        int64 // Counter of cache compactions that have ever run.
	CacheCompactionsActive  int64 // Gauge of cache compactions currently running.
	CacheCompactionErrors   int64 // Counter of cache compactions that have failed due to error.
	CacheCompactionDuration int64 // Counter of number of wall nanoseconds spent in cache compactions.

	TSMCompactions        [3]int64 // Counter of TSM compactions (by level) that have ever run.
	TSMCompactionsActive  [3]int64 // Gauge of TSM compactions (by level) currently running.
	TSMCompactionErrors   [3]int64 // Counter of TSM compactions (by level) that have failed due to error.
	TSMCompactionDuration [3]int64 // Counter of number of wall nanoseconds spent in TSM compactions (by level).

	TSMOptimizeCompactions        int64 // Counter of optimize compactions that have ever run.
	TSMOptimizeCompactionsActive  int64 // Gauge of optimize compactions currently running.
	TSMOptimizeCompactionErrors   int64 // Counter of optimize compactions that have failed due to error.
	TSMOptimizeCompactionDuration int64 // Counter of number of wall nanoseconds spent in optimize compactions.

	TSMFullCompactions       int64 // Counter of full compactions that have ever run.
	TSMFullCompactionsActive int64 // Gauge of full compactions currently running.
	TSMFullCompactionErrors   int64 // Counter of full compactions that have failed due to error.
	TSMFullCompactionDuration int64 // Counter of number of wall nanoseconds spent in full compactions.
}

// Statistics returns statistics for periodic monitoring.
func (e *Engine) Statistics(tags map[string]string) []models.Statistic {
	statistics := make([]models.Statistic, 0, 4)
	statistics = append(statistics, models.Statistic{
		Name: "tsm1_engine",
		Tags: tags,
		Values: map[string]interface{}{
			statCacheCompactions:        atomic.LoadInt64(&e.stats.CacheCompactions),
			statCacheCompactionsActive:  atomic.LoadInt64(&e.stats.CacheCompactionsActive),
			statCacheCompactionError:    atomic.LoadInt64(&e.stats.CacheCompactionErrors),
			statCacheCompactionDuration: atomic.LoadInt64(&e.stats.CacheCompactionDuration),

			statTSMLevel1Compactions:        atomic.LoadInt64(&e.stats.TSMCompactions[0]),
			statTSMLevel1CompactionsActive:  atomic.LoadInt64(&e.stats.TSMCompactionsActive[0]),
			statTSMLevel1CompactionError:    atomic.LoadInt64(&e.stats.TSMCompactionErrors[0]),
			statTSMLevel1CompactionDuration: atomic.LoadInt64(&e.stats.TSMCompactionDuration[0]),

			statTSMLevel2Compactions:        atomic.LoadInt64(&e.stats.TSMCompactions[1]),
			statTSMLevel2CompactionsActive:  atomic.LoadInt64(&e.stats.TSMCompactionsActive[1]),
			statTSMLevel2CompactionError:    atomic.LoadInt64(&e.stats.TSMCompactionErrors[1]),
			statTSMLevel2CompactionDuration: atomic.LoadInt64(&e.stats.TSMCompactionDuration[1]),

			statTSMLevel3Compactions:        atomic.LoadInt64(&e.stats.TSMCompactions[2]),
			statTSMLevel3CompactionsActive:  atomic.LoadInt64(&e.stats.TSMCompactionsActive[2]),
			statTSMLevel3CompactionError:    atomic.LoadInt64(&e.stats.TSMCompactionErrors[2]),
			statTSMLevel3CompactionDuration: atomic.LoadInt64(&e.stats.TSMCompactionDuration[2]),

			statTSMOptimizeCompactions:        atomic.LoadInt64(&e.stats.TSMOptimizeCompactions),
			statTSMOptimizeCompactionsActive:  atomic.LoadInt64(&e.stats.TSMOptimizeCompactionsActive),
			statTSMOptimizeCompactionError:    atomic.LoadInt64(&e.stats.TSMOptimizeCompactionErrors),
			statTSMOptimizeCompactionDuration: atomic.LoadInt64(&e.stats.TSMOptimizeCompactionDuration),

			statTSMFullCompactions:        atomic.LoadInt64(&e.stats.TSMFullCompactions),
			statTSMFullCompactionsActive:  atomic.LoadInt64(&e.stats.TSMFullCompactionsActive),
			statTSMFullCompactionError:    atomic.LoadInt64(&e.stats.TSMFullCompactionErrors),
			statTSMFullCompactionDuration: atomic.LoadInt64(&e.stats.TSMFullCompactionDuration),
		},
	})

	statistics = append(statistics, e.Cache.Statistics(tags)...)
	statistics = append(statistics, e.FileStore.Statistics(tags)...)
	statistics = append(statistics, e.WAL.Statistics(tags)...)
	return statistics
}

// DiskSize returns the total size in bytes of all TSM and WAL segments on disk.
func (e *Engine) DiskSize() int64 {
	return e.FileStore.DiskSizeBytes() + e.WAL.DiskSizeBytes()
}

// Open opens and initializes the engine.
func (e *Engine) Open() error {
	if err := os.MkdirAll(e.path, 0777); err != nil {
		return err
	}

	if err := e.cleanup(); err != nil {
		return err
	}

	if err := e.WAL.Open(); err != nil {
		return err
	}

	if err := e.FileStore.Open(); err != nil {
		return err
	}

	if err := e.reloadCache(); err != nil {
		return err
	}

	e.Compactor.Open()

	if e.enableCompactionsOnOpen {
		e.SetCompactionsEnabled(true)
	}

	return nil
}

// Close closes the engine. Subsequent calls to Close are a nop.
func (e *Engine) Close() error {
	e.SetCompactionsEnabled(false)

	// Lock now and close everything else down.
	e.mu.Lock()
	defer e.mu.Unlock()
	e.done = nil // Ensures that the channel will not be closed again.
	if err := e.FileStore.Close(); err != nil {
		return err
	}
	return e.WAL.Close()
}

// WithLogger sets the logger for the engine.
func (e *Engine) WithLogger(log zap.Logger) {
	e.logger = log.With(zap.String("engine", "tsm1"))

	if e.traceLogging {
		e.traceLogger = e.logger
	}

	e.WAL.WithLogger(e.logger)
	e.FileStore.WithLogger(e.logger)
}

// LoadMetadataIndex loads the shard metadata into memory.
func (e *Engine) LoadMetadataIndex(shardID uint64, index tsdb.Index) error {
	now := time.Now()

	// Save reference to index for iterator creation.
	e.index = index

	if err := e.FileStore.WalkKeys(func(key []byte, typ byte) error {
		fieldType, err := tsmFieldTypeToInfluxQLDataType(typ)
		if err != nil {
			return err
		}

		if err := e.addToIndexFromKey(key, fieldType); err != nil {
			return err
		}
		return nil
	}); err != nil {
		return err
	}

	// load metadata from the Cache
	if err := e.Cache.ApplyEntryFn(func(key string, entry *entry) error {
		fieldType, err := entry.values.InfluxQLType()
		if err != nil {
			e.logger.Info(fmt.Sprintf("error getting the data type of values for key %s: %s", key, err.Error()))
		}

		if err := e.addToIndexFromKey([]byte(key), fieldType); err != nil {
			return err
		}
		return nil
	}); err != nil {
		return err
	}

	e.traceLogger.Info(fmt.Sprintf("Meta data index for shard %d loaded in %v", shardID, time.Since(now)))
	return nil
}

// IsIdle returns true if the cache is empty, there are no running compactions and the
// shard is fully compacted.
func (e *Engine) IsIdle() bool {
	cacheEmpty := e.Cache.Size() == 0

	runningCompactions := atomic.LoadInt64(&e.stats.CacheCompactionsActive)
	runningCompactions += atomic.LoadInt64(&e.stats.TSMCompactionsActive[0])
	runningCompactions += atomic.LoadInt64(&e.stats.TSMCompactionsActive[1])
	runningCompactions += atomic.LoadInt64(&e.stats.TSMCompactionsActive[2])
	runningCompactions += atomic.LoadInt64(&e.stats.TSMFullCompactionsActive)
	runningCompactions += atomic.LoadInt64(&e.stats.TSMOptimizeCompactionsActive)

	return cacheEmpty && runningCompactions == 0 && e.CompactionPlan.FullyCompacted()
}

// Backup writes a tar archive of any TSM files modified since the passed
// in time to the passed in writer. The basePath will be prepended to the names
// of the files in the archive. It will force a snapshot of the WAL first,
// then perform the backup with a read lock against the file store. This means
// that new TSM files will not be able to be created in this shard while the
// backup is running. For shards that are still actively getting writes, this
// could cause the WAL to back up, increasing memory usage and eventually rejecting writes.
func (e *Engine) Backup(w io.Writer, basePath string, since time.Time) error {
	path, err := e.CreateSnapshot()
	if err != nil {
		return err
	}

	if err := e.index.SnapshotTo(path); err != nil {
		return err
	}

	tw := tar.NewWriter(w)
	defer tw.Close()

	// Remove the temporary snapshot dir
	defer os.RemoveAll(path)

	// Recursively read all files from path.
	files, err := readDir(path, "")
	if err != nil {
		return err
	}

	// Filter paths to only changed files.
	var filtered []string
	for _, file := range files {
		fi, err := os.Stat(filepath.Join(path, file))
		if err != nil {
			return err
		} else if !fi.ModTime().After(since) {
			continue
		}
		filtered = append(filtered, file)
	}
	if len(filtered) == 0 {
		return nil
	}

	for _, f := range filtered {
		if err := e.writeFileToBackup(f, basePath, filepath.Join(path, f), tw); err != nil {
			return err
		}
	}

	return nil
}

// writeFileToBackup copies the file into the tar archive. Files will use the shardRelativePath
// in their names. This should be the shard-relative portion of the path.
func (e *Engine) writeFileToBackup(name string, shardRelativePath, fullPath string, tw *tar.Writer) error {
	f, err := os.Stat(fullPath)
	if err != nil {
		return err
	}

	h := &tar.Header{
		Name:    filepath.ToSlash(filepath.Join(shardRelativePath, name)),
		ModTime: f.ModTime(),
		Size:    f.Size(),
		Mode:    int64(f.Mode()),
	}
	if err := tw.WriteHeader(h); err != nil {
		return err
	}
	fr, err := os.Open(fullPath)
	if err != nil {
		return err
	}

	defer fr.Close()

	_, err = io.CopyN(tw, fr, h.Size)
	return err
}

// Restore reads a tar archive generated by Backup().
// Only files that match basePath will be copied into the directory. This obtains
// a write lock so no operations can be performed while restoring.
func (e *Engine) Restore(r io.Reader, basePath string) error {
	return e.overlay(r, basePath, false)
}

// Import reads a tar archive generated by Backup() and adds each
// file matching basePath as a new TSM file. This obtains
// a write lock so no operations can be performed while Importing.
func (e *Engine) Import(r io.Reader, basePath string) error {
	return e.overlay(r, basePath, true)
}

// overlay reads a tar archive generated by Backup() and adds each file
// from the archive matching basePath to the shard.
// If asNew is true, each file will be installed as a new TSM file even if a
// file with the same name already exists.
func (e *Engine) overlay(r io.Reader, basePath string, asNew bool) error {
	// Copy files from archive while under lock to prevent reopening.
	newFiles, err := func() ([]string, error) {
		e.mu.Lock()
		defer e.mu.Unlock()

		var newFiles []string
		tr := tar.NewReader(r)
		for {
			if fileName, err := e.readFileFromBackup(tr, basePath, asNew); err == io.EOF {
				break
			} else if err != nil {
				return nil, err
			} else if fileName != "" {
				newFiles = append(newFiles, fileName)
			}
		}

		if err := syncDir(e.path); err != nil {
			return nil, err
		}

		if err := e.FileStore.Replace(nil, newFiles); err != nil {
			return nil, err
		}
		return newFiles, nil
	}()
	if err != nil {
		return err
	}

	// Load any new series keys to the index
	readers := make([]chan seriesKey, 0, len(newFiles))
	for _, f := range newFiles {
		ch := make(chan seriesKey, 1)
		readers = append(readers, ch)

		// If asNew is true, the files created from readFileFromBackup will be new ones
		// having a temp extension.
		f = strings.TrimSuffix(f, ".tmp")

		fd, err := os.Open(f)
		if err != nil {
			return err
		}

		r, err := NewTSMReader(fd)
		if err != nil {
			return err
		}
		defer r.Close()

		go func(c chan seriesKey, r *TSMReader) {
			n := r.KeyCount()
			for i := 0; i < n; i++ {
				key, typ := r.KeyAt(i)
				c <- seriesKey{key, typ}
			}
			close(c)
		}(ch, r)
	}

	// Merge and dedup all the series keys across each reader to reduce
	// lock contention on the index.
	merged := merge(readers...)
	for v := range merged {
		fieldType, err := tsmFieldTypeToInfluxQLDataType(v.typ)
		if err != nil {
			return err
		}

		if err := e.addToIndexFromKey(v.key, fieldType); err != nil {
			return err
		}
	}
	return nil
}

// readFileFromBackup copies the next file from the archive into the shard.
// The file is skipped if it does not have a matching shardRelativePath prefix.
// If asNew is true, each file will be installed as a new TSM file even if a
// file with the same name already exists.
func (e *Engine) readFileFromBackup(tr *tar.Reader, shardRelativePath string, asNew bool) (string, error) {
	// Read next archive file.
	hdr, err := tr.Next()
	if err != nil {
		return "", err
	}

	nativeFileName := filepath.FromSlash(hdr.Name)

	// Skip file if it does not have a matching prefix.
	if !filepath.HasPrefix(nativeFileName, shardRelativePath) {
		return "", nil
	}

	filename, err := filepath.Rel(shardRelativePath, nativeFileName)
	if err != nil {
		return "", err
	}

	if asNew {
		filename = fmt.Sprintf("%09d-%09d.%s", e.FileStore.NextGeneration(), 1, TSMFileExtension)
	}

	destPath := filepath.Join(e.path, filename)
	tmp := destPath + ".tmp"

	// Create new file on disk.
	f, err := os.OpenFile(tmp, os.O_CREATE|os.O_RDWR, 0666)
	if err != nil {
		return "", err
	}
	defer f.Close()

	// Copy from archive to the file.
	if _, err := io.CopyN(f, tr, hdr.Size); err != nil {
		return "", err
	}

	// Sync to disk & close.
	if err := f.Sync(); err != nil {
		return "", err
	}

	return tmp, nil
}

// addToIndexFromKey will pull the measurement name, series key, and field
// name from a composite key and add it to the database index and measurement
// fields.
func (e *Engine) addToIndexFromKey(key []byte, fieldType influxql.DataType) error {
	seriesKey, field := SeriesAndFieldFromCompositeKey(key)
	name := tsdb.MeasurementFromSeriesKey(seriesKey)

	mf := e.fieldset.CreateFieldsIfNotExists(name)
	if err := mf.CreateFieldIfNotExists(field, fieldType, false); err != nil {
		return err
	}

	// Build in-memory index, if necessary.
	if e.index.Type() == inmem.IndexName {
		tags, _ := models.ParseTags(seriesKey)
		if err := e.index.InitializeSeries(seriesKey, name, tags); err != nil {
			return err
		}
	}

	return nil
}

// WritePoints writes metadata and point data into the engine.
// It returns an error if new points are added to an existing key.
func (e *Engine) WritePoints(points []models.Point) error {
	values := make(map[string][]Value, len(points))
	var keyBuf []byte
	var baseLen int

	for _, p := range points {
		keyBuf = append(keyBuf[:0], p.Key()...)
		keyBuf = append(keyBuf, keyFieldSeparator...)
		baseLen = len(keyBuf)
		iter := p.FieldIterator()
		t := p.Time().UnixNano()
		for iter.Next() {
			// Skip fields named "time"; they are illegal.
			if bytes.Equal(iter.FieldKey(), timeBytes) {
				continue
			}

			keyBuf = append(keyBuf[:baseLen], iter.FieldKey()...)
			var v Value
			switch iter.Type() {
			case models.Float:
				fv, err := iter.FloatValue()
				if err != nil {
					return err
				}
				v = NewFloatValue(t, fv)
			case models.Integer:
				iv, err := iter.IntegerValue()
				if err != nil {
					return err
				}
				v = NewIntegerValue(t, iv)
			case models.String:
				v = NewStringValue(t, iter.StringValue())
			case models.Boolean:
				bv, err := iter.BooleanValue()
				if err != nil {
					return err
				}
				v = NewBooleanValue(t, bv)
			default:
				return fmt.Errorf("unknown field type for %s: %s", string(iter.FieldKey()), p.String())
			}
			values[string(keyBuf)] = append(values[string(keyBuf)], v)
		}
	}

	e.mu.RLock()
	defer e.mu.RUnlock()

	// first try to write to the cache
	err := e.Cache.WriteMulti(values)
	if err != nil {
		return err
	}

	_, err = e.WAL.WriteMulti(values)
	return err
}

// containsSeries returns a map of keys indicating whether the key exists and
// has values or not.
func (e *Engine) containsSeries(keys [][]byte) (map[string]bool, error) {
	// keyMap is used to see if a given key exists. The keys
	// are the measurement + tagset (minus separator and field).
	keyMap := map[string]bool{}
	for _, k := range keys {
		keyMap[string(k)] = false
	}

	for _, k := range e.Cache.unsortedKeys() {
		seriesKey, _ := SeriesAndFieldFromCompositeKey([]byte(k))
		keyMap[string(seriesKey)] = true
	}

	if err := e.FileStore.WalkKeys(func(k []byte, _ byte) error {
		seriesKey, _ := SeriesAndFieldFromCompositeKey(k)
		if _, ok := keyMap[string(seriesKey)]; ok {
			keyMap[string(seriesKey)] = true
		}
		return nil
	}); err != nil {
		return nil, err
	}

	return keyMap, nil
}

// deleteSeries removes all series keys from the engine.
func (e *Engine) deleteSeries(seriesKeys [][]byte) error {
	return e.DeleteSeriesRange(seriesKeys, math.MinInt64, math.MaxInt64)
}

// DeleteSeriesRange removes the values between min and max (inclusive) from all series.
func (e *Engine) DeleteSeriesRange(seriesKeys [][]byte, min, max int64) error {
	if len(seriesKeys) == 0 {
		return nil
	}

	// Ensure keys are sorted since lower layers require them to be.
	if !bytesutil.IsSorted(seriesKeys) {
		bytesutil.Sort(seriesKeys)
	}

	// Disable and abort running compactions so that tombstones added to existing TSM
	// files don't get removed. This would cause deleted measurements/series to
	// re-appear once the compaction completed. We only disable the level compactions
	// so that snapshotting does not stop while writing out tombstones. If it is stopped,
	// and writing tombstones takes a long time, writes can get rejected due to the cache
	// filling up.
	e.disableLevelCompactions(true)
	defer e.enableLevelCompactions(true)

	tempKeys := seriesKeys[:]
	deleteKeys := make([]string, 0, len(seriesKeys))
	// go through the keys in the file store
	if err := e.FileStore.WalkKeys(func(k []byte, _ byte) error {
		seriesKey, _ := SeriesAndFieldFromCompositeKey(k)

		// Both tempKeys and keys walked are sorted, skip any passed in keys
		// that don't exist in our key set.
		for len(tempKeys) > 0 && bytes.Compare(tempKeys[0], seriesKey) < 0 {
			tempKeys = tempKeys[1:]
		}

		// Keys match, add the full series key to delete.
		if len(tempKeys) > 0 && bytes.Equal(tempKeys[0], seriesKey) {
			deleteKeys = append(deleteKeys, string(k))
		}

		return nil
	}); err != nil {
		return err
	}

	if err := e.FileStore.DeleteRange(deleteKeys, min, max); err != nil {
		return err
	}

	// find the keys in the cache and remove them
	walKeys := deleteKeys[:0]

	// ApplySerialEntryFn cannot return an error in this invocation.
	_ = e.Cache.ApplyEntryFn(func(k string, _ *entry) error {
		seriesKey, _ := SeriesAndFieldFromCompositeKey([]byte(k))

		// Cache does not walk keys in sorted order, so search the sorted
		// series we need to delete to see if any of the cache keys match.
		i := bytesutil.SearchBytes(seriesKeys, seriesKey)
		if i < len(seriesKeys) && bytes.Equal(seriesKey, seriesKeys[i]) {
			// k is the measurement + tags + sep + field
			walKeys = append(walKeys, k)
		}
		return nil
	})

	e.Cache.DeleteRange(walKeys, min, max)

	// delete from the WAL
	if _, err := e.WAL.DeleteRange(walKeys, min, max); err != nil {
		return err
	}

	// Have we deleted all points for the series? If so, we need to remove
	// the series from the index.
	existing, err := e.containsSeries(seriesKeys)
	if err != nil {
		return err
	}

	for k, exists := range existing {
		if !exists {
			if err := e.index.UnassignShard(k, e.id); err != nil {
				return err
			}
		}
	}

	return nil
}

// DeleteMeasurement deletes a measurement and all related series.
func (e *Engine) DeleteMeasurement(name []byte) error {
	// Delete the bulk of data outside of the fields lock.
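	// (Hedged note, not part of the original source: deleteMeasurement runs
	// twice by design. The first pass below removes the bulk of the data
	// without holding the fields lock; the second pass, under DeleteWithLock,
	// catches any series written concurrently with the first pass.)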
	if err := e.deleteMeasurement(name); err != nil {
		return err
	}

	// Under lock, delete any series created during deletion.
	if err := e.fieldset.DeleteWithLock(string(name), func() error {
		return e.deleteMeasurement(name)
	}); err != nil {
		return err
	}

	return nil
}

// deleteMeasurement deletes a measurement and all related series.
func (e *Engine) deleteMeasurement(name []byte) error {
	// Attempt to find the series keys.
	keys, err := e.index.MeasurementSeriesKeysByExpr(name, nil)
	if err != nil {
		return err
	} else if len(keys) > 0 {
		if err := e.deleteSeries(keys); err != nil {
			return err
		}
	}
	return nil
}

// ForEachMeasurementName iterates over each measurement name in the engine.
func (e *Engine) ForEachMeasurementName(fn func(name []byte) error) error {
	return e.index.ForEachMeasurementName(fn)
}

// MeasurementSeriesKeysByExpr returns a list of series keys matching expr.
func (e *Engine) MeasurementSeriesKeysByExpr(name []byte, expr influxql.Expr) ([][]byte, error) {
	return e.index.MeasurementSeriesKeysByExpr(name, expr)
}

func (e *Engine) CreateSeriesListIfNotExists(keys, names [][]byte, tagsSlice []models.Tags) error {
	return e.index.CreateSeriesListIfNotExists(keys, names, tagsSlice)
}

func (e *Engine) CreateSeriesIfNotExists(key, name []byte, tags models.Tags) error {
	return e.index.CreateSeriesIfNotExists(key, name, tags)
}

// WriteTo is not implemented.
func (e *Engine) WriteTo(w io.Writer) (n int64, err error) { panic("not implemented") }

// WriteSnapshot will snapshot the cache and write a new TSM file with its contents, releasing the snapshot when done.
func (e *Engine) WriteSnapshot() error {
	// Lock and grab the cache snapshot along with all the closed WAL
	// filenames associated with the snapshot.

	var started *time.Time

	defer func() {
		if started != nil {
			e.Cache.UpdateCompactTime(time.Since(*started))
			e.logger.Info(fmt.Sprintf("Snapshot for path %s written in %v", e.path, time.Since(*started)))
		}
	}()

	closedFiles, snapshot, err := func() ([]string, *Cache, error) {
		e.mu.Lock()
		defer e.mu.Unlock()

		now := time.Now()
		started = &now

		if err := e.WAL.CloseSegment(); err != nil {
			return nil, nil, err
		}

		segments, err := e.WAL.ClosedSegments()
		if err != nil {
			return nil, nil, err
		}

		snapshot, err := e.Cache.Snapshot()
		if err != nil {
			return nil, nil, err
		}

		return segments, snapshot, nil
	}()
	if err != nil {
		return err
	}

	if snapshot.Size() == 0 {
		e.Cache.ClearSnapshot(true)
		return nil
	}

	// The snapshotted cache may have duplicate points and unsorted data. We need to deduplicate
	// it before writing the snapshot. This can be very expensive so it's done while we are not
	// holding the engine write lock.
	dedup := time.Now()
	snapshot.Deduplicate()
	e.traceLogger.Info(fmt.Sprintf("Snapshot for path %s deduplicated in %v", e.path, time.Since(dedup)))

	return e.writeSnapshotAndCommit(closedFiles, snapshot)
}

// CreateSnapshot will create a temp directory that holds
// temporary hardlinks to the underlying shard files.
func (e *Engine) CreateSnapshot() (string, error) {
	if err := e.WriteSnapshot(); err != nil {
		return "", err
	}

	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.FileStore.CreateSnapshot()
}
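// exampleConsistentDiskView is a hedged sketch, not part of the original
// file: WriteSnapshot above swaps the active cache for a snapshot under the
// engine lock, then deduplicates and writes it to TSM outside the lock. A
// caller that wants a consistent on-disk view, e.g. before a backup, only
// needs the two exported calls; the returned path is a temp directory of
// hardlinks.
func exampleConsistentDiskView(e *Engine) (string, error) {
	// CreateSnapshot internally calls WriteSnapshot first, so all points
	// accepted before this call are flushed from the cache into TSM files
	// before the hardlink snapshot is taken.
	return e.CreateSnapshot()
}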
// writeSnapshotAndCommit will write the passed cache to a new TSM file and remove the closed WAL segments.
func (e *Engine) writeSnapshotAndCommit(closedFiles []string, snapshot *Cache) (err error) {
	defer func() {
		if err != nil {
			e.Cache.ClearSnapshot(false)
		}
	}()

	// write the new snapshot files
	newFiles, err := e.Compactor.WriteSnapshot(snapshot)
	if err != nil {
		e.logger.Info(fmt.Sprintf("error writing snapshot from compactor: %v", err))
		return err
	}

	e.mu.RLock()
	defer e.mu.RUnlock()

	// update the file store with these new files
	if err := e.FileStore.Replace(nil, newFiles); err != nil {
		e.logger.Info(fmt.Sprintf("error adding new TSM files from snapshot: %v", err))
		return err
	}

	// clear the snapshot from the in-memory cache, then the old WAL files
	e.Cache.ClearSnapshot(true)

	if err := e.WAL.Remove(closedFiles); err != nil {
		e.logger.Info(fmt.Sprintf("error removing closed wal segments: %v", err))
	}

	return nil
}

// compactCache continually checks if the WAL cache should be written to disk.
func (e *Engine) compactCache(quit <-chan struct{}) {
	t := time.NewTicker(time.Second)
	defer t.Stop()
	for {
		select {
		case <-quit:
			return

		case <-t.C:
			e.Cache.UpdateAge()
			if e.ShouldCompactCache(e.WAL.LastWriteTime()) {
				start := time.Now()
				e.traceLogger.Info(fmt.Sprintf("Compacting cache for %s", e.path))
				err := e.WriteSnapshot()
				if err != nil && err != errCompactionsDisabled {
					e.logger.Info(fmt.Sprintf("error writing snapshot: %v", err))
					atomic.AddInt64(&e.stats.CacheCompactionErrors, 1)
				} else {
					atomic.AddInt64(&e.stats.CacheCompactions, 1)
				}
				atomic.AddInt64(&e.stats.CacheCompactionDuration, time.Since(start).Nanoseconds())
			}
		}
	}
}

// ShouldCompactCache returns true if the Cache is over its flush threshold
// or if the passed in lastWriteTime is older than the write cold threshold.
func (e *Engine) ShouldCompactCache(lastWriteTime time.Time) bool {
	sz := e.Cache.Size()

	if sz == 0 {
		return false
	}

	return sz > e.CacheFlushMemorySizeThreshold ||
		time.Since(lastWriteTime) > e.CacheFlushWriteColdDuration
}

func (e *Engine) compactTSMLevel(fast bool, level int, quit <-chan struct{}) {
	t := time.NewTicker(time.Second)
	defer t.Stop()

	for {
		select {
		case <-quit:
			return

		case <-t.C:
			s := e.levelCompactionStrategy(fast, level)
			if s != nil {
				s.Apply()
				// Release the files in the compaction plan
				e.CompactionPlan.Release(s.compactionGroups)
			}
		}
	}
}

func (e *Engine) compactTSMFull(quit <-chan struct{}) {
	t := time.NewTicker(time.Second)
	defer t.Stop()

	for {
		select {
		case <-quit:
			return

		case <-t.C:
			s := e.fullCompactionStrategy()
			if s != nil {
				s.Apply()
				// Release the files in the compaction plan
				e.CompactionPlan.Release(s.compactionGroups)
			}
		}
	}
}

// onFileStoreReplace is a callback handler invoked when the FileStore
// has replaced one set of TSM files with a new set.
func (e *Engine) onFileStoreReplace(newFiles []TSMFile) {
	// Load any new series keys to the index
	readers := make([]chan seriesKey, 0, len(newFiles))
	for _, r := range newFiles {
		ch := make(chan seriesKey, 1)
		readers = append(readers, ch)

		go func(c chan seriesKey, r TSMFile) {
			n := r.KeyCount()
			for i := 0; i < n; i++ {
				key, typ := r.KeyAt(i)
				c <- seriesKey{key, typ}
			}
			close(c)
		}(ch, r)
	}

	// Merge and dedup all the series keys across each reader to reduce
	// lock contention on the index.
	merged := merge(readers...)
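	// (Hedged note, not part of the original source: merge, defined elsewhere
	// in this package, fans the per-file key channels into a single
	// deduplicated stream, so each series key is added to the index once
	// in the loop below.)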
	for v := range merged {
		fieldType, err := tsmFieldTypeToInfluxQLDataType(v.typ)
		if err != nil {
			e.logger.Error(fmt.Sprintf("refresh index (1): %v", err))
			continue
		}

		if err := e.addToIndexFromKey(v.key, fieldType); err != nil {
			e.logger.Error(fmt.Sprintf("refresh index (2): %v", err))
			continue
		}
	}

	// load metadata from the Cache
	e.Cache.ApplyEntryFn(func(key string, entry *entry) error {
		fieldType, err := entry.InfluxQLType()
		if err != nil {
			e.logger.Error(fmt.Sprintf("refresh index (3): %v", err))
			return nil
		}

		if err := e.addToIndexFromKey([]byte(key), fieldType); err != nil {
			e.logger.Error(fmt.Sprintf("refresh index (4): %v", err))
			return nil
		}

		return nil
	})
}

// compactionStrategy holds the details of what to do in a compaction.
type compactionStrategy struct {
	compactionGroups []CompactionGroup

	// concurrency determines how many compaction groups will be started
	// concurrently. These groups may be limited by the global limiter if
	// enabled.
	concurrency int
	fast        bool
	description string

	durationStat *int64
	activeStat   *int64
	successStat  *int64
	errorStat    *int64

	logger    zap.Logger
	compactor *Compactor
	fileStore *FileStore
	limiter   limiter.Fixed
	engine    *Engine
}

// Apply concurrently compacts all the groups in a compaction strategy.
func (s *compactionStrategy) Apply() {
	start := time.Now()

	// cap concurrent compaction groups to no more than 4 at a time.
	concurrency := s.concurrency
	if concurrency == 0 {
		concurrency = 4
	}

	throttle := limiter.NewFixed(concurrency)
	var wg sync.WaitGroup
	for i := range s.compactionGroups {
		wg.Add(1)
		go func(groupNum int) {
			defer wg.Done()

			// limit concurrent compaction groups
			throttle.Take()
			defer throttle.Release()

			s.compactGroup(groupNum)
		}(i)
	}
	wg.Wait()

	atomic.AddInt64(s.durationStat, time.Since(start).Nanoseconds())
}

// compactGroup executes the compaction strategy against a single CompactionGroup.
func (s *compactionStrategy) compactGroup(groupNum int) {
	// Limit concurrent compactions if we have a limiter
	if cap(s.limiter) > 0 {
		s.limiter.Take()
		defer s.limiter.Release()
	}

	group := s.compactionGroups[groupNum]
	start := time.Now()
	s.logger.Info(fmt.Sprintf("beginning %s compaction of group %d, %d TSM files", s.description, groupNum, len(group)))
	for i, f := range group {
		s.logger.Info(fmt.Sprintf("compacting %s group (%d) %s (#%d)", s.description, groupNum, f, i))
	}

	files, err := func() ([]string, error) {
		// Count the compaction as active only while the compaction is actually running.
		atomic.AddInt64(s.activeStat, 1)
		defer atomic.AddInt64(s.activeStat, -1)

		if s.fast {
			return s.compactor.CompactFast(group)
		} else {
			return s.compactor.CompactFull(group)
		}
	}()

	if err != nil {
		_, inProgress := err.(errCompactionInProgress)
		if err == errCompactionsDisabled || inProgress {
			s.logger.Info(fmt.Sprintf("aborted %s compaction group (%d). %v", s.description, groupNum, err))

			if _, ok := err.(errCompactionInProgress); ok {
				time.Sleep(time.Second)
			}
			return
		}

		s.logger.Info(fmt.Sprintf("error compacting TSM files: %v", err))
		atomic.AddInt64(s.errorStat, 1)
		time.Sleep(time.Second)
		return
	}

	if err := s.fileStore.ReplaceWithCallback(group, files, s.engine.onFileStoreReplace); err != nil {
		s.logger.Info(fmt.Sprintf("error replacing new TSM files: %v", err))
		atomic.AddInt64(s.errorStat, 1)
		time.Sleep(time.Second)
		return
	}

	for i, f := range files {
		s.logger.Info(fmt.Sprintf("compacted %s group (%d) into %s (#%d)", s.description, groupNum, f, i))
	}
	s.logger.Info(fmt.Sprintf("compacted %s %d files into %d files in %s", s.description, len(group), len(files), time.Since(start)))
	atomic.AddInt64(s.successStat, 1)
}

// levelCompactionStrategy returns a compactionStrategy for the given level.
// It returns nil if there are no TSM files to compact.
func (e *Engine) levelCompactionStrategy(fast bool, level int) *compactionStrategy {
	compactionGroups := e.CompactionPlan.PlanLevel(level)
	if len(compactionGroups) == 0 {
		return nil
	}

	return &compactionStrategy{
		concurrency:      4,
		compactionGroups: compactionGroups,
		logger:           e.logger,
		fileStore:        e.FileStore,
		compactor:        e.Compactor,
		fast:             fast,
		limiter:          e.compactionLimiter,
		engine:           e,

		description:  fmt.Sprintf("level %d", level),
		activeStat:   &e.stats.TSMCompactionsActive[level-1],
		successStat:  &e.stats.TSMCompactions[level-1],
		errorStat:    &e.stats.TSMCompactionErrors[level-1],
		durationStat: &e.stats.TSMCompactionDuration[level-1],
	}
}

// fullCompactionStrategy returns a compactionStrategy for higher level generations of TSM files.
// It returns nil if there are no TSM files to compact.
func (e *Engine) fullCompactionStrategy() *compactionStrategy {
	optimize := false
	compactionGroups := e.CompactionPlan.Plan(e.WAL.LastWriteTime())

	if len(compactionGroups) == 0 {
		optimize = true
		compactionGroups = e.CompactionPlan.PlanOptimize()
	}

	if len(compactionGroups) == 0 {
		return nil
	}

	s := &compactionStrategy{
		concurrency:      1,
		compactionGroups: compactionGroups,
		logger:           e.logger,
		fileStore:        e.FileStore,
		compactor:        e.Compactor,
		fast:             optimize,
		limiter:          e.compactionLimiter,
		engine:           e,
	}

	if optimize {
		s.description = "optimize"
		s.activeStat = &e.stats.TSMOptimizeCompactionsActive
		s.successStat = &e.stats.TSMOptimizeCompactions
		s.errorStat = &e.stats.TSMOptimizeCompactionErrors
		s.durationStat = &e.stats.TSMOptimizeCompactionDuration
	} else {
		s.description = "full"
		s.activeStat = &e.stats.TSMFullCompactionsActive
		s.successStat = &e.stats.TSMFullCompactions
		s.errorStat = &e.stats.TSMFullCompactionErrors
		s.durationStat = &e.stats.TSMFullCompactionDuration
	}

	return s
}

// reloadCache reads the WAL segment files and loads them into the cache.
func (e *Engine) reloadCache() error {
	now := time.Now()
	files, err := segmentFileNames(e.WAL.Path())
	if err != nil {
		return err
	}

	limit := e.Cache.MaxSize()
	defer func() {
		e.Cache.SetMaxSize(limit)
	}()

	// Disable the max size during loading
	e.Cache.SetMaxSize(0)

	loader := NewCacheLoader(files)
	loader.WithLogger(e.logger)
	if err := loader.Load(e.Cache); err != nil {
		return err
	}

	e.traceLogger.Info(fmt.Sprintf("Reloaded WAL cache %s in %v", e.WAL.Path(), time.Since(now)))
	return nil
}
// cleanup removes all temp files and dirs that exist on disk. This should only be run at startup to avoid
// removing tmp files that are still in use.
func (e *Engine) cleanup() error {
	allfiles, err := ioutil.ReadDir(e.path)
	if os.IsNotExist(err) {
		return nil
	} else if err != nil {
		return err
	}

	for _, f := range allfiles {
		// Check to see if there are any `.tmp` directories that were left over from failed shard snapshots
		if f.IsDir() && strings.HasSuffix(f.Name(), ".tmp") {
			if err := os.RemoveAll(filepath.Join(e.path, f.Name())); err != nil {
				return fmt.Errorf("error removing tmp snapshot directory %q: %s", f.Name(), err)
			}
		}
	}

	return e.cleanupTempTSMFiles()
}

func (e *Engine) cleanupTempTSMFiles() error {
	files, err := filepath.Glob(filepath.Join(e.path, fmt.Sprintf("*.%s", CompactionTempExtension)))
	if err != nil {
		return fmt.Errorf("error getting compaction temp files: %s", err.Error())
	}

	for _, f := range files {
		if err := os.Remove(f); err != nil {
			return fmt.Errorf("error removing temp compaction files: %v", err)
		}
	}
	return nil
}

// KeyCursor returns a KeyCursor for the given key starting at time t.
func (e *Engine) KeyCursor(key string, t int64, ascending bool) *KeyCursor {
	return e.FileStore.KeyCursor(key, t, ascending)
}

// CreateIterator returns an iterator for the measurement based on opt.
func (e *Engine) CreateIterator(measurement string, opt influxql.IteratorOptions) (influxql.Iterator, error) {
	if call, ok := opt.Expr.(*influxql.Call); ok {
		if opt.Interval.IsZero() {
			if call.Name == "first" || call.Name == "last" {
				refOpt := opt
				refOpt.Limit = 1
				refOpt.Ascending = call.Name == "first"
				refOpt.Ordered = true
				refOpt.Expr = call.Args[0]

				itrs, err := e.createVarRefIterator(measurement, refOpt)
				if err != nil {
					return nil, err
				}
				return newMergeFinalizerIterator(itrs, opt, e.logger)
			}
		}

		inputs, err := e.createCallIterator(measurement, call, opt)
		if err != nil {
			return nil, err
		} else if len(inputs) == 0 {
			return nil, nil
		}
		return newMergeFinalizerIterator(inputs, opt, e.logger)
	}

	itrs, err := e.createVarRefIterator(measurement, opt)
	if err != nil {
		return nil, err
	}
	return newMergeFinalizerIterator(itrs, opt, e.logger)
}

func (e *Engine) createCallIterator(measurement string, call *influxql.Call, opt influxql.IteratorOptions) ([]influxql.Iterator, error) {
	ref, _ := call.Args[0].(*influxql.VarRef)

	if exists, err := e.index.MeasurementExists([]byte(measurement)); err != nil {
		return nil, err
	} else if !exists {
		return nil, nil
	}

	// Determine tagsets for this measurement based on dimensions and filters.
	tagSets, err := e.index.TagSets([]byte(measurement), opt)
	if err != nil {
		return nil, err
	}

	// Reverse the tag sets if we are ordering by descending.
	if !opt.Ascending {
		for _, t := range tagSets {
			t.Reverse()
		}
	}

	// Calculate tag sets and apply SLIMIT/SOFFSET.
	tagSets = influxql.LimitTagSets(tagSets, opt.SLimit, opt.SOffset)

	itrs := make([]influxql.Iterator, 0, len(tagSets))
	if err := func() error {
		for _, t := range tagSets {
			// Abort if the query was killed
			select {
			case <-opt.InterruptCh:
				influxql.Iterators(itrs).Close()
				// 'err' is always nil at this point; return an explicit
				// interrupt error instead of silently returning nil.
				return influxql.ErrQueryInterrupted
			default:
			}

			inputs, err := e.createTagSetIterators(ref, measurement, t, opt)
			if err != nil {
				return err
			} else if len(inputs) == 0 {
				continue
			}

			// Wrap each series in a call iterator.
			for i, input := range inputs {
				if opt.InterruptCh != nil {
					input = influxql.NewInterruptIterator(input, opt.InterruptCh)
				}

				itr, err := influxql.NewCallIterator(input, opt)
				if err != nil {
					influxql.Iterators(inputs).Close()
					return err
				}
				inputs[i] = itr
			}

			itr := influxql.NewParallelMergeIterator(inputs, opt, runtime.GOMAXPROCS(0))
			itrs = append(itrs, itr)
		}
		return nil
	}(); err != nil {
		influxql.Iterators(itrs).Close()
		return nil, err
	}

	return itrs, nil
}

// createVarRefIterator creates an iterator for a variable reference.
func (e *Engine) createVarRefIterator(measurement string, opt influxql.IteratorOptions) ([]influxql.Iterator, error) {
	ref, _ := opt.Expr.(*influxql.VarRef)

	if exists, err := e.index.MeasurementExists([]byte(measurement)); err != nil {
		return nil, err
	} else if !exists {
		return nil, nil
	}

	// Determine tagsets for this measurement based on dimensions and filters.
	tagSets, err := e.index.TagSets([]byte(measurement), opt)
	if err != nil {
		return nil, err
	}

	// Reverse the tag sets if we are ordering by descending.
	if !opt.Ascending {
		for _, t := range tagSets {
			t.Reverse()
		}
	}

	// Calculate tag sets and apply SLIMIT/SOFFSET.
	tagSets = influxql.LimitTagSets(tagSets, opt.SLimit, opt.SOffset)

	itrs := make([]influxql.Iterator, 0, len(tagSets))
	if err := func() error {
		for _, t := range tagSets {
			inputs, err := e.createTagSetIterators(ref, measurement, t, opt)
			if err != nil {
				return err
			} else if len(inputs) == 0 {
				continue
			}

			// If we have a LIMIT or OFFSET and the grouping of the outer query
			// is different than the current grouping, we need to perform the
			// limit on each of the individual series keys instead to improve
			// performance.
			if (opt.Limit > 0 || opt.Offset > 0) && len(opt.Dimensions) != len(opt.GroupBy) {
				for i, input := range inputs {
					inputs[i] = newLimitIterator(input, opt)
				}
			}

			itr, err := influxql.Iterators(inputs).Merge(opt)
			if err != nil {
				influxql.Iterators(inputs).Close()
				return err
			}

			// Apply a limit on the merged iterator.
			if opt.Limit > 0 || opt.Offset > 0 {
				if len(opt.Dimensions) == len(opt.GroupBy) {
					// When the final dimensions and the current grouping are
					// the same, we will only produce one series so we can use
					// the faster limit iterator.
					itr = newLimitIterator(itr, opt)
				} else {
					// When the dimensions are different than the current
					// grouping, we need to account for the possibility there
					// will be multiple series. The limit iterator in the
					// influxql package handles that scenario.
					itr = influxql.NewLimitIterator(itr, opt)
				}
			}
			itrs = append(itrs, itr)
		}
		return nil
	}(); err != nil {
		influxql.Iterators(itrs).Close()
		return nil, err
	}

	return itrs, nil
}

// createTagSetIterators creates a set of iterators for a tagset.
func (e *Engine) createTagSetIterators(ref *influxql.VarRef, name string, t *influxql.TagSet, opt influxql.IteratorOptions) ([]influxql.Iterator, error) {
	// Set parallelism by number of logical cpus.
	parallelism := runtime.GOMAXPROCS(0)
	if parallelism > len(t.SeriesKeys) {
		parallelism = len(t.SeriesKeys)
	}

	// Create series key groupings w/ return error.
	groups := make([]struct {
		keys    []string
		filters []influxql.Expr
		itrs    []influxql.Iterator
		err     error
	}, parallelism)

	// Group series keys.
	n := len(t.SeriesKeys) / parallelism
	for i := 0; i < parallelism; i++ {
		group := &groups[i]

		if i < parallelism-1 {
			group.keys = t.SeriesKeys[i*n : (i+1)*n]
			group.filters = t.Filters[i*n : (i+1)*n]
		} else {
			group.keys = t.SeriesKeys[i*n:]
			group.filters = t.Filters[i*n:]
		}
	}

	// Read series groups in parallel.
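	// (Hedged note, not part of the original source: each of the first
	// parallelism-1 groups above holds exactly n series keys; the last group
	// absorbs the remainder, so every series key is read by exactly one
	// goroutine below.)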
	var wg sync.WaitGroup
	for i := range groups {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			groups[i].itrs, groups[i].err = e.createTagSetGroupIterators(ref, name, groups[i].keys, t, groups[i].filters, opt)
		}(i)
	}
	wg.Wait()

	// Determine total number of iterators so we can allocate only once.
	var itrN int
	for _, group := range groups {
		itrN += len(group.itrs)
	}

	// Combine all iterators together and check for errors.
	var err error
	itrs := make([]influxql.Iterator, 0, itrN)
	for _, group := range groups {
		if group.err != nil {
			err = group.err
		}
		itrs = append(itrs, group.itrs...)
	}

	// If an error occurred, make sure we close all created iterators.
	if err != nil {
		influxql.Iterators(itrs).Close()
		return nil, err
	}

	return itrs, nil
}

// createTagSetGroupIterators creates a set of iterators for a subset of a tagset's series.
func (e *Engine) createTagSetGroupIterators(ref *influxql.VarRef, name string, seriesKeys []string, t *influxql.TagSet, filters []influxql.Expr, opt influxql.IteratorOptions) ([]influxql.Iterator, error) {
	itrs := make([]influxql.Iterator, 0, len(seriesKeys))
	for i, seriesKey := range seriesKeys {
		var conditionFields []influxql.VarRef
		if filters[i] != nil {
			// Retrieve non-time fields from this series filter and filter out tags.
			conditionFields = influxql.ExprNames(filters[i])
		}

		itr, err := e.createVarRefSeriesIterator(ref, name, seriesKey, t, filters[i], conditionFields, opt)
		if err != nil {
			return itrs, err
		} else if itr == nil {
			continue
		}
		itrs = append(itrs, itr)

		// Abort if the query was killed
		select {
		case <-opt.InterruptCh:
			influxql.Iterators(itrs).Close()
			// 'err' is always nil at this point; return an explicit
			// interrupt error instead of silently returning nil.
			return nil, influxql.ErrQueryInterrupted
		default:
		}

		// Enforce series limit at creation time.
		if opt.MaxSeriesN > 0 && len(itrs) > opt.MaxSeriesN {
			influxql.Iterators(itrs).Close()
			return nil, fmt.Errorf("max-select-series limit exceeded: (%d/%d)", len(itrs), opt.MaxSeriesN)
		}
	}
	return itrs, nil
}

// createVarRefSeriesIterator creates an iterator for a variable reference for a series.
func (e *Engine) createVarRefSeriesIterator(ref *influxql.VarRef, name string, seriesKey string, t *influxql.TagSet, filter influxql.Expr, conditionFields []influxql.VarRef, opt influxql.IteratorOptions) (influxql.Iterator, error) {
	_, tfs := models.ParseKey([]byte(seriesKey))
	tags := influxql.NewTags(tfs.Map())

	// Create options specific for this series.
	itrOpt := opt
	itrOpt.Condition = filter

	// Build auxiliary cursors.
	// Tag values should be returned if the field doesn't exist.
	var aux []cursorAt
	if len(opt.Aux) > 0 {
		aux = make([]cursorAt, len(opt.Aux))
		for i, ref := range opt.Aux {
			// Create cursor from field if a tag wasn't requested.
			if ref.Type != influxql.Tag {
				cur := e.buildCursor(name, seriesKey, &ref, opt)
				if cur != nil {
					aux[i] = newBufCursor(cur, opt.Ascending)
					continue
				}

				// If a field was requested, use a nil cursor of the requested type.
				switch ref.Type {
				case influxql.Float, influxql.AnyField:
					aux[i] = &floatNilLiteralCursor{}
					continue
				case influxql.Integer:
					aux[i] = &integerNilLiteralCursor{}
					continue
				case influxql.String:
					aux[i] = &stringNilLiteralCursor{}
					continue
				case influxql.Boolean:
					aux[i] = &booleanNilLiteralCursor{}
					continue
				}
			}

			// If field doesn't exist, use the tag value.
			if v := tags.Value(ref.Val); v == "" {
				// However, if the tag value is blank then return a null.
				aux[i] = &stringNilLiteralCursor{}
			} else {
				aux[i] = &stringLiteralCursor{value: v}
			}
		}
	}

	// Build conditional field cursors.
	// If a conditional field doesn't exist then ignore the series.
	var conds []cursorAt
	if len(conditionFields) > 0 {
		conds = make([]cursorAt, len(conditionFields))
		for i, ref := range conditionFields {
			// Create cursor from field if a tag wasn't requested.
			if ref.Type != influxql.Tag {
				cur := e.buildCursor(name, seriesKey, &ref, opt)
				if cur != nil {
					conds[i] = newBufCursor(cur, opt.Ascending)
					continue
				}

				// If a field was requested, use a nil cursor of the requested type.
				switch ref.Type {
				case influxql.Float, influxql.AnyField:
					conds[i] = &floatNilLiteralCursor{}
					continue
				case influxql.Integer:
					conds[i] = &integerNilLiteralCursor{}
					continue
				case influxql.String:
					conds[i] = &stringNilLiteralCursor{}
					continue
				case influxql.Boolean:
					conds[i] = &booleanNilLiteralCursor{}
					continue
				}
			}

			// If field doesn't exist, use the tag value.
			if v := tags.Value(ref.Val); v == "" {
				// However, if the tag value is blank then return a null.
				conds[i] = &stringNilLiteralCursor{}
			} else {
				conds[i] = &stringLiteralCursor{value: v}
			}
		}
	}
	condNames := influxql.VarRefs(conditionFields).Strings()

	// Limit tags to only the dimensions selected.
	dimensions := opt.GetDimensions()
	tags = tags.Subset(dimensions)

	// If it's only auxiliary fields then it doesn't matter what type of iterator we use.
	if ref == nil {
		return newFloatIterator(name, tags, itrOpt, nil, aux, conds, condNames), nil
	}

	// Build main cursor.
	cur := e.buildCursor(name, seriesKey, ref, opt)

	// If the field doesn't exist then don't build an iterator.
	if cur == nil {
		cursorsAt(aux).close()
		cursorsAt(conds).close()
		return nil, nil
	}

	switch cur := cur.(type) {
	case floatCursor:
		return newFloatIterator(name, tags, itrOpt, cur, aux, conds, condNames), nil
	case integerCursor:
		return newIntegerIterator(name, tags, itrOpt, cur, aux, conds, condNames), nil
	case stringCursor:
		return newStringIterator(name, tags, itrOpt, cur, aux, conds, condNames), nil
	case booleanCursor:
		return newBooleanIterator(name, tags, itrOpt, cur, aux, conds, condNames), nil
	default:
		panic("unreachable")
	}
}

// buildCursor creates an untyped cursor for a field.
func (e *Engine) buildCursor(measurement, seriesKey string, ref *influxql.VarRef, opt influxql.IteratorOptions) cursor {
	// Look up fields for measurement.
	mf := e.fieldset.Fields(measurement)
	if mf == nil {
		return nil
	}

	// Find individual field.
	f := mf.Field(ref.Val)
	if f == nil {
		return nil
	}

	// Check if we need to perform a cast. Performing a cast in the
	// engine (if it is possible) is much more efficient than an automatic cast.
	if ref.Type != influxql.Unknown && ref.Type != influxql.AnyField && ref.Type != f.Type {
		switch ref.Type {
		case influxql.Float:
			switch f.Type {
			case influxql.Integer:
				cur := e.buildIntegerCursor(measurement, seriesKey, ref.Val, opt)
				return &floatCastIntegerCursor{cursor: cur}
			}
		case influxql.Integer:
			switch f.Type {
			case influxql.Float:
				cur := e.buildFloatCursor(measurement, seriesKey, ref.Val, opt)
				return &integerCastFloatCursor{cursor: cur}
			}
		}
		return nil
	}

	// Return appropriate cursor based on type.
	switch f.Type {
	case influxql.Float:
		return e.buildFloatCursor(measurement, seriesKey, ref.Val, opt)
	case influxql.Integer:
		return e.buildIntegerCursor(measurement, seriesKey, ref.Val, opt)
	case influxql.String:
		return e.buildStringCursor(measurement, seriesKey, ref.Val, opt)
	case influxql.Boolean:
		return e.buildBooleanCursor(measurement, seriesKey, ref.Val, opt)
	default:
		panic("unreachable")
	}
}
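// exampleFloatViewOfIntegerField is a hedged sketch, not part of the original
// file: buildCursor above performs int<->float casts in the engine instead of
// relying on an automatic cast, so requesting a float view of an integer
// field yields a floatCastIntegerCursor. The measurement and field names here
// are hypothetical.
func exampleFloatViewOfIntegerField(e *Engine, seriesKey string, opt influxql.IteratorOptions) cursor {
	ref := &influxql.VarRef{Val: "value", Type: influxql.Float} // field "value" assumed stored as an integer
	return e.buildCursor("cpu", seriesKey, ref, opt)            // returns a casting cursor, or nil if the field is missing
}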
// buildFloatCursor creates a cursor for a float field.
func (e *Engine) buildFloatCursor(measurement, seriesKey, field string, opt influxql.IteratorOptions) floatCursor {
	cacheValues := e.Cache.Values(SeriesFieldKey(seriesKey, field))
	keyCursor := e.KeyCursor(SeriesFieldKey(seriesKey, field), opt.SeekTime(), opt.Ascending)
	return newFloatCursor(opt.SeekTime(), opt.Ascending, cacheValues, keyCursor)
}

// buildIntegerCursor creates a cursor for an integer field.
func (e *Engine) buildIntegerCursor(measurement, seriesKey, field string, opt influxql.IteratorOptions) integerCursor {
	cacheValues := e.Cache.Values(SeriesFieldKey(seriesKey, field))
	keyCursor := e.KeyCursor(SeriesFieldKey(seriesKey, field), opt.SeekTime(), opt.Ascending)
	return newIntegerCursor(opt.SeekTime(), opt.Ascending, cacheValues, keyCursor)
}

// buildStringCursor creates a cursor for a string field.
func (e *Engine) buildStringCursor(measurement, seriesKey, field string, opt influxql.IteratorOptions) stringCursor {
	cacheValues := e.Cache.Values(SeriesFieldKey(seriesKey, field))
	keyCursor := e.KeyCursor(SeriesFieldKey(seriesKey, field), opt.SeekTime(), opt.Ascending)
	return newStringCursor(opt.SeekTime(), opt.Ascending, cacheValues, keyCursor)
}

// buildBooleanCursor creates a cursor for a boolean field.
func (e *Engine) buildBooleanCursor(measurement, seriesKey, field string, opt influxql.IteratorOptions) booleanCursor {
	cacheValues := e.Cache.Values(SeriesFieldKey(seriesKey, field))
	keyCursor := e.KeyCursor(SeriesFieldKey(seriesKey, field), opt.SeekTime(), opt.Ascending)
	return newBooleanCursor(opt.SeekTime(), opt.Ascending, cacheValues, keyCursor)
}

func (e *Engine) SeriesPointIterator(opt influxql.IteratorOptions) (influxql.Iterator, error) {
	return e.index.SeriesPointIterator(opt)
}

// SeriesFieldKey combines a series key and field name for a unique string to be hashed to a numeric ID.
func SeriesFieldKey(seriesKey, field string) string {
	return seriesKey + keyFieldSeparator + field
}

func tsmFieldTypeToInfluxQLDataType(typ byte) (influxql.DataType, error) {
	switch typ {
	case BlockFloat64:
		return influxql.Float, nil
	case BlockInteger:
		return influxql.Integer, nil
	case BlockBoolean:
		return influxql.Boolean, nil
	case BlockString:
		return influxql.String, nil
	default:
		return influxql.Unknown, fmt.Errorf("unknown block type: %v", typ)
	}
}

// SeriesAndFieldFromCompositeKey returns the series key and the field key extracted from the composite key.
func SeriesAndFieldFromCompositeKey(key []byte) ([]byte, []byte) {
	sep := bytes.Index(key, keyFieldSeparatorBytes)
	if sep == -1 {
		// No field???
		return key, nil
	}
	return key[:sep], key[sep+len(keyFieldSeparator):]
}

// readDir recursively reads all files from a path.
func readDir(root, rel string) ([]string, error) {
	// Open root.
	f, err := os.Open(filepath.Join(root, rel))
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Read all files.
	fis, err := f.Readdir(-1)
	if err != nil {
		return nil, err
	}

	// Read all subdirectories and append to the end.
	var paths []string
	for _, fi := range fis {
		// Simply append if it's a file.
		if !fi.IsDir() {
			paths = append(paths, filepath.Join(rel, fi.Name()))
			continue
		}

		// Read and append nested file paths.
		children, err := readDir(root, filepath.Join(rel, fi.Name()))
		if err != nil {
			return nil, err
		}
		paths = append(paths, children...)
	}
	return paths, nil
}
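// exampleCompositeKeyRoundTrip is a hedged sketch, not part of the original
// file: it demonstrates how a composite key is built with SeriesFieldKey and
// split back apart with SeriesAndFieldFromCompositeKey. The series key and
// field name are hypothetical.
func exampleCompositeKeyRoundTrip() {
	key := SeriesFieldKey("cpu,host=server01", "usage_idle") // "cpu,host=server01#!~#usage_idle"
	series, field := SeriesAndFieldFromCompositeKey([]byte(key))
	fmt.Printf("series=%s field=%s\n", series, field) // series=cpu,host=server01 field=usage_idle
}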