mirror of https://github.com/Oxalide/vsphere-influxdb-go.git synced 2023-10-10 11:36:51 +00:00

add vendoring with go dep

This commit is contained in:
Adrian Todorov
2017-10-25 20:52:40 +00:00
parent 704f4d20d1
commit a59409f16b
1627 changed files with 489673 additions and 0 deletions

vendor/github.com/influxdata/influxdb/tsdb/README.md generated vendored Normal file

@@ -0,0 +1,91 @@
# Line Protocol
The line protocol is a text based format for writing points to InfluxDB. Each line defines a single point.
Multiple lines must be separated by the newline character `\n`. The format of the line consists of three parts:
```
[key] [fields] [timestamp]
```
Each section is separated by spaces. The minimum required point consists of a measurement name and at least one field. Points without a specified timestamp will be written using the server's local timestamp. Timestamps are assumed to be in nanoseconds unless a `precision` value is passed in the query string.
## Key
The key is the measurement name and any optional tags separated by commas. Measurement names, tag keys, and tag values must escape any spaces or commas using a backslash (`\`). For example: `\ ` and `\,`. All tag values are stored as strings and should not be surrounded in quotes.
Tags should be sorted by key before being sent for best performance. The sort should match that from the Go `bytes.Compare` function (http://golang.org/pkg/bytes/#Compare).
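For example, a client might sort tags with `bytes.Compare` before serializing a point. A minimal Go sketch (escaping of spaces and commas is omitted for brevity; names are illustrative):
```
package main

import (
	"bytes"
	"fmt"
	"sort"
	"strings"
)

// buildKey assembles a line-protocol key with tags sorted to match
// the ordering of bytes.Compare.
func buildKey(measurement string, tags map[string]string) string {
	keys := make([]string, 0, len(tags))
	for k := range tags {
		keys = append(keys, k)
	}
	sort.Slice(keys, func(i, j int) bool {
		return bytes.Compare([]byte(keys[i]), []byte(keys[j])) < 0
	})

	var sb strings.Builder
	sb.WriteString(measurement)
	for _, k := range keys {
		sb.WriteString("," + k + "=" + tags[k])
	}
	return sb.String()
}

func main() {
	fmt.Println(buildKey("cpu", map[string]string{"region": "us-west", "host": "serverA"}))
	// Output: cpu,host=serverA,region=us-west
}
```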
### Examples
```
# measurement only
cpu
# measurement and tags
cpu,host=serverA,region=us-west
# measurement with commas
cpu\,01,host=serverA,region=us-west
# tag value with spaces
cpu,host=server\ A,region=us\ west
```
## Fields
Fields are key-value metrics associated with the measurement. Every line must have at least one field. Multiple fields must be separated with commas and not spaces.
Field keys are always strings and follow the same syntactical rules as described above for tag keys and values. Field values can be one of four types. The first value written for a given field on a given measurement defines the type of that field for all series under that measurement.
* _integer_ - Numeric values that do not include a decimal and are followed by a trailing i when inserted (e.g. 1i, 345i, 2015i, -10i). Note that all values must have a trailing i. If they do not, they will be written as floats.
* _float_ - Numeric values that are not followed by a trailing i. (e.g. 1, 1.0, -3.14, 6.0e+5, 10).
* _boolean_ - A value indicating true or false. Valid boolean strings are (t, T, true, TRUE, f, F, false, and FALSE).
* _string_ - A text value. All string values _must_ be surrounded in double-quotes `"`. If the string contains
a double-quote or backslashes, it must be escaped with a backslash, e.g. `\"`, `\\`.
```
# integer value
cpu value=1i
cpu value=1.1i # will result in a parse error
# float value
cpu_load value=1
cpu_load value=1.0
cpu_load value=1.2
# boolean value
error fatal=true
# string value
event msg="logged out"
# multiple values
cpu load=10,alert=true,reason="value above maximum threshold"
```
## Timestamp
The timestamp section is optional but should be specified if possible. The value is an integer representing nanoseconds since the epoch. If the timestamp is not provided the point will inherit the server's local timestamp.
Some write APIs allow passing a lower precision. If the API supports a lower precision, the timestamp may also be
an integer epoch in microseconds, milliseconds, seconds, minutes or hours.
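For illustration, using the `precision` query parameter described above, the same point can carry a lower-resolution timestamp:
```
# written with precision=s, the timestamp is interpreted as seconds since the epoch
cpu,host=server01,region=uswest value=1 1434055562
```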
## Full Example
A full example is shown below.
```
cpu,host=server01,region=uswest value=1 1434055562000000000
cpu,host=server02,region=uswest value=3 1434055562000010000
```
In this example, the first line shows a `measurement` of "cpu", two tags ("host" and "region"), a `value` of 1.0, and a `timestamp` of 1434055562000000000. Following this is a second line, also a point in the `measurement` "cpu" but belonging to a different "host".
```
cpu,host=server\ 01,region=uswest value=1,msg="all systems nominal"
cpu,host=server\ 01,region=us\,west value_int=1i
```
In these examples, the "host" is set to `server 01`. The field value associated with field key `msg` is double-quoted, as it is a string. The second example shows a region of `us,west` with the comma properly escaped. In the first example `value` is written as a floating point number. In the second, `value_int` is an integer.
# Distributed Queries

vendor/github.com/influxdata/influxdb/tsdb/batcher.go generated vendored Normal file

@@ -0,0 +1,149 @@
package tsdb
import (
"sync"
"sync/atomic"
"time"
"github.com/influxdata/influxdb/models"
)
// PointBatcher accepts Points and will emit a batch of those points when either
// a) the batch reaches a certain size, or b) a certain time passes.
type PointBatcher struct {
stats PointBatcherStats
size int
duration time.Duration
stop chan struct{}
in chan models.Point
out chan []models.Point
flush chan struct{}
wg *sync.WaitGroup
}
// NewPointBatcher returns a new PointBatcher. sz is the batching size,
// bp is the maximum number of batches that may be pending. d is the time
// after which a batch will be emitted after the first point is received
// for the batch, regardless of its size.
func NewPointBatcher(sz int, bp int, d time.Duration) *PointBatcher {
return &PointBatcher{
size: sz,
duration: d,
stop: make(chan struct{}),
in: make(chan models.Point, bp*sz),
out: make(chan []models.Point),
flush: make(chan struct{}),
}
}
// PointBatcherStats are the statistics each batcher tracks.
type PointBatcherStats struct {
BatchTotal uint64 // Total count of batches transmitted.
PointTotal uint64 // Total count of points processed.
SizeTotal uint64 // Number of batches that reached size threshold.
TimeoutTotal uint64 // Number of timeouts that occurred.
}
// Start starts the batching process. Points are submitted via In() and
// completed batches are read from Out().
func (b *PointBatcher) Start() {
// Already running?
if b.wg != nil {
return
}
var timer *time.Timer
var batch []models.Point
var timerCh <-chan time.Time
emit := func() {
b.out <- batch
atomic.AddUint64(&b.stats.BatchTotal, 1)
batch = nil
}
b.wg = &sync.WaitGroup{}
b.wg.Add(1)
go func() {
defer b.wg.Done()
for {
select {
case <-b.stop:
if len(batch) > 0 {
emit()
timerCh = nil
}
return
case p := <-b.in:
atomic.AddUint64(&b.stats.PointTotal, 1)
if batch == nil {
batch = make([]models.Point, 0, b.size)
if b.duration > 0 {
timer = time.NewTimer(b.duration)
timerCh = timer.C
}
}
batch = append(batch, p)
if len(batch) >= b.size { // 0 means send immediately.
atomic.AddUint64(&b.stats.SizeTotal, 1)
emit()
timerCh = nil
}
case <-b.flush:
if len(batch) > 0 {
emit()
timerCh = nil
}
case <-timerCh:
atomic.AddUint64(&b.stats.TimeoutTotal, 1)
emit()
}
}
}()
}
// Stop stops the batching process. Stop waits for the batching routine
// to stop before returning.
func (b *PointBatcher) Stop() {
// If not running, nothing to stop.
if b.wg == nil {
return
}
close(b.stop)
b.wg.Wait()
}
// In returns the channel to which points should be written.
func (b *PointBatcher) In() chan<- models.Point {
return b.in
}
// Out returns the channel from which batches should be read.
func (b *PointBatcher) Out() <-chan []models.Point {
return b.out
}
// Flush instructs the batcher to emit any pending points in a batch, regardless of batch size.
// If there are no pending points, no batch is emitted.
func (b *PointBatcher) Flush() {
b.flush <- struct{}{}
}
// Stats returns a PointBatcherStats object for the PointBatcher. While each
// statistic should be closely correlated with the others, exact consistency
// between them is not guaranteed.
func (b *PointBatcher) Stats() *PointBatcherStats {
stats := PointBatcherStats{}
stats.BatchTotal = atomic.LoadUint64(&b.stats.BatchTotal)
stats.PointTotal = atomic.LoadUint64(&b.stats.PointTotal)
stats.SizeTotal = atomic.LoadUint64(&b.stats.SizeTotal)
stats.TimeoutTotal = atomic.LoadUint64(&b.stats.TimeoutTotal)
return &stats
}
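A minimal usage sketch of the batcher API above (the size, backlog, and duration values are illustrative, and a nil `models.Point` stands in for real points, as in the package tests):
```
package main

import (
	"fmt"
	"time"

	"github.com/influxdata/influxdb/models"
	"github.com/influxdata/influxdb/tsdb"
)

func main() {
	// Emit a batch after 3 points, or after 50ms, whichever comes first.
	b := tsdb.NewPointBatcher(3, 1, 50*time.Millisecond)
	b.Start()
	defer b.Stop()

	go func() {
		var p models.Point // nil placeholder point
		for i := 0; i < 3; i++ {
			b.In() <- p
		}
	}()

	batch := <-b.Out() // blocks until the size threshold is reached
	fmt.Println("batch size:", len(batch))
}
```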


@@ -0,0 +1,146 @@
package tsdb_test
import (
"testing"
"time"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/tsdb"
)
// TestBatch_Size ensures that a batcher generates a batch when the size threshold is reached.
func TestBatch_Size(t *testing.T) {
batchSize := 5
batcher := tsdb.NewPointBatcher(batchSize, 0, time.Hour)
if batcher == nil {
t.Fatal("failed to create batcher for size test")
}
batcher.Start()
var p models.Point
go func() {
for i := 0; i < batchSize; i++ {
batcher.In() <- p
}
}()
batch := <-batcher.Out()
if len(batch) != batchSize {
t.Errorf("received batch has incorrect length exp %d, got %d", batchSize, len(batch))
}
checkPointBatcherStats(t, batcher, -1, batchSize, 1, 0)
}
// TestBatch_SizeBuffered ensures that a buffered batcher generates a batch when the size threshold is reached.
func TestBatch_SizeBuffered(t *testing.T) {
batchSize := 5
batcher := tsdb.NewPointBatcher(batchSize, 5, time.Hour)
if batcher == nil {
t.Fatal("failed to create batcher for size test")
}
batcher.Start()
var p models.Point
go func() {
for i := 0; i < batchSize; i++ {
batcher.In() <- p
}
}()
batch := <-batcher.Out()
if len(batch) != batchSize {
t.Errorf("received batch has incorrect length exp %d, got %d", batchSize, len(batch))
}
checkPointBatcherStats(t, batcher, -1, batchSize, 1, 0)
}
// TestBatch_Timeout ensures that a batcher generates a batch when the timeout triggers.
func TestBatch_Timeout(t *testing.T) {
batchSize := 5
batcher := tsdb.NewPointBatcher(batchSize+1, 0, 100*time.Millisecond)
if batcher == nil {
t.Fatal("failed to create batcher for timeout test")
}
batcher.Start()
var p models.Point
go func() {
for i := 0; i < batchSize; i++ {
batcher.In() <- p
}
}()
batch := <-batcher.Out()
if len(batch) != batchSize {
t.Errorf("received batch has incorrect length exp %d, got %d", batchSize, len(batch))
}
checkPointBatcherStats(t, batcher, -1, batchSize, 0, 1)
}
// TestBatch_Flush ensures that a batcher generates a batch when flushed
func TestBatch_Flush(t *testing.T) {
batchSize := 2
batcher := tsdb.NewPointBatcher(batchSize, 0, time.Hour)
if batcher == nil {
t.Fatal("failed to create batcher for flush test")
}
batcher.Start()
var p models.Point
go func() {
batcher.In() <- p
batcher.Flush()
}()
batch := <-batcher.Out()
if len(batch) != 1 {
t.Errorf("received batch has incorrect length exp %d, got %d", 1, len(batch))
}
checkPointBatcherStats(t, batcher, -1, 1, 0, 0)
}
// TestBatch_MultipleBatches ensures that a batcher correctly processes multiple batches.
func TestBatch_MultipleBatches(t *testing.T) {
batchSize := 2
batcher := tsdb.NewPointBatcher(batchSize, 0, 100*time.Millisecond)
if batcher == nil {
t.Fatal("failed to create batcher for size test")
}
batcher.Start()
var p models.Point
var b []models.Point
batcher.In() <- p
batcher.In() <- p
b = <-batcher.Out() // Batch threshold reached.
if len(b) != batchSize {
t.Errorf("received batch (size) has incorrect length exp %d, got %d", batchSize, len(b))
}
batcher.In() <- p
b = <-batcher.Out() // Timeout triggered.
if len(b) != 1 {
t.Errorf("received batch (timeout) has incorrect length exp %d, got %d", 1, len(b))
}
checkPointBatcherStats(t, batcher, -1, 3, 1, 1)
}
func checkPointBatcherStats(t *testing.T, b *tsdb.PointBatcher, batchTotal, pointTotal, sizeTotal, timeoutTotal int) {
stats := b.Stats()
if batchTotal != -1 && stats.BatchTotal != uint64(batchTotal) {
t.Errorf("batch total stat is incorrect: %d", stats.BatchTotal)
}
if pointTotal != -1 && stats.PointTotal != uint64(pointTotal) {
t.Errorf("point total stat is incorrect: %d", stats.PointTotal)
}
if sizeTotal != -1 && stats.SizeTotal != uint64(sizeTotal) {
t.Errorf("size total stat is incorrect: %d", stats.SizeTotal)
}
if timeoutTotal != -1 && stats.TimeoutTotal != uint64(timeoutTotal) {
t.Errorf("timeout total stat is incorrect: %d", stats.TimeoutTotal)
}
}

vendor/github.com/influxdata/influxdb/tsdb/config.go generated vendored Normal file

@@ -0,0 +1,172 @@
package tsdb
import (
"errors"
"fmt"
"time"
"github.com/influxdata/influxdb/monitor/diagnostics"
"github.com/influxdata/influxdb/toml"
)
const (
// DefaultEngine is the default engine for new shards
DefaultEngine = "tsm1"
// DefaultIndex is the default index for new shards
DefaultIndex = "inmem"
// tsdb/engine/wal configuration options
// Default settings for TSM
// DefaultCacheMaxMemorySize is the maximum size a shard's cache can
// reach before it starts rejecting writes.
DefaultCacheMaxMemorySize = 1024 * 1024 * 1024 // 1GB
// DefaultCacheSnapshotMemorySize is the size at which the engine will
// snapshot the cache and write it to a TSM file, freeing up memory
DefaultCacheSnapshotMemorySize = 25 * 1024 * 1024 // 25MB
// DefaultCacheSnapshotWriteColdDuration is the length of time at which
// the engine will snapshot the cache and write it to a new TSM file if
// the shard hasn't received writes or deletes
DefaultCacheSnapshotWriteColdDuration = time.Duration(10 * time.Minute)
// DefaultCompactFullWriteColdDuration is the duration at which the engine
// will compact all TSM files in a shard if it hasn't received a write or delete
DefaultCompactFullWriteColdDuration = time.Duration(4 * time.Hour)
// DefaultMaxPointsPerBlock is the maximum number of points in an encoded
// block in a TSM file
DefaultMaxPointsPerBlock = 1000
// DefaultMaxSeriesPerDatabase is the maximum number of series a node can hold per database.
// This limit only applies to the "inmem" index.
DefaultMaxSeriesPerDatabase = 1000000
// DefaultMaxValuesPerTag is the maximum number of values a tag can have within a measurement.
DefaultMaxValuesPerTag = 100000
// DefaultMaxConcurrentCompactions is the maximum number of concurrent full and level compactions
// that can run at one time. A value of 0 results in runtime.GOMAXPROCS(0) being used at runtime.
DefaultMaxConcurrentCompactions = 0
)
// Config holds the configuration for the tsdb package.
type Config struct {
Dir string `toml:"dir"`
Engine string `toml:"-"`
Index string `toml:"index-version"`
// General WAL configuration options
WALDir string `toml:"wal-dir"`
// WALFsyncDelay is the amount of time that a write will wait before fsyncing. A duration
// greater than 0 can be used to batch up multiple fsync calls. This is useful for slower
// disks or when WAL write contention is seen. A value of 0 fsyncs every write to the WAL.
WALFsyncDelay toml.Duration `toml:"wal-fsync-delay"`
// Query logging
QueryLogEnabled bool `toml:"query-log-enabled"`
// Compaction options for tsm1 (descriptions above with defaults)
CacheMaxMemorySize uint64 `toml:"cache-max-memory-size"`
CacheSnapshotMemorySize uint64 `toml:"cache-snapshot-memory-size"`
CacheSnapshotWriteColdDuration toml.Duration `toml:"cache-snapshot-write-cold-duration"`
CompactFullWriteColdDuration toml.Duration `toml:"compact-full-write-cold-duration"`
// Limits
// MaxSeriesPerDatabase is the maximum number of series a node can hold per database.
// When this limit is exceeded, writes return a 'max series per database exceeded' error.
// A value of 0 disables the limit. This limit only applies when using the "inmem" index.
MaxSeriesPerDatabase int `toml:"max-series-per-database"`
// MaxValuesPerTag is the maximum number of tag values a single tag key can have within
// a measurement. When the limit is exceeded, writes return an error.
// A value of 0 disables the limit.
MaxValuesPerTag int `toml:"max-values-per-tag"`
// MaxConcurrentCompactions is the maximum number of concurrent level and full compactions
// that can be running at one time across all shards. Compactions scheduled to run when the
// limit is reached are blocked until a running compaction completes. Snapshot compactions are
// not affected by this limit. A value of 0 limits compactions to runtime.GOMAXPROCS(0).
MaxConcurrentCompactions int `toml:"max-concurrent-compactions"`
TraceLoggingEnabled bool `toml:"trace-logging-enabled"`
}
// NewConfig returns the default configuration for tsdb.
func NewConfig() Config {
return Config{
Engine: DefaultEngine,
Index: DefaultIndex,
QueryLogEnabled: true,
CacheMaxMemorySize: DefaultCacheMaxMemorySize,
CacheSnapshotMemorySize: DefaultCacheSnapshotMemorySize,
CacheSnapshotWriteColdDuration: toml.Duration(DefaultCacheSnapshotWriteColdDuration),
CompactFullWriteColdDuration: toml.Duration(DefaultCompactFullWriteColdDuration),
MaxSeriesPerDatabase: DefaultMaxSeriesPerDatabase,
MaxValuesPerTag: DefaultMaxValuesPerTag,
MaxConcurrentCompactions: DefaultMaxConcurrentCompactions,
TraceLoggingEnabled: false,
}
}
// Validate validates the configuration held by c.
func (c *Config) Validate() error {
if c.Dir == "" {
return errors.New("Data.Dir must be specified")
} else if c.WALDir == "" {
return errors.New("Data.WALDir must be specified")
}
if c.MaxConcurrentCompactions < 0 {
return errors.New("max-concurrent-compactions must be greater than 0")
}
valid := false
for _, e := range RegisteredEngines() {
if e == c.Engine {
valid = true
break
}
}
if !valid {
return fmt.Errorf("unrecognized engine %s", c.Engine)
}
valid = false
for _, e := range RegisteredIndexes() {
if e == c.Index {
valid = true
break
}
}
if !valid {
return fmt.Errorf("unrecognized index %s", c.Index)
}
return nil
}
// Diagnostics returns a diagnostics representation of a subset of the Config.
func (c Config) Diagnostics() (*diagnostics.Diagnostics, error) {
return diagnostics.RowFromMap(map[string]interface{}{
"dir": c.Dir,
"wal-dir": c.WALDir,
"wal-fsync-delay": c.WALFsyncDelay,
"cache-max-memory-size": c.CacheMaxMemorySize,
"cache-snapshot-memory-size": c.CacheSnapshotMemorySize,
"cache-snapshot-write-cold-duration": c.CacheSnapshotWriteColdDuration,
"compact-full-write-cold-duration": c.CompactFullWriteColdDuration,
"max-series-per-database": c.MaxSeriesPerDatabase,
"max-values-per-tag": c.MaxValuesPerTag,
"max-concurrent-compactions": c.MaxConcurrentCompactions,
}), nil
}
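A short sketch of constructing and validating this configuration (the side-effect imports that register the engine and index implementations are an assumption about the surrounding tree):
```
package main

import (
	"fmt"

	"github.com/influxdata/influxdb/tsdb"
	_ "github.com/influxdata/influxdb/tsdb/engine" // assumed: registers tsm1
	_ "github.com/influxdata/influxdb/tsdb/index"  // assumed: registers inmem/tsi1
)

func main() {
	c := tsdb.NewConfig()
	// Dir and WALDir have no defaults and must be set before Validate.
	c.Dir = "/var/lib/influxdb/data"
	c.WALDir = "/var/lib/influxdb/wal"
	if err := c.Validate(); err != nil {
		fmt.Println("invalid config:", err)
		return
	}
	fmt.Println("engine:", c.Engine, "index:", c.Index)
}
```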


@@ -0,0 +1,70 @@
package tsdb_test
import (
"testing"
"time"
"github.com/BurntSushi/toml"
"github.com/influxdata/influxdb/tsdb"
)
func TestConfig_Parse(t *testing.T) {
// Parse configuration.
c := tsdb.NewConfig()
if _, err := toml.Decode(`
dir = "/var/lib/influxdb/data"
wal-dir = "/var/lib/influxdb/wal"
wal-fsync-delay = "10s"
`, &c); err != nil {
t.Fatal(err)
}
if err := c.Validate(); err != nil {
t.Errorf("unexpected validate error: %s", err)
}
if got, exp := c.Dir, "/var/lib/influxdb/data"; got != exp {
t.Errorf("unexpected dir:\n\nexp=%v\n\ngot=%v\n\n", exp, got)
}
if got, exp := c.WALDir, "/var/lib/influxdb/wal"; got != exp {
t.Errorf("unexpected wal-dir:\n\nexp=%v\n\ngot=%v\n\n", exp, got)
}
if got, exp := c.WALFsyncDelay, time.Duration(10*time.Second); time.Duration(got).Nanoseconds() != exp.Nanoseconds() {
t.Errorf("unexpected wal-fsync-delay:\n\nexp=%v\n\ngot=%v\n\n", exp, got)
}
}
func TestConfig_Validate_Error(t *testing.T) {
c := tsdb.NewConfig()
if err := c.Validate(); err == nil || err.Error() != "Data.Dir must be specified" {
t.Errorf("unexpected error: %s", err)
}
c.Dir = "/var/lib/influxdb/data"
if err := c.Validate(); err == nil || err.Error() != "Data.WALDir must be specified" {
t.Errorf("unexpected error: %s", err)
}
c.WALDir = "/var/lib/influxdb/wal"
c.Engine = "fake1"
if err := c.Validate(); err == nil || err.Error() != "unrecognized engine fake1" {
t.Errorf("unexpected error: %s", err)
}
c.Engine = "tsm1"
c.Index = "foo"
if err := c.Validate(); err == nil || err.Error() != "unrecognized index foo" {
t.Errorf("unexpected error: %s", err)
}
c.Index = "inmem"
if err := c.Validate(); err != nil {
t.Error(err)
}
c.Index = "tsi1"
if err := c.Validate(); err != nil {
t.Error(err)
}
}

vendor/github.com/influxdata/influxdb/tsdb/cursor.go generated vendored Normal file

@@ -0,0 +1,13 @@
package tsdb
import "github.com/influxdata/influxdb/influxql"
// EOF represents a "not found" key returned by a Cursor.
const EOF = influxql.ZeroTime
// Cursor represents an iterator over a series.
type Cursor interface {
SeekTo(seek int64) (key int64, value interface{})
Next() (key int64, value interface{})
Ascending() bool
}

vendor/github.com/influxdata/influxdb/tsdb/doc.go generated vendored Normal file

@@ -0,0 +1,5 @@
/*
Package tsdb implements a durable time series database.
*/
package tsdb

vendor/github.com/influxdata/influxdb/tsdb/engine.go generated vendored Normal file

@@ -0,0 +1,164 @@
package tsdb
import (
"errors"
"fmt"
"io"
"os"
"regexp"
"sort"
"time"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/pkg/estimator"
"github.com/influxdata/influxdb/pkg/limiter"
"github.com/uber-go/zap"
)
var (
// ErrFormatNotFound is returned when no format can be determined from a path.
ErrFormatNotFound = errors.New("format not found")
// ErrUnknownEngineFormat is returned when the engine format is
// unknown. ErrUnknownEngineFormat is currently returned if a format
// other than tsm1 is encountered.
ErrUnknownEngineFormat = errors.New("unknown engine format")
)
// Engine represents a swappable storage engine for the shard.
type Engine interface {
Open() error
Close() error
SetEnabled(enabled bool)
SetCompactionsEnabled(enabled bool)
WithLogger(zap.Logger)
LoadMetadataIndex(shardID uint64, index Index) error
CreateSnapshot() (string, error)
Backup(w io.Writer, basePath string, since time.Time) error
Restore(r io.Reader, basePath string) error
Import(r io.Reader, basePath string) error
CreateIterator(measurement string, opt influxql.IteratorOptions) (influxql.Iterator, error)
WritePoints(points []models.Point) error
CreateSeriesIfNotExists(key, name []byte, tags models.Tags) error
CreateSeriesListIfNotExists(keys, names [][]byte, tags []models.Tags) error
DeleteSeriesRange(keys [][]byte, min, max int64) error
SeriesSketches() (estimator.Sketch, estimator.Sketch, error)
MeasurementsSketches() (estimator.Sketch, estimator.Sketch, error)
SeriesN() int64
MeasurementExists(name []byte) (bool, error)
MeasurementNamesByExpr(expr influxql.Expr) ([][]byte, error)
MeasurementNamesByRegex(re *regexp.Regexp) ([][]byte, error)
MeasurementFields(measurement []byte) *MeasurementFields
ForEachMeasurementName(fn func(name []byte) error) error
DeleteMeasurement(name []byte) error
// TagKeys(name []byte) ([][]byte, error)
HasTagKey(name, key []byte) (bool, error)
MeasurementTagKeysByExpr(name []byte, expr influxql.Expr) (map[string]struct{}, error)
MeasurementTagKeyValuesByExpr(name []byte, key []string, expr influxql.Expr, keysSorted bool) ([][]string, error)
ForEachMeasurementTagKey(name []byte, fn func(key []byte) error) error
TagKeyCardinality(name, key []byte) int
// InfluxQL iterators
MeasurementSeriesKeysByExpr(name []byte, condition influxql.Expr) ([][]byte, error)
ForEachMeasurementSeriesByExpr(name []byte, expr influxql.Expr, fn func(tags models.Tags) error) error
SeriesPointIterator(opt influxql.IteratorOptions) (influxql.Iterator, error)
// Statistics will return statistics relevant to this engine.
Statistics(tags map[string]string) []models.Statistic
LastModified() time.Time
DiskSize() int64
IsIdle() bool
io.WriterTo
}
// EngineFormat represents the format for an engine.
type EngineFormat int
const (
// TSM1Format is the format used by the tsm1 engine.
TSM1Format EngineFormat = 2
)
// NewEngineFunc creates a new engine.
type NewEngineFunc func(id uint64, i Index, database, path string, walPath string, options EngineOptions) Engine
// newEngineFuncs is a lookup of engine constructors by name.
var newEngineFuncs = make(map[string]NewEngineFunc)
// RegisterEngine registers a storage engine initializer by name.
func RegisterEngine(name string, fn NewEngineFunc) {
if _, ok := newEngineFuncs[name]; ok {
panic("engine already registered: " + name)
}
newEngineFuncs[name] = fn
}
// RegisteredEngines returns the slice of currently registered engines.
func RegisteredEngines() []string {
a := make([]string, 0, len(newEngineFuncs))
for k := range newEngineFuncs {
a = append(a, k)
}
sort.Strings(a)
return a
}
// NewEngine returns an instance of an engine based on its format.
// If the path does not exist then the DefaultFormat is used.
func NewEngine(id uint64, i Index, database, path string, walPath string, options EngineOptions) (Engine, error) {
// Create a new engine
if _, err := os.Stat(path); os.IsNotExist(err) {
return newEngineFuncs[options.EngineVersion](id, i, database, path, walPath, options), nil
}
// If it's a dir then it's a tsm1 engine
format := DefaultEngine
if fi, err := os.Stat(path); err != nil {
return nil, err
} else if !fi.Mode().IsDir() {
return nil, ErrUnknownEngineFormat
} else {
format = "tsm1"
}
// Lookup engine by format.
fn := newEngineFuncs[format]
if fn == nil {
return nil, fmt.Errorf("invalid engine format: %q", format)
}
return fn(id, i, database, path, walPath, options), nil
}
// EngineOptions represents the options used to initialize the engine.
type EngineOptions struct {
EngineVersion string
IndexVersion string
ShardID uint64
InmemIndex interface{} // shared in-memory index
CompactionLimiter limiter.Fixed
Config Config
}
// NewEngineOptions returns the default options.
func NewEngineOptions() EngineOptions {
return EngineOptions{
EngineVersion: DefaultEngine,
IndexVersion: DefaultIndex,
Config: NewConfig(),
}
}
// NewInmemIndex returns a new "inmem" index type.
var NewInmemIndex func(name string) (interface{}, error)
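Engines make themselves available by calling RegisterEngine from an init() in their own package, which is why a blank import is enough to wire them up (see the `engine` package below). A small sketch listing whatever engines have been registered:
```
package main

import (
	"fmt"

	"github.com/influxdata/influxdb/tsdb"
	_ "github.com/influxdata/influxdb/tsdb/engine" // side-effect import registers tsm1
)

func main() {
	fmt.Println(tsdb.RegisteredEngines()) // e.g. [tsm1]
}
```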


@@ -0,0 +1,9 @@
// Package engine can be imported to initialize and register all available TSDB engines.
//
// Alternatively, you can import any individual subpackage underneath engine.
package engine // import "github.com/influxdata/influxdb/tsdb/engine"
import (
// Initialize and register tsm1 engine
_ "github.com/influxdata/influxdb/tsdb/engine/tsm1"
)


@@ -0,0 +1,451 @@
# File Structure
A TSM file is composed of four sections: header, blocks, index, and footer.
```
┌────────┬────────────────────────────────────┬─────────────┬──────────────┐
│ Header │ Blocks │ Index │ Footer │
│5 bytes │              N bytes               │   N bytes   │   8 bytes    │
└────────┴────────────────────────────────────┴─────────────┴──────────────┘
```
Header is composed of a magic number to identify the file type and a version number.
```
┌───────────────────┐
│ Header │
├─────────┬─────────┤
│ Magic │ Version │
│ 4 bytes │ 1 byte │
└─────────┴─────────┘
```
Blocks are sequences of block CRC32 and data. The block data is opaque to the file. The CRC32 is used for recovery to ensure blocks have not been corrupted due to bugs outside of our control. The length of the blocks is stored in the index.
```
┌───────────────────────────────────────────────────────────┐
│ Blocks │
├───────────────────┬───────────────────┬───────────────────┤
│ Block 1 │ Block 2 │ Block N │
├─────────┬─────────┼─────────┬─────────┼─────────┬─────────┤
│ CRC │ Data │ CRC │ Data │ CRC │ Data │
│ 4 bytes │ N bytes │ 4 bytes │ N bytes │ 4 bytes │ N bytes │
└─────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
```
Following the blocks is the index for the blocks in the file. The index is composed of a sequence of index entries ordered lexicographically by key and then by time. Each index entry starts with a key length and key followed by a count of the number of blocks in the file. Each block entry is composed of the min and max time for the block, the offset into the file where the block is located and the size of the block.
The index structure can provide efficient access to all blocks as well as the ability to determine the cost associated with accessing a given key. Given a key and timestamp, we know exactly which file contains the block for that timestamp as well as where that block resides and how much data to read to retrieve the block. If we know we need to read all or multiple blocks in a file, we can use the size to determine how much to read in a given IO.
_TBD: The block length stored in the block data could probably be dropped since we store it in the index._
```
┌────────────────────────────────────────────────────────────────────────────┐
│ Index │
├─────────┬─────────┬──────┬───────┬─────────┬─────────┬────────┬────────┬───┤
│ Key Len │ Key │ Type │ Count │Min Time │Max Time │ Offset │ Size │...│
│ 2 bytes │ N bytes │1 byte│2 bytes│ 8 bytes │ 8 bytes │8 bytes │4 bytes │ │
└─────────┴─────────┴──────┴───────┴─────────┴─────────┴────────┴────────┴───┘
```
The last section is the footer that stores the offset of the start of the index.
```
┌─────────┐
│ Footer │
├─────────┤
│Index Ofs│
│ 8 bytes │
└─────────┘
```
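As a concrete illustration, a reader locates the index by reading the footer at the end of the file. A hedged Go sketch, assuming the offset is stored big-endian:
```
package main

import (
	"encoding/binary"
	"fmt"
	"os"
)

// indexOffset reads the 8-byte footer of a TSM file, which stores the
// offset of the start of the index (big-endian byte order is an assumption).
func indexOffset(path string) (uint64, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		return 0, err
	}
	buf := make([]byte, 8)
	if _, err := f.ReadAt(buf, fi.Size()-8); err != nil {
		return 0, err
	}
	return binary.BigEndian.Uint64(buf), nil
}

func main() {
	off, err := indexOffset("000001.tsm") // hypothetical file name
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("index starts at byte", off)
}
```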
# File System Layout
The file system is organized a directory per shard where each shard is an integer number. Associated with each shard directory, there is a set of other directories and files:
* a wal directory - contains a set of numerically increasing WAL segment files named #####.wal. The wal directory is separate from the directory containing the TSM files so that different types can be used if necessary.
* .tsm files - a set of numerically increasing TSM files containing compressed series data.
* .tombstone files - files named after the corresponding TSM file as #####.tombstone. These contain measurement and series keys that have been deleted. These files are removed during compactions.
# Data Flow
Writes are appended to the current WAL segment and are also added to the Cache. Each WAL segment is size bounded and rolls-over to a new file after it fills up. The cache is also size bounded; snapshots are taken and WAL compactions are initiated when the cache becomes too full. If the inbound write rate exceeds the WAL compaction rate for a sustained period, the cache may become too full in which case new writes will fail until the compaction process catches up. The WAL and Cache are separate entities and do not interact with each other. The Engine coordinates the writes to both.
When WAL segments fill up and have been closed, the Compactor reads the WAL entries and combines them with one or more existing TSM files. This process runs continuously until all WAL files are compacted and there is a minimum number of TSM files. As each TSM file is completed, it is loaded and referenced by the FileStore.
Queries are executed by constructing Cursors for keys. The Cursors iterate over slices of Values. When the current Values are exhausted, a Cursor requests the next set of Values from the Engine. The Engine returns a slice of Values by querying the FileStore and Cache. The Values in the Cache are overlaid on top of the values returned from the FileStore. The FileStore reads and decodes blocks of Values according to the index for the file.
Updates (writing a newer value for a point that already exists) occur as normal writes. Since cached values overwrite existing values, newer writes take precedence.
Deletes occur by writing a delete entry for the measurement or series to the WAL and then updating the Cache and FileStore. The Cache evicts all relevant entries. The FileStore writes a tombstone file for each TSM file that contains relevant data. These tombstone files are used at startup time to ignore blocks as well as during compactions to remove deleted entries.
# Compactions
Compactions are a serial and continuously running process that iteratively optimizes the storage for queries. Specifically, it does the following:
* Converts closed WAL files into TSM files and removes the closed WAL files
* Combines smaller TSM files into larger ones to improve compression ratios
* Rewrites existing files that contain series data that has been deleted
* Rewrites existing files that contain writes with more recent data to ensure a point exists in only one TSM file.
The compaction algorithm is continuously running and always selects files to compact based on a priority.
1. If there are closed WAL files, the 5 oldest WAL segments are added to the set of compaction files.
2. If any TSM files contain points with older timestamps that also exist in the WAL files, those TSM files are added to the compaction set.
3. If any TSM files have a tombstone marker, those TSM files are added to the compaction set.
The compaction algorithm generates a set of SeriesIterators that return a sequence of `key`, `Values` where each `key` returned is lexicographically greater than the previous one. The iterators are ordered such that WAL iterators will override any values returned by the TSM file iterators. WAL iterators read and cache the WAL segment so that deletes later in the log can be processed correctly. TSM file iterators use the tombstone files to ensure that deleted series are not returned during iteration. As each key is processed, the Values slice is grown, sorted, and then written to a new block in the new TSM file. The blocks can be split based on number of points or size of the block. If the total size of the current TSM file would exceed the maximum file size, a new file is created.
Deletions can occur while a new file is being written. Since the new TSM file is not yet complete, a tombstone would not be written for it. This could result in deleted values being written into a new file. To prevent this, if a compaction is running and a delete occurs, the current compaction is aborted and a new compaction is started.
When all WAL files in the current compaction have been processed and the new TSM files have been successfully written, the new TSM files are renamed to their final names, the WAL segments are truncated and the associated snapshots are released from the cache.
The compaction process then runs again until there are no more WAL files and the minimum number of TSM files exist that are also under the maximum file size.
# WAL
Currently, there is a WAL per shard. This means all the writes in a WAL segment are for the given shard. It also means that writes across a lot of shards append to many files which might result in more disk IO due to seeking to the end of multiple files.
Two options are being considered:
## WAL per Shard
This is the current behavior of the WAL. This option is conceptually easier to reason about. For example, compactions that read in multiple WAL segments are assured that all the WAL entries pertain to the current shard. If it completes a compaction, it is safe to remove the WAL segment. It is also easier to deal with shard deletions as all the WAL segments can be dropped along with the other shard files.
The drawback of this option is the potential for turning sequential write IO into random IO in the presence of multiple shards and writes to many different shards.
## Single WAL
Using a single WAL adds some complexity to compactions and deletions. Compactions would need either to sort all the WAL entries in a segment by shard first and then run compactions on each shard, or the compactor would need to compact multiple shards concurrently while ensuring that points in existing TSM files in different shards remain separate.
Deletions would not be able to reclaim WAL segments immediately as in the case where there is a WAL per shard. Similarly, a compaction of a WAL segment that contains writes for a deleted shard would need to be dropped.
Currently, we are moving towards a Single WAL implementation.
# Cache
The purpose of the cache is so that data in the WAL is queryable. Every time a point is written to a WAL segment, it is also written to an in-memory cache. The cache is split into two parts: a "hot" part, representing the most recent writes, and a "cold" part containing snapshots for which an active WAL compaction process is underway.
Queries are satisfied with values read from the cache and finalized TSM files. Points in the cache always take precedence over points in TSM files with the same timestamp. Queries are never read directly from WAL segment files which are designed to optimize write rather than read performance.
The cache tracks its size on a "point-calculated" basis. "point-calculated" means that the RAM storage footprint for a point is determined by calling its `Size()` method. While this does not correspond directly to the actual RAM footprint in the cache, the two values are sufficiently well correlated for the purpose of controlling RAM usage.
If the cache becomes too full, or the cache has been idle for too long, a snapshot of the cache is taken and a compaction process is initiated for the related WAL segments. When the compaction of these segments is complete, the related snapshots are released from the cache.
In cases where IO performance of the compaction process falls behind the incoming write rate, it is possible that writes might arrive at the cache while the cache is both too full and the compaction of the previous snapshot is still in progress. In this case, the cache will reject the write, causing the write to fail.
Well-behaved clients should interpret write failures as back pressure and should either discard the write or back off and retry the write after a delay.
# TSM File Index
Each TSM file contains a full index of the blocks contained within the file. The existing index structure is designed to allow for a binary search across the index to find the starting block for a key. We would then seek to that start key and sequentially scan each block to find the location of a timestamp.
One issue with the existing structure is that seeking to a given timestamp for a key has an unknown cost. This can cause variability in read performance that would be very difficult to fix. Another issue is that startup time for loading a TSM file would grow in proportion to the number and size of TSM files on disk, since we would need to scan the entire file to find all keys contained in the file. This could be addressed by using a separate index-like file or by changing the index structure.
We've chosen to update the block index structure to ensure a TSM file is fully self-contained, supports consistent IO characteristics for sequential and random accesses as well as provides an efficient load time regardless of file size. The implications of these changes are that the index is slightly larger and we need to be able to search the index despite each entry being variably sized.
The following are some alternative design options to handle the cases where the index is too large to fit in memory. We are currently planning to use an indirect MMAP indexing approach for loaded TSM files.
### Indirect MMAP Indexing
One option is to MMAP the index into memory and record the pointers to the start of each index entry in a slice. When searching for a given key, the pointers are used to perform a binary search on the underlying mmap data. When the matching key is found, the block entries can be loaded and searched, or a subsequent binary search on the blocks can be performed.
A variation of this can also be done without MMAPs by seeking and reading in the file. The underlying file cache will still be utilized in this approach as well.
As an example, if we have an index structure in memory such as:
```
┌────────────────────────────────────────────────────────────────────┐
│ Index │
├─┬──────────────────────┬──┬───────────────────────┬───┬────────────┘
│0│ │62│ │145│
├─┴───────┬─────────┬────┼──┴──────┬─────────┬──────┼───┴─────┬──────┐
│Key 1 Len│ Key │... │Key 2 Len│ Key 2 │ ... │ Key 3 │ ... │
│ 2 bytes │ N bytes │ │ 2 bytes │ N bytes │ │ 2 bytes │ │
└─────────┴─────────┴────┴─────────┴─────────┴──────┴─────────┴──────┘
```
We would build an `offsets` slice where each element points to the byte location of each key in the index slice.
```
┌────────────────────────────────────────────────────────────────────┐
│ Offsets │
├────┬────┬────┬─────────────────────────────────────────────────────┘
│ 0 │ 62 │145 │
└────┴────┴────┘
```
Using this offset slice we can find `Key 2` by doing a binary search over the offsets slice. Instead of comparing the value in the offsets (e.g. `62`), we use that as an index into the underlying index to retrieve the key at position `62` and perform our comparisons with that.
When we have identified the correct position in the index for a given key, we could perform another binary search or a linear scan. This should be fast as well since each index entry is 28 bytes and all contiguous in memory.
The size of the offsets slice would be proportional to the number of unique series. If we limit file sizes to 4GB, we would use 4 bytes for each pointer.
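A sketch of that binary search over the `offsets` slice, using a toy in-memory index (the 2-byte big-endian key-length prefix mirrors the diagrams above; names are illustrative):
```
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"sort"
)

// searchIndex dereferences each candidate offset to read the actual key,
// comparing keys rather than offsets, as described above. It returns the
// position of target in offsets, or -1 if not found.
func searchIndex(index []byte, offsets []uint32, target []byte) int {
	keyAt := func(off uint32) []byte {
		n := binary.BigEndian.Uint16(index[off:]) // 2-byte key length
		return index[off+2 : off+2+uint32(n)]
	}
	i := sort.Search(len(offsets), func(i int) bool {
		return bytes.Compare(keyAt(offsets[i]), target) >= 0
	})
	if i < len(offsets) && bytes.Equal(keyAt(offsets[i]), target) {
		return i
	}
	return -1
}

func main() {
	// Build a toy index containing the keys "cpu" and "mem".
	var index []byte
	var offsets []uint32
	for _, k := range []string{"cpu", "mem"} {
		offsets = append(offsets, uint32(len(index)))
		index = append(index, byte(len(k)>>8), byte(len(k)))
		index = append(index, k...)
	}
	fmt.Println(searchIndex(index, offsets, []byte("mem"))) // 1
}
```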
### LRU/Lazy Load
A second option could be to have the index work as a memory-bounded, lazy-load style cache. When a cache miss occurs, the index structure is scanned to find the key, and the entries are loaded and added to the cache, which causes the least-recently used entries to be evicted.
### Key Compression
Another option is to compress keys using a key-specific dictionary encoding. For example,
```
cpu,host=server1 value=1
cpu,host=server2 value=2
memory,host=server1 value=3
```
could be compressed by expanding the key into its respective parts: measurement, tag keys, tag values, and fields. For each part a unique number is assigned, e.g.
Measurements
```
cpu = 1
memory = 2
```
Tag Keys
```
host = 1
```
Tag Values
```
server1 = 1
server2 = 2
```
Fields
```
value = 1
```
Using this encoding dictionary, the string keys could be converted to a sequence of integers:
```
cpu,host=server1 value=1 --> 1,1,1,1
cpu,host=server2 value=2 --> 1,1,2,1
memory,host=server1 value=3 --> 2,1,1,1
```
These sequences of small integers can then be compressed further using a bit-packed format such as Simple9 or Simple8b. The resulting byte slices would be a multiple of 4 or 8 bytes (using Simple9 or Simple8b respectively), which could then be used as the key in place of the string.
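A toy Go sketch of the dictionary-assignment step (IDs are allocated in first-seen order; this illustrates the idea, not the actual TSM encoding):
```
package main

import "fmt"

// dict assigns each distinct string a small integer ID in first-seen order.
type dict map[string]uint64

func (d dict) id(s string) uint64 {
	if v, ok := d[s]; ok {
		return v
	}
	v := uint64(len(d) + 1)
	d[s] = v
	return v
}

func main() {
	meas, tagKeys, tagVals, fields := dict{}, dict{}, dict{}, dict{}
	for _, k := range [][4]string{
		{"cpu", "host", "server1", "value"},
		{"cpu", "host", "server2", "value"},
		{"memory", "host", "server1", "value"},
	} {
		fmt.Println(meas.id(k[0]), tagKeys.id(k[1]), tagVals.id(k[2]), fields.id(k[3]))
	}
	// Output:
	// 1 1 1 1
	// 1 1 2 1
	// 2 1 1 1
}
```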
### Separate Index
Another option might be to have a separate index file (BoltDB) that serves as the storage for the `FileIndex` and is transient. This index would be recreated at startup and updated at compaction time.
# Components
These are some of the high-level components and their responsibilities. These ideas are preliminary.
## WAL
* Append-only log composed of fixed size segment files.
* Writes are appended to the current segment
* Roll-over to new segment after filling the current segment
* Closed segments are never modified and used for startup and recovery as well as compactions.
* There is a single WAL for the store as opposed to a WAL per shard.
## Compactor
* Continuously running, iterative file storage optimizer
* Takes closed WAL files, existing TSM files and combines into one or more new TSM files
## Cache
* Hold recently written series data
* Has max size and a flushing limit
* When the flushing limit is crossed, a snapshot is taken and a compaction process for the related WAL segments is commenced.
* If a write comes in, the cache is too full, and the previous snapshot is still being compacted, the write will fail.
## Engine
* Maintains references to Cache, FileStore, WAL, etc..
* Creates a cursor
* Receives writes, coordinates queries
* Hides underlying files and types from clients
## Cursor
* Iterates forward or reverse for given key
* Requests values from Engine for key and timestamp
* Has no knowledge of TSM files or WAL - delegates to Engine to request next set of Values
## FileStore
* Manages TSM files
* Maintains the file indexes and references to active files
* Opening a TSM file entails reading in its index section and adding it to the `FileIndex`. The block data is then MMAPed up to the index offset to avoid having the index in memory twice.
## FileIndex
* Provides location information to a file and block for a given key and timestamp.
## Interfaces
```
// SeriesIterator returns the key and []Value such that a key is only returned
// once and subsequent calls to Next() do not return the same key twice.
type SeriesIterator interface {
Next() (key, []Value, error)
}
```
## Types
_NOTE: the actual func names are to illustrate the type of functionality each type is responsible for._
```
// TSMWriter writes sets of keys and Values to a TSM file.
type TSMWriter struct {}
func (t *TSMWriter) Write(key string, values []Value) error {}
func (t *TSMWriter) Close() error
```
```
// WALIterator returns the key and []Values for a set of WAL segment files.
type WALIterator struct{
Files []*os.File
}
func (r *WALIterator) Next() (key, []Value, error)
```
```
// TSMIterator returns the key and values from a TSM file.
type TSMIterator struct {}
func (r *TSMIterator) Next() (key, []Value, error)
```
```
type Compactor struct {}
func (c *Compactor) Compact(iters ...SeriesIterator) error
```
```
type Engine struct {
wal *WAL
cache *Cache
fileStore *FileStore
compactor *Compactor
}
func (e *Engine) ValuesBefore(key string, timestamp time.Time) ([]Value, error)
func (e *Engine) ValuesAfter(key string, timestamp time.Time) ([]Value, error)
```
```
type Cursor struct{
engine *Engine
}
...
```
```
// FileStore maintains references
type FileStore struct {}
func (f *FileStore) ValuesBefore(key string, timestamp time.Time) ([]Value, error)
func (f *FileStore) ValuesAfter(key string, timestamp time.Time) ([]Value, error)
```
```
type FileIndex struct {}
// Location returns the file, and the offset within it, of the block that contains the requested key and timestamp.
func (f *FileIndex) Location(key, timestamp) (*os.File, uint64, error)
```
```
type Cache struct {}
func (c *Cache) Write(key string, values []Value, checkpoint uint64) error
func (c *Cache) SetCheckpoint(checkpoint uint64) error
func (c *Cache) Cursor(key string) tsdb.Cursor
```
```
type WAL struct {}
func (w *WAL) Write(key string, values []Value)
func (w *WAL) ClosedSegments() ([]*os.File, error)
```
# Concerns
## Performance
There are several categories of performance this design is concerned with:
* Write Throughput/Latency
* Query Throughput/Latency
* Startup time
* Compaction Throughput/Latency
* Memory Usage
### Writes
Write throughput is bounded by the time to process the write on the CPU (parsing, sorting, etc.), adding to and evicting from the Cache, and appending the write to the WAL. The first two items are CPU bound and can be tuned and optimized if they become a bottleneck. The WAL write can be tuned such that in the worst case every write requires at least 2 IOPS (write + fsync) or batched so that multiple writes are queued and fsync'd in sizes matching one or more disk blocks. Performing more work with each IO will improve throughput.
Write latency is minimal for the WAL write since there are no seeks. The latency is bounded by the time to complete any write and fsync calls.
### Queries
Query throughput is directly related to how many blocks can be read in a period of time. The index structure contains enough information to determine if one or multiple blocks can be read in a single IO.
Query latency is determined by how long it takes to find and read the relevant blocks. The in-memory index structure contains the offsets and sizes of all blocks for a key. This allows every block to be read in 2 IOPS (seek + read) regardless of position, structure or size of file.
### Startup
Startup time is proportional to the number of WAL files, TSM files and tombstone files. WAL files can be read and processed in large batches using the WALIterators. TSM files require reading the index block into memory (5 IOPS/file). Tombstone files are expected to be small and infrequent and would require approximately 2 IOPS/file.
### Compactions
Compactions are IO intensive in that they may need to read multiple, large TSM files to rewrite them. The throughput of compactions (MB/s) as well as the latency of each compaction is important to keep consistent even as data sizes grow.
To address these concerns, compactions prioritize old WAL files over optimizing storage/compression to avoid data being hidden during overload situations. This also accounts for the fact that shards will eventually become cold for writes so that existing data will be able to be optimized. To maintain consistent performance, the number of each type of file processed as well as the size of each file processed is bounded.
### Memory Footprint
The memory footprint should not grow unbounded due to additional files or series keys of large sizes or numbers. Some options for addressing this concern are covered in the [Design Options] section.
## Concurrency
The main concern with concurrency is that reads and writes should not block each other. Writes add entries to the Cache and append entries to the WAL. During queries, the contention points will be the Cache and existing TSM files. Since the Cache and TSM file data is only accessed through the engine by the cursors, several strategies can be used to improve concurrency.
1. Cached series data is returned to cursors as a copy. Since cache snapshots are released following compaction, cursor iteration and writes to the same series could block each other. Iterating over copies of the values can relieve some of this contention.
2. TSM data values returned by the engine are new references to Values and not access to the actual TSM files. This means that the `Engine`, through the `FileStore` can limit contention.
3. Compactions are the only place where new TSM files are added and removed. Since this is a serial, continuously running process, file contention is minimized.
## Robustness
The two robustness concerns considered by this design are writes filling the cache and crash recovery.
### Cache Exhaustion
The cache is used to hold the contents of uncompacted WAL segments in memory until such time that the compaction process has had a chance to convert the write-optimised WAL segments into read-optimised TSM files.
The question arises about what to do in the case that the inbound write rate temporarily exceeds the compaction rate. There are four alternatives:
* block the write until the compaction process catches up
* cache the write and hope that the compaction process catches up before memory exhaustion occurs
* evict older cache entries to make room for new writes
* fail the write and propagate the error back to the database client as a form of back pressure
The current design chooses the last option - failing the writes. While this option reduces the apparent robustness of the database API from the perspective of the clients, it does provide a means by which the database can communicate, via back pressure, the need for clients to back off temporarily. Well-behaved clients should respond to write errors either by discarding the write or by retrying the write after a delay in the hope that the compaction process will eventually catch up. The problem with the first two options is that they may exhaust server resources. The problem with the third option is that queries (which don't touch WAL segments) might silently return incomplete results during compaction periods; with the selected option the possibility of incomplete queries is at least flagged by the presence of write errors during periods of degraded compaction performance.
### Crash Recovery
Crash recovery is facilitated by the following two properties: the append-only nature of WAL segments and the write-once nature of TSM files. If the server crashes, incomplete compactions are discarded and the cache is rebuilt from the discovered WAL segments. Compactions will then resume in the normal way. Similarly, TSM files are immutable once they have been created and registered with the file store. A compaction may replace an existing TSM file, but the replaced file is not removed from the file system until the replacement file has been created and synced to disk.
# Errata
This section is reserved for errata. In cases where the document is incorrect or inconsistent, such errata will be noted here with the contents of this section taking precedence over text elsewhere in the document in the case of discrepancies. Future full revisions of this document will fold the errata text back into the body of the document.
# Revisions
## 14 February, 2016
* refined description of cache behaviour and robustness to reflect current design based on snapshots. Most references to checkpoints and evictions have been removed. See discussion here - https://goo.gl/L7AzVu
## 11 November, 2015
* initial design published


@@ -0,0 +1,5 @@
{
"files": [
"00000001.tsl"
]
}


@@ -0,0 +1,133 @@
package tsm1
import "io"
// BitReader reads bits from an io.Reader.
type BitReader struct {
data []byte
buf struct {
v uint64 // bit buffer
n uint // available bits
}
}
// NewBitReader returns a new instance of BitReader that reads from data.
func NewBitReader(data []byte) *BitReader {
b := new(BitReader)
b.Reset(data)
return b
}
// Reset sets the underlying reader on b and reinitializes.
func (r *BitReader) Reset(data []byte) {
r.data = data
r.buf.v, r.buf.n = 0, 0
r.readBuf()
}
// CanReadBitFast returns true if calling ReadBitFast() is allowed.
// Fast bit reads are allowed when at least 2 values are in the buffer.
// This is because the buffer does not need to be refilled and the caller
// can inline the calls.
func (r *BitReader) CanReadBitFast() bool { return r.buf.n > 1 }
// ReadBitFast is an optimized bit read.
// IMPORTANT: Only allowed if CanReadBitFast() is true!
func (r *BitReader) ReadBitFast() bool {
v := (r.buf.v&(1<<63) != 0)
r.buf.v <<= 1
r.buf.n -= 1
return v
}
// ReadBit returns the next bit from the underlying data.
func (r *BitReader) ReadBit() (bool, error) {
v, err := r.ReadBits(1)
return v != 0, err
}
// ReadBits reads nbits from the underlying data into a uint64.
// nbits must be from 1 to 64, inclusive.
func (r *BitReader) ReadBits(nbits uint) (uint64, error) {
// Return EOF if there is no more data.
if r.buf.n == 0 {
return 0, io.EOF
}
// Return bits from the buffer if no more than the available bits are requested.
if nbits <= r.buf.n {
// Return all bits, if requested.
if nbits == 64 {
v := r.buf.v
r.buf.v, r.buf.n = 0, 0
r.readBuf()
return v, nil
}
// Otherwise mask returned bits.
v := (r.buf.v >> (64 - nbits))
r.buf.v <<= nbits
r.buf.n -= nbits
if r.buf.n == 0 {
r.readBuf()
}
return v, nil
}
// Otherwise read all available bits in current buffer.
v, n := r.buf.v, r.buf.n
// Read new buffer.
r.buf.v, r.buf.n = 0, 0
r.readBuf()
// Append new buffer to previous buffer and shift to remove unnecessary bits.
v |= (r.buf.v >> n)
v >>= 64 - nbits
// Remove used bits from new buffer.
bufN := nbits - n
if bufN > r.buf.n {
bufN = r.buf.n
}
r.buf.v <<= bufN
r.buf.n -= bufN
if r.buf.n == 0 {
r.readBuf()
}
return v, nil
}
func (r *BitReader) readBuf() {
// Determine number of bytes to read to fill buffer.
byteN := 8 - (r.buf.n / 8)
// Limit to the length of our data.
if n := uint(len(r.data)); byteN > n {
byteN = n
}
// Optimized 8-byte read.
if byteN == 8 {
r.buf.v = uint64(r.data[7]) | uint64(r.data[6])<<8 |
uint64(r.data[5])<<16 | uint64(r.data[4])<<24 |
uint64(r.data[3])<<32 | uint64(r.data[2])<<40 |
uint64(r.data[1])<<48 | uint64(r.data[0])<<56
r.buf.n = 64
r.data = r.data[8:]
return
}
// Otherwise append bytes to buffer.
for i := uint(0); i < byteN; i++ {
r.buf.n += 8
r.buf.v |= uint64(r.data[i]) << (64 - r.buf.n)
}
// Move data forward.
r.data = r.data[byteN:]
}
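A small usage sketch of the reader above:
```
package main

import (
	"fmt"

	"github.com/influxdata/influxdb/tsdb/engine/tsm1"
)

func main() {
	// 0x68 is 0110 1000 in binary; read it back as two 4-bit values.
	r := tsm1.NewBitReader([]byte{0x68})
	hi, _ := r.ReadBits(4) // 0110 -> 6
	lo, _ := r.ReadBits(4) // 1000 -> 8
	fmt.Println(hi, lo)    // 6 8
}
```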


@@ -0,0 +1,180 @@
package tsm1_test
import (
"bytes"
"io"
"math"
"math/rand"
"reflect"
"testing"
"testing/quick"
"github.com/dgryski/go-bitstream"
"github.com/influxdata/influxdb/tsdb/engine/tsm1"
)
func TestBitStreamEOF(t *testing.T) {
br := tsm1.NewBitReader([]byte("0"))
b, err := br.ReadBits(8)
if err != nil {
t.Fatal(err)
}
if b != '0' {
t.Error("ReadBits(8) didn't return first byte")
}
if _, err := br.ReadBits(8); err != io.EOF {
t.Error("ReadBits(8) on empty string didn't return EOF")
}
// 0 = 0b00110000
br = tsm1.NewBitReader([]byte("0"))
buf := bytes.NewBuffer(nil)
bw := bitstream.NewWriter(buf)
for i := 0; i < 4; i++ {
bit, err := br.ReadBit()
if err == io.EOF {
break
}
if err != nil {
t.Error("GetBit returned error err=", err.Error())
return
}
bw.WriteBit(bitstream.Bit(bit))
}
bw.Flush(bitstream.One)
err = bw.WriteByte(0xAA)
if err != nil {
t.Error("unable to WriteByte")
}
c := buf.Bytes()
if len(c) != 2 || c[1] != 0xAA || c[0] != 0x3f {
t.Error("bad return from 4 read bytes")
}
_, err = tsm1.NewBitReader([]byte("")).ReadBit()
if err != io.EOF {
t.Error("ReadBit on empty string didn't return EOF")
}
}
func TestBitStream(t *testing.T) {
buf := bytes.NewBuffer(nil)
br := tsm1.NewBitReader([]byte("hello"))
bw := bitstream.NewWriter(buf)
for {
bit, err := br.ReadBit()
if err == io.EOF {
break
}
if err != nil {
t.Error("GetBit returned error err=", err.Error())
return
}
bw.WriteBit(bitstream.Bit(bit))
}
s := buf.String()
if s != "hello" {
t.Error("expected 'hello', got=", []byte(s))
}
}
func TestByteStream(t *testing.T) {
buf := bytes.NewBuffer(nil)
br := tsm1.NewBitReader([]byte("hello"))
bw := bitstream.NewWriter(buf)
for i := 0; i < 3; i++ {
bit, err := br.ReadBit()
if err == io.EOF {
break
}
if err != nil {
t.Error("GetBit returned error err=", err.Error())
return
}
bw.WriteBit(bitstream.Bit(bit))
}
for i := 0; i < 3; i++ {
byt, err := br.ReadBits(8)
if err == io.EOF {
break
}
if err != nil {
t.Error("ReadBits(8) returned error err=", err.Error())
return
}
bw.WriteByte(byte(byt))
}
u, err := br.ReadBits(13)
if err != nil {
t.Error("ReadBits returned error err=", err.Error())
return
}
bw.WriteBits(u, 13)
bw.WriteBits(('!'<<12)|('.'<<4)|0x02, 20)
// 0x2f == '/'
bw.Flush(bitstream.One)
s := buf.String()
if s != "hello!./" {
t.Errorf("expected 'hello!./', got=%x", []byte(s))
}
}
// Ensure bit reader can read random bits written to a stream.
func TestBitReader_Quick(t *testing.T) {
if err := quick.Check(func(values []uint64, nbits []uint) bool {
// Limit nbits to 64.
for i := 0; i < len(values) && i < len(nbits); i++ {
nbits[i] = (nbits[i] % 64) + 1
values[i] = values[i] & (math.MaxUint64 >> (64 - nbits[i]))
}
// Write bits to a buffer.
var buf bytes.Buffer
w := bitstream.NewWriter(&buf)
for i := 0; i < len(values) && i < len(nbits); i++ {
w.WriteBits(values[i], int(nbits[i]))
}
w.Flush(bitstream.Zero)
// Read bits from the buffer.
r := tsm1.NewBitReader(buf.Bytes())
for i := 0; i < len(values) && i < len(nbits); i++ {
v, err := r.ReadBits(nbits[i])
if err != nil {
t.Errorf("unexpected error(%d): %s", i, err)
return false
} else if v != values[i] {
t.Errorf("value mismatch(%d): got=%d, exp=%d (nbits=%d)", i, v, values[i], nbits[i])
return false
}
}
return true
}, &quick.Config{
Values: func(a []reflect.Value, rand *rand.Rand) {
a[0], _ = quick.Value(reflect.TypeOf([]uint64{}), rand)
a[1], _ = quick.Value(reflect.TypeOf([]uint{}), rand)
},
}); err != nil {
t.Fatal(err)
}
}

View File

@@ -0,0 +1,174 @@
package tsm1
// boolean encoding uses 1 bit per value. Each compressed byte slice contains a 1 byte header
// indicating the compression type, followed by a variable byte encoded length indicating
// how many booleans are packed in the slice. The remaining bytes contain 1 byte for every
// 8 boolean values encoded.
import (
"encoding/binary"
"fmt"
)
const (
// booleanUncompressed is an uncompressed boolean format.
// Not yet implemented.
booleanUncompressed = 0
// booleanCompressedBitPacked is a bit-packed format using 1 bit per boolean value.
booleanCompressedBitPacked = 1
)
// BooleanEncoder encodes a series of booleans to an in-memory buffer.
type BooleanEncoder struct {
// The encoded bytes
bytes []byte
// The current byte being encoded
b byte
// The number of bools packed into b
i int
// The total number of bools written
n int
}
// NewBooleanEncoder returns a new instance of BooleanEncoder.
func NewBooleanEncoder(sz int) BooleanEncoder {
return BooleanEncoder{
bytes: make([]byte, 0, (sz+7)/8),
}
}
// Reset sets the encoder to its initial state.
func (e *BooleanEncoder) Reset() {
e.bytes = e.bytes[:0]
e.b = 0
e.i = 0
e.n = 0
}
// Write encodes b to the underlying buffer.
func (e *BooleanEncoder) Write(b bool) {
// If we have filled the current byte, flush it
if e.i >= 8 {
e.flush()
}
// Use 1 bit for each boolean value: shift the current byte
// by 1 and set the least significant bit accordingly.
e.b = e.b << 1
if b {
e.b |= 1
}
// Increment the current boolean count
e.i++
// Increment the total boolean count
e.n++
}
func (e *BooleanEncoder) flush() {
// Pad remaining byte w/ 0s
for e.i < 8 {
e.b = e.b << 1
e.i++
}
// If we have bits set, append them to the byte slice
if e.i > 0 {
e.bytes = append(e.bytes, e.b)
e.b = 0
e.i = 0
}
}
// Flush is a no-op.
func (e *BooleanEncoder) Flush() {}
// Bytes returns a new byte slice containing the encoded booleans from previous calls to Write.
func (e *BooleanEncoder) Bytes() ([]byte, error) {
// Ensure the current byte is flushed
e.flush()
b := make([]byte, 10+1)
// Store the encoding type in the 4 high bits of the first byte
b[0] = byte(booleanCompressedBitPacked) << 4
i := 1
// Encode the number of booleans written
i += binary.PutUvarint(b[i:], uint64(e.n))
// Append the packed booleans
return append(b[:i], e.bytes...), nil
}
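// Illustrative layout (derived from the encoder above and exercised by the
// tests in this package): writing the ten alternating values
// T,F,T,F,T,F,T,F,T,F yields exactly 4 bytes:
//
//	0x10  header: booleanCompressedBitPacked (1) in the high 4 bits
//	0x0A  uvarint count of encoded booleans (10)
//	0xAA  first 8 values, 0b10101010
//	0x80  last 2 values (0b10), zero-padded to a full byte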
// BooleanDecoder decodes a series of booleans from an in-memory buffer.
type BooleanDecoder struct {
b []byte
i int
n int
err error
}
// SetBytes initializes the decoder with a new set of bytes to read from.
// This must be called before calling any other methods.
func (e *BooleanDecoder) SetBytes(b []byte) {
if len(b) == 0 {
return
}
// The first byte stores the encoding type. Only one bit-packed format
// exists currently, so the type is ignored for now.
b = b[1:]
count, n := binary.Uvarint(b)
if n <= 0 {
e.err = fmt.Errorf("BooleanDecoder: invalid count")
return
}
e.b = b[n:]
e.i = -1
e.n = int(count)
if min := len(e.b) * 8; min < e.n {
// Shouldn't happen - TSM file was truncated/corrupted
e.n = min
}
}
// Next returns whether there are any bits remaining in the decoder.
// It returns false if there was an error decoding.
// The error is available on the Error method.
func (e *BooleanDecoder) Next() bool {
if e.err != nil {
return false
}
e.i++
return e.i < e.n
}
// Read returns the next bit from the decoder.
func (e *BooleanDecoder) Read() bool {
// Index into the byte slice
idx := e.i >> 3 // integer division by 8
// Bit position
pos := 7 - (e.i & 0x7)
// The mask to select the bit
mask := byte(1 << uint(pos))
// The packed byte
v := e.b[idx]
// Returns true if the bit is set
return v&mask == mask
}
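// Worked example for Read (illustrative): for the 11th boolean, e.i == 10,
// so idx = 10>>3 = 1 (the second packed byte) and pos = 7-(10&7) = 5,
// giving mask = 0b00100000; the result is bit 5 of e.b[1].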
// Error returns the error encountered during decoding, if one occurred.
func (e *BooleanDecoder) Error() error {
return e.err
}
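// Illustrative round trip (a sketch, not part of the original file) using
// the encoder and decoder defined above:
func exampleBooleanRoundTrip() ([]bool, error) {
	enc := NewBooleanEncoder(3)
	for _, v := range []bool{true, false, true} {
		enc.Write(v)
	}
	buf, err := enc.Bytes()
	if err != nil {
		return nil, err
	}
	var dec BooleanDecoder
	dec.SetBytes(buf)
	var got []bool
	for dec.Next() {
		got = append(got, dec.Read())
	}
	return got, dec.Error() // got == []bool{true, false, true}
}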

View File

@@ -0,0 +1,161 @@
package tsm1_test
import (
"reflect"
"testing"
"testing/quick"
"github.com/influxdata/influxdb/tsdb/engine/tsm1"
)
func Test_BooleanEncoder_NoValues(t *testing.T) {
enc := tsm1.NewBooleanEncoder(0)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var dec tsm1.BooleanDecoder
dec.SetBytes(b)
if dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
}
func Test_BooleanEncoder_Single(t *testing.T) {
enc := tsm1.NewBooleanEncoder(1)
v1 := true
enc.Write(v1)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var dec tsm1.BooleanDecoder
dec.SetBytes(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v1 != dec.Read() {
t.Fatalf("unexpected value: got %v, exp %v", dec.Read(), v1)
}
}
func Test_BooleanEncoder_Multi_Compressed(t *testing.T) {
enc := tsm1.NewBooleanEncoder(10)
values := make([]bool, 10)
for i := range values {
values[i] = i%2 == 0
enc.Write(values[i])
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if exp := 4; len(b) != exp {
t.Fatalf("unexpected length: got %v, exp %v", len(b), exp)
}
var dec tsm1.BooleanDecoder
dec.SetBytes(b)
for i, v := range values {
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v != dec.Read() {
t.Fatalf("unexpected value at pos %d: got %v, exp %v", i, dec.Read(), v)
}
}
if dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
}
func Test_BooleanEncoder_Quick(t *testing.T) {
if err := quick.Check(func(values []bool) bool {
expected := values
if values == nil {
expected = []bool{}
}
// Write values to encoder.
enc := tsm1.NewBooleanEncoder(1024)
for _, v := range values {
enc.Write(v)
}
// Retrieve compressed bytes.
buf, err := enc.Bytes()
if err != nil {
t.Fatal(err)
}
// Read values out of decoder.
got := make([]bool, 0, len(values))
var dec tsm1.BooleanDecoder
dec.SetBytes(buf)
for dec.Next() {
got = append(got, dec.Read())
}
// Verify that input and output values match.
if !reflect.DeepEqual(expected, got) {
t.Fatalf("mismatch:\n\nexp=%#v\n\ngot=%#v\n\n", expected, got)
}
return true
}, nil); err != nil {
t.Fatal(err)
}
}
func Test_BooleanDecoder_Corrupt(t *testing.T) {
cases := []string{
"", // Empty
"\x10\x90", // Packed: invalid count
"\x10\x7f", // Packed: count greater than remaining bits, multiple bytes expected
"\x10\x01", // Packed: count greater than remaining bits, one byte expected
}
for _, c := range cases {
var dec tsm1.BooleanDecoder
dec.SetBytes([]byte(c))
if dec.Next() {
t.Fatalf("exp next == false, got true for case %q", c)
}
}
}
func BenchmarkBooleanDecoder_2048(b *testing.B) { benchmarkBooleanDecoder(b, 2048) }
func benchmarkBooleanDecoder(b *testing.B, size int) {
e := tsm1.NewBooleanEncoder(size)
for i := 0; i < size; i++ {
e.Write(i&1 == 1)
}
bytes, err := e.Bytes()
if err != nil {
b.Fatalf("unexpected error: %v", err)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
var d tsm1.BooleanDecoder
d.SetBytes(bytes)
var n int
for d.Next() {
_ = d.Read()
n++
}
if n != size {
b.Fatalf("expected to read %d booleans, but read %d", size, n)
}
}
}

View File

@@ -0,0 +1,766 @@
package tsm1
import (
"fmt"
"math"
"os"
"sync"
"sync/atomic"
"time"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/tsdb"
"github.com/uber-go/zap"
)
// ringShards specifies the number of partitions that the hash ring used to
// store the entry mappings contains. It must be a power of 2. From empirical
// testing, a value above the number of cores on the machine does not provide
// any additional benefit. For now we'll set it to the number of cores on the
// largest box we could imagine running influx.
const ringShards = 4096
var (
// ErrSnapshotInProgress is returned if a snapshot is attempted while one is already running.
ErrSnapshotInProgress = fmt.Errorf("snapshot in progress")
)
// ErrCacheMemorySizeLimitExceeded returns an error indicating an operation
// could not be completed due to exceeding the cache-max-memory-size setting.
func ErrCacheMemorySizeLimitExceeded(n, limit uint64) error {
return fmt.Errorf("cache-max-memory-size exceeded: (%d/%d)", n, limit)
}
// entry is a set of values and some metadata.
type entry struct {
mu sync.RWMutex
values Values // All stored values.
// The type of values stored. Read only so doesn't need to be protected by
// mu.
vtype int
}
// newEntryValues returns a new instance of entry with the given values. If the
// values are not valid, an error is returned.
//
// newEntryValues takes an optional hint to indicate the initial buffer size.
// The hint is only respected if it's positive.
func newEntryValues(values []Value, hint int) (*entry, error) {
// Ensure we start off with a reasonably sized values slice.
if hint < 32 {
hint = 32
}
e := &entry{}
if len(values) > hint {
e.values = make(Values, 0, len(values))
} else {
e.values = make(Values, 0, hint)
}
e.values = append(e.values, values...)
// No values, don't check types and ordering
if len(values) == 0 {
return e, nil
}
et := valueType(values[0])
for _, v := range values {
// Make sure all the values are the same type
if et != valueType(v) {
return nil, tsdb.ErrFieldTypeConflict
}
}
// Set the type of values stored.
e.vtype = et
return e, nil
}
// add adds the given values to the entry.
func (e *entry) add(values []Value) error {
if len(values) == 0 {
return nil // Nothing to do.
}
// Are any of the new values the wrong type?
for _, v := range values {
if e.vtype != valueType(v) {
return tsdb.ErrFieldTypeConflict
}
}
// entry currently has no values, so add the new ones and we're done.
e.mu.Lock()
if len(e.values) == 0 {
// Ensure we start off with a reasonably sized values slice.
if len(values) < 32 {
e.values = make(Values, 0, 32)
e.values = append(e.values, values...)
} else {
e.values = values
}
e.mu.Unlock()
return nil
}
// Append the new values to the existing ones...
e.values = append(e.values, values...)
e.mu.Unlock()
return nil
}
// deduplicate sorts and orders the entry's values. If values are already deduped and sorted,
// the function does no work and simply returns.
func (e *entry) deduplicate() {
e.mu.Lock()
defer e.mu.Unlock()
if len(e.values) == 0 {
return
}
e.values = e.values.Deduplicate()
}
// count returns the number of values in this entry.
func (e *entry) count() int {
e.mu.RLock()
n := len(e.values)
e.mu.RUnlock()
return n
}
// filter removes all values with timestamps between min and max inclusive.
func (e *entry) filter(min, max int64) {
e.mu.Lock()
e.values = e.values.Exclude(min, max)
e.mu.Unlock()
}
// size returns the size of this entry in bytes.
func (e *entry) size() int {
e.mu.RLock()
sz := e.values.Size()
e.mu.RUnlock()
return sz
}
// InfluxQLType returns for the entry the data type of its values.
func (e *entry) InfluxQLType() (influxql.DataType, error) {
e.mu.RLock()
defer e.mu.RUnlock()
return e.values.InfluxQLType()
}
// Statistics gathered by the Cache.
const (
// levels - point in time measures
statCacheMemoryBytes = "memBytes" // level: Size of in-memory cache in bytes
statCacheDiskBytes = "diskBytes" // level: Size of on-disk snapshots in bytes
statSnapshots = "snapshotCount" // level: Number of active snapshots.
statCacheAgeMs = "cacheAgeMs" // level: Number of milliseconds since cache was last snapshotted at sample time
// counters - accumulative measures
statCachedBytes = "cachedBytes" // counter: Total number of bytes written into snapshots.
statWALCompactionTimeMs = "WALCompactionTimeMs" // counter: Total number of milliseconds spent compacting snapshots
statCacheWriteOK = "writeOk"
statCacheWriteErr = "writeErr"
statCacheWriteDropped = "writeDropped"
)
// storer is the interface that describes a cache's store.
type storer interface {
entry(key string) (*entry, bool) // Get an entry by its key.
write(key string, values Values) error // Write an entry to the store.
add(key string, entry *entry) // Add a new entry to the store.
remove(key string) // Remove an entry from the store.
keys(sorted bool) []string // Return an optionally sorted slice of entry keys.
apply(f func(string, *entry) error) error // Apply f to all entries in the store in parallel.
applySerial(f func(string, *entry) error) error // Apply f to all entries in serial.
reset() // Reset the store to an initial unused state.
}
// Cache maintains an in-memory store of Values for a set of keys.
type Cache struct {
// Due to a bug in atomic size needs to be the first word in the struct, as
// that's the only place where you're guaranteed to be 64-bit aligned on a
// 32 bit system. See: https://golang.org/pkg/sync/atomic/#pkg-note-BUG
size uint64
snapshotSize uint64
mu sync.RWMutex
store storer
maxSize uint64
// snapshots are the cache objects that are currently being written to tsm files
// they're kept in memory while flushing so they can be queried along with the cache.
// they are read only and should never be modified
snapshot *Cache
snapshotting bool
// This number is the number of pending or failed WriteSnapshot attempts since the last successful one.
snapshotAttempts int
stats *CacheStatistics
lastSnapshot time.Time
// A one-time synchronization used to initialize the cache with a store. Since the store
// can allocate a large amount of memory across shards, we create it lazily.
initialize atomic.Value
initializedCount uint32
}
// NewCache returns an instance of a cache which will use a maximum of maxSize bytes of memory.
// Only used for engine caches, never for snapshots.
func NewCache(maxSize uint64, path string) *Cache {
c := &Cache{
maxSize: maxSize,
store: emptyStore{},
stats: &CacheStatistics{},
lastSnapshot: time.Now(),
}
c.initialize.Store(&sync.Once{})
c.UpdateAge()
c.UpdateCompactTime(0)
c.updateCachedBytes(0)
c.updateMemSize(0)
c.updateSnapshots()
return c
}
// CacheStatistics hold statistics related to the cache.
type CacheStatistics struct {
MemSizeBytes int64
DiskSizeBytes int64
SnapshotCount int64
CacheAgeMs int64
CachedBytes int64
WALCompactionTimeMs int64
WriteOK int64
WriteErr int64
WriteDropped int64
}
// Statistics returns statistics for periodic monitoring.
func (c *Cache) Statistics(tags map[string]string) []models.Statistic {
return []models.Statistic{{
Name: "tsm1_cache",
Tags: tags,
Values: map[string]interface{}{
statCacheMemoryBytes: atomic.LoadInt64(&c.stats.MemSizeBytes),
statCacheDiskBytes: atomic.LoadInt64(&c.stats.DiskSizeBytes),
statSnapshots: atomic.LoadInt64(&c.stats.SnapshotCount),
statCacheAgeMs: atomic.LoadInt64(&c.stats.CacheAgeMs),
statCachedBytes: atomic.LoadInt64(&c.stats.CachedBytes),
statWALCompactionTimeMs: atomic.LoadInt64(&c.stats.WALCompactionTimeMs),
statCacheWriteOK: atomic.LoadInt64(&c.stats.WriteOK),
statCacheWriteErr: atomic.LoadInt64(&c.stats.WriteErr),
statCacheWriteDropped: atomic.LoadInt64(&c.stats.WriteDropped),
},
}}
}
// init initializes the cache and allocates the underlying store. Once initialized,
// the store is re-used until Free is called.
func (c *Cache) init() {
if !atomic.CompareAndSwapUint32(&c.initializedCount, 0, 1) {
return
}
c.mu.Lock()
c.store, _ = newring(ringShards)
c.mu.Unlock()
}
// Free releases the underlying store and memory held by the Cache.
func (c *Cache) Free() {
if !atomic.CompareAndSwapUint32(&c.initializedCount, 1, 0) {
return
}
c.mu.Lock()
c.store = emptyStore{}
c.mu.Unlock()
}
// Write writes the set of values for the key to the cache. This function is goroutine-safe.
// It returns an error if the cache will exceed its max size by adding the new values.
func (c *Cache) Write(key string, values []Value) error {
c.init()
addedSize := uint64(Values(values).Size())
// Enough room in the cache?
limit := c.maxSize
n := c.Size() + addedSize
if limit > 0 && n > limit {
atomic.AddInt64(&c.stats.WriteErr, 1)
return ErrCacheMemorySizeLimitExceeded(n, limit)
}
if err := c.store.write(key, values); err != nil {
atomic.AddInt64(&c.stats.WriteErr, 1)
return err
}
// Update the cache size and the memory size stat.
c.increaseSize(addedSize)
c.updateMemSize(int64(addedSize))
atomic.AddInt64(&c.stats.WriteOK, 1)
return nil
}
// WriteMulti writes the map of keys and associated values to the cache. This
// function is goroutine-safe. It returns an error if the cache will exceed
// its max size by adding the new values. The write attempts to write as many
// values as possible. If one key fails, the others can still succeed and an
// error will be returned.
func (c *Cache) WriteMulti(values map[string][]Value) error {
c.init()
var addedSize uint64
for _, v := range values {
addedSize += uint64(Values(v).Size())
}
// Enough room in the cache?
limit := c.maxSize // maxSize is safe for reading without a lock.
n := c.Size() + addedSize
if limit > 0 && n > limit {
atomic.AddInt64(&c.stats.WriteErr, 1)
return ErrCacheMemorySizeLimitExceeded(n, limit)
}
var werr error
c.mu.RLock()
store := c.store
c.mu.RUnlock()
// We'll optimistically set size here, and then decrement it for write errors.
c.increaseSize(addedSize)
for k, v := range values {
if err := store.write(k, v); err != nil {
// The write failed, hold onto the error and adjust the size delta.
werr = err
addedSize -= uint64(Values(v).Size())
c.decreaseSize(uint64(Values(v).Size()))
}
}
// Some points in the batch were dropped. An error is returned so
// the error stat is incremented as well.
if werr != nil {
atomic.AddInt64(&c.stats.WriteDropped, 1)
atomic.AddInt64(&c.stats.WriteErr, 1)
}
// Update the memory size stat
c.updateMemSize(int64(addedSize))
atomic.AddInt64(&c.stats.WriteOK, 1)
return werr
}
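// Illustrative accounting (a sketch with hypothetical sizes): for a batch
// {a: 8B, b: 8B, c: 8B} where only b fails, the optimistic +24 is corrected
// by -8, the two successful keys stay cached, and the caller sees b's write
// error alongside incremented writeDropped/writeErr stats.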
// Snapshot takes a snapshot of the current cache, adds it to the slice of caches that
// are being flushed, and resets the current cache with new values.
func (c *Cache) Snapshot() (*Cache, error) {
c.init()
c.mu.Lock()
defer c.mu.Unlock()
if c.snapshotting {
return nil, ErrSnapshotInProgress
}
c.snapshotting = true
c.snapshotAttempts++ // increment the number of times we tried to do this
// If no snapshot exists, create a new one, otherwise update the existing snapshot
if c.snapshot == nil {
store, err := newring(ringShards)
if err != nil {
return nil, err
}
c.snapshot = &Cache{
store: store,
}
}
// Did a prior snapshot exist that failed? If so, return the existing
// snapshot to retry.
if c.snapshot.Size() > 0 {
return c.snapshot, nil
}
c.snapshot.store, c.store = c.store, c.snapshot.store
snapshotSize := c.Size()
// Save the size of the snapshot on the snapshot cache
atomic.StoreUint64(&c.snapshot.size, snapshotSize)
// Save the size of the snapshot on the live cache
atomic.StoreUint64(&c.snapshotSize, snapshotSize)
// Reset the cache's store.
c.store.reset()
atomic.StoreUint64(&c.size, 0)
c.lastSnapshot = time.Now()
c.updateCachedBytes(snapshotSize) // increment the number of bytes added to the snapshot
c.updateSnapshots()
return c.snapshot, nil
}
// Deduplicate sorts the snapshot before returning it. The compactor and any queries
// coming in while it writes will need the values sorted.
func (c *Cache) Deduplicate() {
c.mu.RLock()
store := c.store
c.mu.RUnlock()
// Apply a function that simply calls deduplicate on each entry in the ring.
// apply cannot return an error in this invocation.
_ = store.apply(func(_ string, e *entry) error { e.deduplicate(); return nil })
}
// ClearSnapshot removes the snapshot cache from the list of flushing caches and
// adjusts the size.
func (c *Cache) ClearSnapshot(success bool) {
c.init()
c.mu.Lock()
defer c.mu.Unlock()
c.snapshotting = false
if success {
c.snapshotAttempts = 0
c.updateMemSize(-int64(atomic.LoadUint64(&c.snapshotSize))) // decrement the number of bytes in cache
// Reset the snapshot's store, and reset the snapshot to a fresh Cache.
c.snapshot.store.reset()
c.snapshot = &Cache{
store: c.snapshot.store,
}
atomic.StoreUint64(&c.snapshotSize, 0)
c.updateSnapshots()
}
}
// Size returns the number of point-calculated bytes the cache currently uses.
func (c *Cache) Size() uint64 {
return atomic.LoadUint64(&c.size) + atomic.LoadUint64(&c.snapshotSize)
}
// increaseSize increases size by delta.
func (c *Cache) increaseSize(delta uint64) {
atomic.AddUint64(&c.size, delta)
}
// decreaseSize decreases size by delta.
func (c *Cache) decreaseSize(delta uint64) {
// Per sync/atomic docs, bit-flip delta minus one to perform subtraction within AddUint64.
atomic.AddUint64(&c.size, ^(delta - 1))
}
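// Worked example (illustrative): in two's complement, ^(delta-1) == -delta,
// so for delta == 16, ^(16-1) == 0xFFFFFFFFFFFFFFF0 and AddUint64 wraps
// size around to size-16, i.e. an atomic subtraction.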
// MaxSize returns the maximum number of bytes the cache may consume.
func (c *Cache) MaxSize() uint64 {
return c.maxSize
}
// Keys returns a sorted slice of all keys under management by the cache.
func (c *Cache) Keys() []string {
c.mu.RLock()
store := c.store
c.mu.RUnlock()
return store.keys(true)
}
// unsortedKeys returns a slice of all keys under management by the cache. The
// keys are not sorted.
func (c *Cache) unsortedKeys() []string {
c.mu.RLock()
store := c.store
c.mu.RUnlock()
return store.keys(false)
}
// Values returns a copy of all values, deduped and sorted, for the given key.
func (c *Cache) Values(key string) Values {
var snapshotEntries *entry
c.mu.RLock()
e, ok := c.store.entry(key)
if c.snapshot != nil {
snapshotEntries, _ = c.snapshot.store.entry(key)
}
c.mu.RUnlock()
if !ok {
if snapshotEntries == nil {
// No values in hot cache or snapshots.
return nil
}
} else {
e.deduplicate()
}
// Build the sequence of entries that will be returned, in the correct order.
// Calculate the required size of the destination buffer.
var entries []*entry
sz := 0
if snapshotEntries != nil {
snapshotEntries.deduplicate() // guarantee we are deduplicated
entries = append(entries, snapshotEntries)
sz += snapshotEntries.count()
}
if e != nil {
entries = append(entries, e)
sz += e.count()
}
// Any entries? If not, return.
if sz == 0 {
return nil
}
// Create the buffer, and copy all hot values and snapshots. Individual
// entries are sorted at this point, so now the code has to check if the
// resultant buffer will be sorted from start to finish.
values := make(Values, sz)
n := 0
for _, e := range entries {
e.mu.RLock()
n += copy(values[n:], e.values)
e.mu.RUnlock()
}
values = values[:n]
values = values.Deduplicate()
return values
}
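// Note on ordering (illustrative): snapshot values are copied into the
// buffer before hot-cache values, and the final Deduplicate pass sorts the
// combined result by timestamp and collapses duplicates, so callers always
// receive a single sorted, deduped slice.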
// Delete removes all values for the given keys from the cache.
func (c *Cache) Delete(keys []string) {
c.DeleteRange(keys, math.MinInt64, math.MaxInt64)
}
// DeleteRange removes the values for all keys containing points
// with timestamps between min and max from the cache.
//
// TODO(edd): Lock usage could possibly be optimised if necessary.
func (c *Cache) DeleteRange(keys []string, min, max int64) {
c.init()
c.mu.Lock()
defer c.mu.Unlock()
for _, k := range keys {
// Make sure the key exists in the cache; skip it if it does not.
e, ok := c.store.entry(k)
if !ok {
continue
}
origSize := uint64(e.size())
if min == math.MinInt64 && max == math.MaxInt64 {
c.decreaseSize(origSize)
c.store.remove(k)
continue
}
e.filter(min, max)
if e.count() == 0 {
c.store.remove(k)
c.decreaseSize(origSize)
continue
}
c.decreaseSize(origSize - uint64(e.size()))
}
atomic.StoreInt64(&c.stats.MemSizeBytes, int64(c.Size()))
}
// SetMaxSize updates the memory limit of the cache.
func (c *Cache) SetMaxSize(size uint64) {
c.mu.Lock()
c.maxSize = size
c.mu.Unlock()
}
// values returns the values for the key. It assumes the data is already sorted.
// It doesn't lock the cache but it does read-lock the entry if there is one for the key.
// values should only be used in compact.go in the CacheKeyIterator.
func (c *Cache) values(key string) Values {
e, _ := c.store.entry(key)
if e == nil {
return nil
}
e.mu.RLock()
v := e.values
e.mu.RUnlock()
return v
}
// ApplyEntryFn applies the function f to each entry in the Cache.
// ApplyEntryFn calls f on each entry in turn, within the same goroutine.
// It is safe for use by multiple goroutines.
func (c *Cache) ApplyEntryFn(f func(key string, entry *entry) error) error {
c.mu.RLock()
store := c.store
c.mu.RUnlock()
return store.applySerial(f)
}
// CacheLoader processes a set of WAL segment files, and loads a cache with the data
// contained within those files. Processing of the supplied files takes place in the
// order they exist in the files slice.
type CacheLoader struct {
files []string
Logger zap.Logger
}
// NewCacheLoader returns a new instance of a CacheLoader.
func NewCacheLoader(files []string) *CacheLoader {
return &CacheLoader{
files: files,
Logger: zap.New(zap.NullEncoder()),
}
}
// Load returns a cache loaded with the data contained within the segment files.
// If, during reading of a segment file, corruption is encountered, that segment
// file is truncated up to and including the last valid byte, and processing
// continues with the next segment file.
func (cl *CacheLoader) Load(cache *Cache) error {
var r *WALSegmentReader
for _, fn := range cl.files {
if err := func() error {
f, err := os.OpenFile(fn, os.O_CREATE|os.O_RDWR, 0666)
if err != nil {
return err
}
defer f.Close()
// Log some information about the segments.
stat, err := os.Stat(f.Name())
if err != nil {
return err
}
cl.Logger.Info(fmt.Sprintf("reading file %s, size %d", f.Name(), stat.Size()))
// Nothing to read, skip it
if stat.Size() == 0 {
return nil
}
if r == nil {
r = NewWALSegmentReader(f)
defer r.Close()
} else {
r.Reset(f)
}
for r.Next() {
entry, err := r.Read()
if err != nil {
n := r.Count()
cl.Logger.Info(fmt.Sprintf("file %s corrupt at position %d, truncating", f.Name(), n))
if err := f.Truncate(n); err != nil {
return err
}
break
}
switch t := entry.(type) {
case *WriteWALEntry:
if err := cache.WriteMulti(t.Values); err != nil {
return err
}
case *DeleteRangeWALEntry:
cache.DeleteRange(t.Keys, t.Min, t.Max)
case *DeleteWALEntry:
cache.Delete(t.Keys)
}
}
return r.Close()
}(); err != nil {
return err
}
}
return nil
}
// WithLogger sets the logger on the CacheLoader.
func (cl *CacheLoader) WithLogger(log zap.Logger) {
cl.Logger = log.With(zap.String("service", "cacheloader"))
}
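// Illustrative recovery flow (a sketch, not part of the original file):
// rebuild a cache from WAL segments on startup using the loader above. The
// segment paths and size limit are hypothetical placeholders.
func exampleCacheRecovery(segmentPaths []string) (*Cache, error) {
	cache := NewCache(1<<30, "")
	loader := NewCacheLoader(segmentPaths)
	if err := loader.Load(cache); err != nil {
		return nil, err
	}
	return cache, nil
}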
// UpdateAge updates the age statistic based on the current time.
func (c *Cache) UpdateAge() {
c.mu.RLock()
defer c.mu.RUnlock()
ageStat := int64(time.Since(c.lastSnapshot) / time.Millisecond)
atomic.StoreInt64(&c.stats.CacheAgeMs, ageStat)
}
// UpdateCompactTime updates WAL compaction time statistic based on d.
func (c *Cache) UpdateCompactTime(d time.Duration) {
atomic.AddInt64(&c.stats.WALCompactionTimeMs, int64(d/time.Millisecond))
}
// updateCachedBytes increases the cachedBytes counter by b.
func (c *Cache) updateCachedBytes(b uint64) {
atomic.AddInt64(&c.stats.CachedBytes, int64(b))
}
// updateMemSize updates the memSize level by b.
func (c *Cache) updateMemSize(b int64) {
atomic.AddInt64(&c.stats.MemSizeBytes, b)
}
func valueType(v Value) int {
switch v.(type) {
case FloatValue:
return 1
case IntegerValue:
return 2
case StringValue:
return 3
case BooleanValue:
return 4
default:
return 0
}
}
// updateSnapshots updates the snapshotsCount and the diskSize levels.
func (c *Cache) updateSnapshots() {
// Update disk stats
atomic.StoreInt64(&c.stats.DiskSizeBytes, int64(atomic.LoadUint64(&c.snapshotSize)))
atomic.StoreInt64(&c.stats.SnapshotCount, int64(c.snapshotAttempts))
}
type emptyStore struct{}
func (e emptyStore) entry(key string) (*entry, bool) { return nil, false }
func (e emptyStore) write(key string, values Values) error { return nil }
func (e emptyStore) add(key string, entry *entry) {}
func (e emptyStore) remove(key string) {}
func (e emptyStore) keys(sorted bool) []string { return nil }
func (e emptyStore) apply(f func(string, *entry) error) error { return nil }
func (e emptyStore) applySerial(f func(string, *entry) error) error { return nil }
func (e emptyStore) reset() {}
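// Illustrative write/snapshot lifecycle (a sketch, not part of the original
// file), mirroring how the tests in this package drive the Cache: writes
// accumulate in the hot store, Snapshot swaps the store out for compaction,
// and ClearSnapshot(true) releases it once the data is safely on disk. The
// key name is a hypothetical placeholder.
func exampleCacheLifecycle() error {
	c := NewCache(1<<20, "")
	if err := c.Write("foo", []Value{NewValue(1, 1.0)}); err != nil {
		return err
	}
	snap, err := c.Snapshot() // hot data moves to the snapshot cache
	if err != nil {
		return err
	}
	snap.Deduplicate()    // sort/dedup before it is written to a TSM file
	c.ClearSnapshot(true) // success: drop the snapshot and its size
	return nil
}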

View File

@@ -0,0 +1,206 @@
package tsm1_test
import (
"fmt"
"math/rand"
"sync"
"testing"
"github.com/influxdata/influxdb/tsdb/engine/tsm1"
)
func TestCacheCheckConcurrentReadsAreSafe(t *testing.T) {
values := make(tsm1.Values, 1000)
timestamps := make([]int64, len(values))
series := make([]string, 100)
for i := range timestamps {
timestamps[i] = int64(rand.Int63n(int64(len(values))))
}
for i := range values {
values[i] = tsm1.NewValue(timestamps[i*len(timestamps)/len(values)], float64(i))
}
for i := range series {
series[i] = fmt.Sprintf("series%d", i)
}
wg := sync.WaitGroup{}
c := tsm1.NewCache(1000000, "")
ch := make(chan struct{})
for _, s := range series {
for _, v := range values {
c.Write(s, tsm1.Values{v})
}
wg.Add(3)
go func(s string) {
defer wg.Done()
<-ch
c.Values(s)
}(s)
go func(s string) {
defer wg.Done()
<-ch
c.Values(s)
}(s)
go func(s string) {
defer wg.Done()
<-ch
c.Values(s)
}(s)
}
close(ch)
wg.Wait()
}
func TestCacheRace(t *testing.T) {
values := make(tsm1.Values, 1000)
timestamps := make([]int64, len(values))
series := make([]string, 100)
for i := range timestamps {
timestamps[i] = int64(rand.Int63n(int64(len(values))))
}
for i := range values {
values[i] = tsm1.NewValue(timestamps[i*len(timestamps)/len(values)], float64(i))
}
for i := range series {
series[i] = fmt.Sprintf("series%d", i)
}
wg := sync.WaitGroup{}
c := tsm1.NewCache(1000000, "")
ch := make(chan struct{})
for _, s := range series {
for _, v := range values {
c.Write(s, tsm1.Values{v})
}
wg.Add(1)
go func(s string) {
defer wg.Done()
<-ch
c.Values(s)
}(s)
}
errC := make(chan error)
wg.Add(1)
go func() {
defer wg.Done()
<-ch
s, err := c.Snapshot()
if err == tsm1.ErrSnapshotInProgress {
return
}
if err != nil {
errC <- fmt.Errorf("failed to snapshot cache: %v", err)
return
}
s.Deduplicate()
c.ClearSnapshot(true)
}()
close(ch)
go func() {
wg.Wait()
close(errC)
}()
for err := range errC {
if err != nil {
t.Error(err)
}
}
}
func TestCacheRace2Compacters(t *testing.T) {
values := make(tsm1.Values, 1000)
timestamps := make([]int64, len(values))
series := make([]string, 100)
for i := range timestamps {
timestamps[i] = int64(rand.Int63n(int64(len(values))))
}
for i := range values {
values[i] = tsm1.NewValue(timestamps[i*len(timestamps)/len(values)], float64(i))
}
for i := range series {
series[i] = fmt.Sprintf("series%d", i)
}
wg := sync.WaitGroup{}
c := tsm1.NewCache(1000000, "")
ch := make(chan struct{})
for _, s := range series {
for _, v := range values {
c.Write(s, tsm1.Values{v})
}
wg.Add(1)
go func(s string) {
defer wg.Done()
<-ch
c.Values(s)
}(s)
}
fileCounter := 0
mapFiles := map[int]bool{}
mu := sync.Mutex{}
errC := make(chan error)
for i := 0; i < 2; i++ {
wg.Add(1)
go func() {
defer wg.Done()
<-ch
s, err := c.Snapshot()
if err == tsm1.ErrSnapshotInProgress {
return
}
if err != nil {
errC <- fmt.Errorf("failed to snapshot cache: %v", err)
return
}
mu.Lock()
mapFiles[fileCounter] = true
fileCounter++
myFiles := map[int]bool{}
for k, e := range mapFiles {
myFiles[k] = e
}
mu.Unlock()
s.Deduplicate()
c.ClearSnapshot(true)
mu.Lock()
defer mu.Unlock()
for k := range myFiles {
if _, ok := mapFiles[k]; !ok {
errC <- fmt.Errorf("something else deleted one of my files")
return
} else {
delete(mapFiles, k)
}
}
}()
}
close(ch)
go func() {
wg.Wait()
close(errC)
}()
for err := range errC {
if err != nil {
t.Error(err)
}
}
}

View File

@@ -0,0 +1,883 @@
package tsm1
import (
"errors"
"fmt"
"io/ioutil"
"math"
"math/rand"
"os"
"reflect"
"runtime"
"strings"
"sync"
"sync/atomic"
"testing"
"github.com/golang/snappy"
)
func TestCache_NewCache(t *testing.T) {
c := NewCache(100, "")
if c == nil {
t.Fatalf("failed to create new cache")
}
if c.MaxSize() != 100 {
t.Fatalf("new cache max size not correct")
}
if c.Size() != 0 {
t.Fatalf("new cache size not correct")
}
if len(c.Keys()) != 0 {
t.Fatalf("new cache keys not correct: %v", c.Keys())
}
}
func TestCache_CacheWrite(t *testing.T) {
v0 := NewValue(1, 1.0)
v1 := NewValue(2, 2.0)
v2 := NewValue(3, 3.0)
values := Values{v0, v1, v2}
valuesSize := uint64(v0.Size() + v1.Size() + v2.Size())
c := NewCache(3*valuesSize, "")
if err := c.Write("foo", values); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
if err := c.Write("bar", values); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
if n := c.Size(); n != 2*valuesSize {
t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", 2*valuesSize, n)
}
if exp, keys := []string{"bar", "foo"}, c.Keys(); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys)
}
}
func TestCache_CacheWrite_TypeConflict(t *testing.T) {
v0 := NewValue(1, 1.0)
v1 := NewValue(2, int(64))
values := Values{v0, v1}
valuesSize := v0.Size() + v1.Size()
c := NewCache(uint64(2*valuesSize), "")
if err := c.Write("foo", values[:1]); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
if err := c.Write("foo", values[1:]); err == nil {
t.Fatalf("expected field type conflict")
}
if exp, got := uint64(v0.Size()), c.Size(); exp != got {
t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", exp, got)
}
}
func TestCache_CacheWriteMulti(t *testing.T) {
v0 := NewValue(1, 1.0)
v1 := NewValue(2, 2.0)
v2 := NewValue(3, 3.0)
values := Values{v0, v1, v2}
valuesSize := uint64(v0.Size() + v1.Size() + v2.Size())
c := NewCache(30*valuesSize, "")
if err := c.WriteMulti(map[string][]Value{"foo": values, "bar": values}); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
if n := c.Size(); n != 2*valuesSize {
t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", 2*valuesSize, n)
}
if exp, keys := []string{"bar", "foo"}, c.Keys(); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys)
}
}
// Tests that the cache stats and size are correctly maintained during writes.
func TestCache_WriteMulti_Stats(t *testing.T) {
limit := uint64(1)
c := NewCache(limit, "")
ms := NewTestStore()
c.store = ms
// Not enough room in the cache.
v := NewValue(1, 1.0)
values := map[string][]Value{"foo": []Value{v, v}}
if got, exp := c.WriteMulti(values), ErrCacheMemorySizeLimitExceeded(uint64(v.Size()*2), limit); !reflect.DeepEqual(got, exp) {
t.Fatalf("got %q, expected %q", got, exp)
}
// Fail one of the values in the write.
c = NewCache(50, "")
c.init()
c.store = ms
ms.writef = func(key string, v Values) error {
if key == "foo" {
return errors.New("write failed")
}
return nil
}
values = map[string][]Value{"foo": []Value{v, v}, "bar": []Value{v}}
if got, exp := c.WriteMulti(values), errors.New("write failed"); !reflect.DeepEqual(got, exp) {
t.Fatalf("got %v, expected %v", got, exp)
}
// Cache size decreased correctly.
if got, exp := c.Size(), uint64(16); got != exp {
t.Fatalf("got %v, expected %v", got, exp)
}
// Write stats updated
if got, exp := c.stats.WriteDropped, int64(1); got != exp {
t.Fatalf("got %v, expected %v", got, exp)
} else if got, exp := c.stats.WriteErr, int64(1); got != exp {
t.Fatalf("got %v, expected %v", got, exp)
}
}
func TestCache_CacheWriteMulti_TypeConflict(t *testing.T) {
v0 := NewValue(1, 1.0)
v1 := NewValue(2, 2.0)
v2 := NewValue(3, int64(3))
values := Values{v0, v1, v2}
valuesSize := uint64(v0.Size() + v1.Size() + v2.Size())
c := NewCache(3*valuesSize, "")
if err := c.WriteMulti(map[string][]Value{"foo": values[:1], "bar": values[1:]}); err == nil {
t.Fatalf(" expected field type conflict")
}
if exp, got := uint64(v0.Size()), c.Size(); exp != got {
t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", exp, got)
}
if exp, keys := []string{"foo"}, c.Keys(); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys)
}
}
func TestCache_Cache_DeleteRange(t *testing.T) {
v0 := NewValue(1, 1.0)
v1 := NewValue(2, 2.0)
v2 := NewValue(3, 3.0)
values := Values{v0, v1, v2}
valuesSize := uint64(v0.Size() + v1.Size() + v2.Size())
c := NewCache(30*valuesSize, "")
if err := c.WriteMulti(map[string][]Value{"foo": values, "bar": values}); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
if n := c.Size(); n != 2*valuesSize {
t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", 2*valuesSize, n)
}
if exp, keys := []string{"bar", "foo"}, c.Keys(); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys)
}
c.DeleteRange([]string{"bar"}, 2, math.MaxInt64)
if exp, keys := []string{"bar", "foo"}, c.Keys(); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys)
}
if got, exp := c.Size(), valuesSize+uint64(v0.Size()); exp != got {
t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", exp, got)
}
if got, exp := len(c.Values("bar")), 1; got != exp {
t.Fatalf("cache values mismatch: got %v, exp %v", got, exp)
}
if got, exp := len(c.Values("foo")), 3; got != exp {
t.Fatalf("cache values mismatch: got %v, exp %v", got, exp)
}
}
func TestCache_DeleteRange_NoValues(t *testing.T) {
v0 := NewValue(1, 1.0)
v1 := NewValue(2, 2.0)
v2 := NewValue(3, 3.0)
values := Values{v0, v1, v2}
valuesSize := uint64(v0.Size() + v1.Size() + v2.Size())
c := NewCache(3*valuesSize, "")
if err := c.WriteMulti(map[string][]Value{"foo": values}); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
if n := c.Size(); n != valuesSize {
t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", 2*valuesSize, n)
}
if exp, keys := []string{"foo"}, c.Keys(); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys)
}
c.DeleteRange([]string{"foo"}, math.MinInt64, math.MaxInt64)
if exp, keys := 0, len(c.Keys()); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys)
}
if got, exp := c.Size(), uint64(0); exp != got {
t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", exp, got)
}
if got, exp := len(c.Values("foo")), 0; got != exp {
t.Fatalf("cache values mismatch: got %v, exp %v", got, exp)
}
}
func TestCache_Cache_Delete(t *testing.T) {
v0 := NewValue(1, 1.0)
v1 := NewValue(2, 2.0)
v2 := NewValue(3, 3.0)
values := Values{v0, v1, v2}
valuesSize := uint64(v0.Size() + v1.Size() + v2.Size())
c := NewCache(30*valuesSize, "")
if err := c.WriteMulti(map[string][]Value{"foo": values, "bar": values}); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
if n := c.Size(); n != 2*valuesSize {
t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", 2*valuesSize, n)
}
if exp, keys := []string{"bar", "foo"}, c.Keys(); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys)
}
c.Delete([]string{"bar"})
if exp, keys := []string{"foo"}, c.Keys(); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys)
}
if got, exp := c.Size(), valuesSize; exp != got {
t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", exp, got)
}
if got, exp := len(c.Values("bar")), 0; got != exp {
t.Fatalf("cache values mismatch: got %v, exp %v", got, exp)
}
if got, exp := len(c.Values("foo")), 3; got != exp {
t.Fatalf("cache values mismatch: got %v, exp %v", got, exp)
}
}
func TestCache_Cache_Delete_NonExistent(t *testing.T) {
c := NewCache(1024, "")
c.Delete([]string{"bar"})
if got, exp := c.Size(), uint64(0); exp != got {
t.Fatalf("cache size incorrect exp %d, got %d", exp, got)
}
}
// This tests writing two batches to the same series. The first batch
// is sorted. The second batch is also sorted but contains duplicates.
func TestCache_CacheWriteMulti_Duplicates(t *testing.T) {
v0 := NewValue(2, 1.0)
v1 := NewValue(3, 1.0)
values0 := Values{v0, v1}
v3 := NewValue(4, 2.0)
v4 := NewValue(5, 3.0)
v5 := NewValue(5, 3.0)
values1 := Values{v3, v4, v5}
c := NewCache(0, "")
if err := c.WriteMulti(map[string][]Value{"foo": values0}); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
if err := c.WriteMulti(map[string][]Value{"foo": values1}); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
if exp, keys := []string{"foo"}, c.Keys(); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys)
}
expAscValues := Values{v0, v1, v3, v5}
if exp, got := len(expAscValues), len(c.Values("foo")); exp != got {
t.Fatalf("value count mismatch: exp: %v, got %v", exp, got)
}
if deduped := c.Values("foo"); !reflect.DeepEqual(expAscValues, deduped) {
t.Fatalf("deduped ascending values for foo incorrect, exp: %v, got %v", expAscValues, deduped)
}
}
func TestCache_CacheValues(t *testing.T) {
v0 := NewValue(1, 0.0)
v1 := NewValue(2, 2.0)
v2 := NewValue(3, 3.0)
v3 := NewValue(1, 1.0)
v4 := NewValue(4, 4.0)
c := NewCache(512, "")
if deduped := c.Values("no such key"); deduped != nil {
t.Fatalf("Values returned for no such key")
}
if err := c.Write("foo", Values{v0, v1, v2, v3}); err != nil {
t.Fatalf("failed to write 3 values, key foo to cache: %s", err.Error())
}
if err := c.Write("foo", Values{v4}); err != nil {
t.Fatalf("failed to write 1 value, key foo to cache: %s", err.Error())
}
expAscValues := Values{v3, v1, v2, v4}
if deduped := c.Values("foo"); !reflect.DeepEqual(expAscValues, deduped) {
t.Fatalf("deduped ascending values for foo incorrect, exp: %v, got %v", expAscValues, deduped)
}
}
func TestCache_CacheSnapshot(t *testing.T) {
v0 := NewValue(2, 0.0)
v1 := NewValue(3, 2.0)
v2 := NewValue(4, 3.0)
v3 := NewValue(5, 4.0)
v4 := NewValue(6, 5.0)
v5 := NewValue(1, 5.0)
v6 := NewValue(7, 5.0)
v7 := NewValue(2, 5.0)
c := NewCache(512, "")
if err := c.Write("foo", Values{v0, v1, v2, v3}); err != nil {
t.Fatalf("failed to write 3 values, key foo to cache: %s", err.Error())
}
// Grab snapshot, and ensure it's as expected.
snapshot, err := c.Snapshot()
if err != nil {
t.Fatalf("failed to snapshot cache: %v", err)
}
expValues := Values{v0, v1, v2, v3}
if deduped := snapshot.values("foo"); !reflect.DeepEqual(expValues, deduped) {
t.Fatalf("snapshotted values for foo incorrect, exp: %v, got %v", expValues, deduped)
}
// Ensure cache is still as expected.
if deduped := c.Values("foo"); !reflect.DeepEqual(expValues, deduped) {
t.Fatalf("post-snapshot values for foo incorrect, exp: %v, got %v", expValues, deduped)
}
// Write a new value to the cache.
if err := c.Write("foo", Values{v4}); err != nil {
t.Fatalf("failed to write post-snap value, key foo to cache: %s", err.Error())
}
expValues = Values{v0, v1, v2, v3, v4}
if deduped := c.Values("foo"); !reflect.DeepEqual(expValues, deduped) {
t.Fatalf("post-snapshot write values for foo incorrect, exp: %v, got %v", expValues, deduped)
}
// Write a new, out-of-order, value to the cache.
if err := c.Write("foo", Values{v5}); err != nil {
t.Fatalf("failed to write post-snap value, key foo to cache: %s", err.Error())
}
expValues = Values{v5, v0, v1, v2, v3, v4}
if deduped := c.Values("foo"); !reflect.DeepEqual(expValues, deduped) {
t.Fatalf("post-snapshot out-of-order write values for foo incorrect, exp: %v, got %v", expValues, deduped)
}
// Clear snapshot, ensuring non-snapshot data untouched.
c.ClearSnapshot(true)
expValues = Values{v5, v4}
if deduped := c.Values("foo"); !reflect.DeepEqual(expValues, deduped) {
t.Fatalf("post-clear values for foo incorrect, exp: %v, got %v", expValues, deduped)
}
// Create another snapshot
snapshot, err = c.Snapshot()
if err != nil {
t.Fatalf("failed to snapshot cache: %v", err)
}
if err := c.Write("foo", Values{v4, v5}); err != nil {
t.Fatalf("failed to write post-snap value, key foo to cache: %s", err.Error())
}
c.ClearSnapshot(true)
snapshot, err = c.Snapshot()
if err != nil {
t.Fatalf("failed to snapshot cache: %v", err)
}
if err := c.Write("foo", Values{v6, v7}); err != nil {
t.Fatalf("failed to write post-snap value, key foo to cache: %s", err.Error())
}
expValues = Values{v5, v7, v4, v6}
if deduped := c.Values("foo"); !reflect.DeepEqual(expValues, deduped) {
t.Fatalf("post-snapshot out-of-order write values for foo incorrect, exp: %v, got %v", expValues, deduped)
}
}
// Tests that Snapshot updates statistics correctly.
func TestCache_Snapshot_Stats(t *testing.T) {
limit := uint64(16)
c := NewCache(limit, "")
values := map[string][]Value{"foo": []Value{NewValue(1, 1.0)}}
if err := c.WriteMulti(values); err != nil {
t.Fatal(err)
}
_, err := c.Snapshot()
if err != nil {
t.Fatal(err)
}
// Store size should have been reset.
if got, exp := c.Size(), uint64(16); got != exp {
t.Fatalf("got %v, expected %v", got, exp)
}
// Cached bytes should have been increased.
if got, exp := c.stats.CachedBytes, int64(16); got != exp {
t.Fatalf("got %v, expected %v", got, exp)
}
}
func TestCache_CacheEmptySnapshot(t *testing.T) {
c := NewCache(512, "")
// Grab snapshot, and ensure it's as expected.
snapshot, err := c.Snapshot()
if err != nil {
t.Fatalf("failed to snapshot cache: %v", err)
}
if deduped := snapshot.values("foo"); !reflect.DeepEqual(Values(nil), deduped) {
t.Fatalf("snapshotted values for foo incorrect, exp: %v, got %v", nil, deduped)
}
// Ensure cache is still as expected.
if deduped := c.Values("foo"); !reflect.DeepEqual(Values(nil), deduped) {
t.Fatalf("post-snapshotted values for foo incorrect, exp: %v, got %v", Values(nil), deduped)
}
// Clear snapshot.
c.ClearSnapshot(true)
if deduped := c.Values("foo"); !reflect.DeepEqual(Values(nil), deduped) {
t.Fatalf("post-snapshot-clear values for foo incorrect, exp: %v, got %v", Values(nil), deduped)
}
}
func TestCache_CacheWriteMemoryExceeded(t *testing.T) {
v0 := NewValue(1, 1.0)
v1 := NewValue(2, 2.0)
c := NewCache(uint64(v1.Size()), "")
if err := c.Write("foo", Values{v0}); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
if exp, keys := []string{"foo"}, c.Keys(); !reflect.DeepEqual(keys, exp) {
t.Fatalf("cache keys incorrect after writes, exp %v, got %v", exp, keys)
}
if err := c.Write("bar", Values{v1}); err == nil || !strings.Contains(err.Error(), "cache-max-memory-size") {
t.Fatalf("wrong error writing key bar to cache: %v", err)
}
// Grab snapshot, write should still fail since we're still using the memory.
_, err := c.Snapshot()
if err != nil {
t.Fatalf("failed to snapshot cache: %v", err)
}
if err := c.Write("bar", Values{v1}); err == nil || !strings.Contains(err.Error(), "cache-max-memory-size") {
t.Fatalf("wrong error writing key bar to cache: %v", err)
}
// Clear the snapshot and the write should now succeed.
c.ClearSnapshot(true)
if err := c.Write("bar", Values{v1}); err != nil {
t.Fatalf("failed to write key foo to cache: %s", err.Error())
}
expAscValues := Values{v1}
if deduped := c.Values("bar"); !reflect.DeepEqual(expAscValues, deduped) {
t.Fatalf("deduped ascending values for bar incorrect, exp: %v, got %v", expAscValues, deduped)
}
}
func TestCache_Deduplicate_Concurrent(t *testing.T) {
if testing.Short() || os.Getenv("GORACE") != "" || os.Getenv("APPVEYOR") != "" {
t.Skip("Skipping test in short, race, appveyor mode.")
}
values := make(map[string][]Value)
for i := 0; i < 1000; i++ {
for j := 0; j < 100; j++ {
values[fmt.Sprintf("cpu%d", i)] = []Value{NewValue(int64(i+j)+int64(rand.Intn(10)), float64(i))}
}
}
wg := sync.WaitGroup{}
c := NewCache(1000000, "")
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < 1000; i++ {
c.WriteMulti(values)
}
}()
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < 1000; i++ {
c.Deduplicate()
}
}()
wg.Wait()
}
// Ensure the CacheLoader can correctly load from a single segment, even if it's corrupted.
func TestCacheLoader_LoadSingle(t *testing.T) {
// Create a WAL segment.
dir := mustTempDir()
defer os.RemoveAll(dir)
f := mustTempFile(dir)
w := NewWALSegmentWriter(f)
p1 := NewValue(1, 1.1)
p2 := NewValue(1, int64(1))
p3 := NewValue(1, true)
values := map[string][]Value{
"foo": []Value{p1},
"bar": []Value{p2},
"baz": []Value{p3},
}
entry := &WriteWALEntry{
Values: values,
}
if err := w.Write(mustMarshalEntry(entry)); err != nil {
t.Fatal("write points", err)
}
if err := w.Flush(); err != nil {
t.Fatalf("flush error: %v", err)
}
// Load the cache using the segment.
cache := NewCache(1024, "")
loader := NewCacheLoader([]string{f.Name()})
if err := loader.Load(cache); err != nil {
t.Fatalf("failed to load cache: %s", err.Error())
}
// Check the cache.
if values := cache.Values("foo"); !reflect.DeepEqual(values, Values{p1}) {
t.Fatalf("cache key foo not as expected, got %v, exp %v", values, Values{p1})
}
if values := cache.Values("bar"); !reflect.DeepEqual(values, Values{p2}) {
t.Fatalf("cache key foo not as expected, got %v, exp %v", values, Values{p2})
}
if values := cache.Values("baz"); !reflect.DeepEqual(values, Values{p3}) {
t.Fatalf("cache key foo not as expected, got %v, exp %v", values, Values{p3})
}
// Corrupt the WAL segment.
if _, err := f.Write([]byte{1, 4, 0, 0, 0}); err != nil {
t.Fatalf("corrupt WAL segment: %s", err.Error())
}
// Reload the cache using the segment.
cache = NewCache(1024, "")
loader = NewCacheLoader([]string{f.Name()})
if err := loader.Load(cache); err != nil {
t.Fatalf("failed to load cache: %s", err.Error())
}
// Check the cache.
if values := cache.Values("foo"); !reflect.DeepEqual(values, Values{p1}) {
t.Fatalf("cache key foo not as expected, got %v, exp %v", values, Values{p1})
}
if values := cache.Values("bar"); !reflect.DeepEqual(values, Values{p2}) {
t.Fatalf("cache key bar not as expected, got %v, exp %v", values, Values{p2})
}
if values := cache.Values("baz"); !reflect.DeepEqual(values, Values{p3}) {
t.Fatalf("cache key baz not as expected, got %v, exp %v", values, Values{p3})
}
}
// Ensure the CacheLoader can correctly load from two segments, even if one is corrupted.
func TestCacheLoader_LoadDouble(t *testing.T) {
// Create a WAL segment.
dir := mustTempDir()
defer os.RemoveAll(dir)
f1, f2 := mustTempFile(dir), mustTempFile(dir)
w1, w2 := NewWALSegmentWriter(f1), NewWALSegmentWriter(f2)
p1 := NewValue(1, 1.1)
p2 := NewValue(1, int64(1))
p3 := NewValue(1, true)
p4 := NewValue(1, "string")
// Write first and second segment.
segmentWrite := func(w *WALSegmentWriter, values map[string][]Value) {
entry := &WriteWALEntry{
Values: values,
}
if err := w.Write(mustMarshalEntry(entry)); err != nil {
t.Fatal("write points", err)
}
if err := w.Flush(); err != nil {
t.Fatalf("flush error: %v", err)
}
}
values := map[string][]Value{
"foo": []Value{p1},
"bar": []Value{p2},
}
segmentWrite(w1, values)
values = map[string][]Value{
"baz": []Value{p3},
"qux": []Value{p4},
}
segmentWrite(w2, values)
// Corrupt the first WAL segment.
if _, err := f1.Write([]byte{1, 4, 0, 0, 0}); err != nil {
t.Fatalf("corrupt WAL segment: %s", err.Error())
}
// Load the cache using the segments.
cache := NewCache(1024, "")
loader := NewCacheLoader([]string{f1.Name(), f2.Name()})
if err := loader.Load(cache); err != nil {
t.Fatalf("failed to load cache: %s", err.Error())
}
// Check the cache.
if values := cache.Values("foo"); !reflect.DeepEqual(values, Values{p1}) {
t.Fatalf("cache key foo not as expected, got %v, exp %v", values, Values{p1})
}
if values := cache.Values("bar"); !reflect.DeepEqual(values, Values{p2}) {
t.Fatalf("cache key bar not as expected, got %v, exp %v", values, Values{p2})
}
if values := cache.Values("baz"); !reflect.DeepEqual(values, Values{p3}) {
t.Fatalf("cache key baz not as expected, got %v, exp %v", values, Values{p3})
}
if values := cache.Values("qux"); !reflect.DeepEqual(values, Values{p4}) {
t.Fatalf("cache key qux not as expected, got %v, exp %v", values, Values{p4})
}
}
// Ensure the CacheLoader can load deleted series
func TestCacheLoader_LoadDeleted(t *testing.T) {
// Create a WAL segment.
dir := mustTempDir()
defer os.RemoveAll(dir)
f := mustTempFile(dir)
w := NewWALSegmentWriter(f)
p1 := NewValue(1, 1.0)
p2 := NewValue(2, 2.0)
p3 := NewValue(3, 3.0)
values := map[string][]Value{
"foo": []Value{p1, p2, p3},
}
entry := &WriteWALEntry{
Values: values,
}
if err := w.Write(mustMarshalEntry(entry)); err != nil {
t.Fatal("write points", err)
}
if err := w.Flush(); err != nil {
t.Fatalf("flush error: %v", err)
}
dentry := &DeleteRangeWALEntry{
Keys: []string{"foo"},
Min: 2,
Max: 3,
}
if err := w.Write(mustMarshalEntry(dentry)); err != nil {
t.Fatal("write points", err)
}
if err := w.Flush(); err != nil {
t.Fatalf("flush error: %v", err)
}
// Load the cache using the segment.
cache := NewCache(1024, "")
loader := NewCacheLoader([]string{f.Name()})
if err := loader.Load(cache); err != nil {
t.Fatalf("failed to load cache: %s", err.Error())
}
// Check the cache.
if values := cache.Values("foo"); !reflect.DeepEqual(values, Values{p1}) {
t.Fatalf("cache key foo not as expected, got %v, exp %v", values, Values{p1})
}
// Reload the cache using the segment.
cache = NewCache(1024, "")
loader = NewCacheLoader([]string{f.Name()})
if err := loader.Load(cache); err != nil {
t.Fatalf("failed to load cache: %s", err.Error())
}
// Check the cache.
if values := cache.Values("foo"); !reflect.DeepEqual(values, Values{p1}) {
t.Fatalf("cache key foo not as expected, got %v, exp %v", values, Values{p1})
}
}
func mustTempDir() string {
dir, err := ioutil.TempDir("", "tsm1-test")
if err != nil {
panic(fmt.Sprintf("failed to create temp dir: %v", err))
}
return dir
}
func mustTempFile(dir string) *os.File {
f, err := ioutil.TempFile(dir, "tsm1test")
if err != nil {
panic(fmt.Sprintf("failed to create temp file: %v", err))
}
return f
}
func mustMarshalEntry(entry WALEntry) (WalEntryType, []byte) {
bytes := make([]byte, 1024<<2)
b, err := entry.Encode(bytes)
if err != nil {
panic(fmt.Sprintf("error encoding: %v", err))
}
return entry.Type(), snappy.Encode(b, b)
}
// TestStore implements the storer interface and can be used to mock out a
// Cache's storer implementation.
type TestStore struct {
entryf func(key string) (*entry, bool)
writef func(key string, values Values) error
addf func(key string, entry *entry)
removef func(key string)
keysf func(sorted bool) []string
applyf func(f func(string, *entry) error) error
applySerialf func(f func(string, *entry) error) error
resetf func()
}
func NewTestStore() *TestStore { return &TestStore{} }
func (s *TestStore) entry(key string) (*entry, bool) { return s.entryf(key) }
func (s *TestStore) write(key string, values Values) error { return s.writef(key, values) }
func (s *TestStore) add(key string, entry *entry) { s.addf(key, entry) }
func (s *TestStore) remove(key string) { s.removef(key) }
func (s *TestStore) keys(sorted bool) []string { return s.keysf(sorted) }
func (s *TestStore) apply(f func(string, *entry) error) error { return s.applyf(f) }
func (s *TestStore) applySerial(f func(string, *entry) error) error { return s.applySerialf(f) }
func (s *TestStore) reset() { s.resetf() }
var fvSize = uint64(NewValue(1, float64(1)).Size())
func BenchmarkCacheFloatEntries(b *testing.B) {
cache := NewCache(uint64(b.N)*fvSize, "")
vals := make([][]Value, b.N)
for i := 0; i < b.N; i++ {
vals[i] = []Value{NewValue(1, float64(i))}
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
if err := cache.Write("test", vals[i]); err != nil {
b.Fatal("err:", err, "i:", i, "N:", b.N)
}
}
}
type points struct {
key string
vals []Value
}
func BenchmarkCacheParallelFloatEntries(b *testing.B) {
c := b.N * runtime.GOMAXPROCS(0)
cache := NewCache(uint64(c)*fvSize*10, "")
vals := make([]points, c)
for i := 0; i < c; i++ {
v := make([]Value, 10)
for j := 0; j < 10; j++ {
v[j] = NewValue(1, float64(i+j))
}
vals[i] = points{key: fmt.Sprintf("cpu%v", rand.Intn(20)), vals: v}
}
i := int32(-1)
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
j := atomic.AddInt32(&i, 1)
v := vals[j]
if err := cache.Write(v.key, v.vals); err != nil {
b.Fatal("err:", err, "j:", j, "N:", b.N)
}
}
})
}
func BenchmarkEntry_add(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
b.StopTimer()
values := make([]Value, 10)
for i := 0; i < 10; i++ {
values[i] = NewValue(int64(i+1), float64(i))
}
otherValues := make([]Value, 10)
for i := 0; i < 10; i++ {
otherValues[i] = NewValue(1, float64(i))
}
entry, err := newEntryValues(values, 0) // Will use default allocation size.
if err != nil {
b.Fatal(err)
}
b.StartTimer()
if err := entry.add(otherValues); err != nil {
b.Fatal(err)
}
}
})
}
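// The benchmarks above run with the standard Go tooling, e.g. (illustrative
// invocation; the package path and bench filter depend on your checkout):
//
//	go test -run XXX -bench 'BenchmarkCache|BenchmarkEntry' -benchmem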

View File

@@ -0,0 +1,867 @@
// Generated by tmpl
// https://github.com/benbjohnson/tmpl
//
// DO NOT EDIT!
// Source: compact.gen.go.tmpl
package tsm1
import (
"runtime"
)
// merge combines the next set of blocks into merged blocks.
func (k *tsmKeyIterator) mergeFloat() {
// No blocks left and no pending merged values; we're done
if len(k.blocks) == 0 && len(k.merged) == 0 && len(k.mergedFloatValues) == 0 {
return
}
dedup := len(k.mergedFloatValues) != 0
if len(k.blocks) > 0 && !dedup {
// If we have more than one block or any partially tombstoned blocks, we may need to dedup
dedup = len(k.blocks[0].tombstones) > 0 || k.blocks[0].partiallyRead()
// Quickly scan each block to see if any overlap with the prior block, if they overlap then
// we need to dedup as there may be duplicate points now
for i := 1; !dedup && i < len(k.blocks); i++ {
if k.blocks[i].partiallyRead() {
dedup = true
break
}
if k.blocks[i].minTime <= k.blocks[i-1].maxTime || len(k.blocks[i].tombstones) > 0 {
dedup = true
break
}
}
}
k.merged = k.combineFloat(dedup)
}
// combine returns a new set of blocks using the current blocks in the buffers. If dedup
// is true, all the blocks will be decoded, deduplicated and sorted in order. If dedup is false,
// only blocks that are smaller than the chunk size will be decoded and combined.
func (k *tsmKeyIterator) combineFloat(dedup bool) blocks {
if dedup {
for len(k.mergedFloatValues) < k.size && len(k.blocks) > 0 {
for len(k.blocks) > 0 && k.blocks[0].read() {
k.blocks = k.blocks[1:]
}
if len(k.blocks) == 0 {
break
}
first := k.blocks[0]
minTime := first.minTime
maxTime := first.maxTime
// Adjust the min and max times to account for any overlapping blocks.
for i := 0; i < len(k.blocks); i++ {
if k.blocks[i].overlapsTimeRange(minTime, maxTime) && !k.blocks[i].read() {
if k.blocks[i].minTime < minTime {
minTime = k.blocks[i].minTime
}
if k.blocks[i].maxTime > minTime && k.blocks[i].maxTime < maxTime {
maxTime = k.blocks[i].maxTime
}
}
}
// We have some overlapping blocks so decode all, append in order and then dedup
for i := 0; i < len(k.blocks); i++ {
if !k.blocks[i].overlapsTimeRange(minTime, maxTime) || k.blocks[i].read() {
continue
}
v, err := DecodeFloatBlock(k.blocks[i].b, &[]FloatValue{})
if err != nil {
k.err = err
return nil
}
// Remove values we already read
v = FloatValues(v).Exclude(k.blocks[i].readMin, k.blocks[i].readMax)
// Keep only the values within the overlapping time range
v = FloatValues(v).Include(minTime, maxTime)
if len(v) > 0 {
// Record that we read a subset of the block
k.blocks[i].markRead(v[0].UnixNano(), v[len(v)-1].UnixNano())
}
// Apply each tombstone to the block
for _, ts := range k.blocks[i].tombstones {
v = FloatValues(v).Exclude(ts.Min, ts.Max)
}
k.mergedFloatValues = k.mergedFloatValues.Merge(v)
// Allow other goroutines to run
runtime.Gosched()
}
}
// Since we combined multiple blocks, we could have more values than we should put into
// a single block. We need to chunk them up into groups and re-encode them.
return k.chunkFloat(nil)
} else {
var chunked blocks
var i int
for i < len(k.blocks) {
// skip this block if its values were already read
if k.blocks[i].read() {
i++
continue
}
// If this block is already full, just add it as is
if BlockCount(k.blocks[i].b) >= k.size {
chunked = append(chunked, k.blocks[i])
} else {
break
}
i++
// Allow other goroutines to run
runtime.Gosched()
}
if k.fast {
for i < len(k.blocks) {
// skip this block if its values were already read
if k.blocks[i].read() {
i++
continue
}
chunked = append(chunked, k.blocks[i])
i++
// Allow other goroutines to run
runtime.Gosched()
}
}
// If we only have one block left, just append it as is and avoid decoding/re-encoding
if i == len(k.blocks)-1 {
if !k.blocks[i].read() {
chunked = append(chunked, k.blocks[i])
}
i++
}
// The remaining blocks can be combined and we know that they do not overlap and
// so we can just append each, sort and re-encode.
for i < len(k.blocks) && len(k.mergedFloatValues) < k.size {
if k.blocks[i].read() {
i++
continue
}
v, err := DecodeFloatBlock(k.blocks[i].b, &[]FloatValue{})
if err != nil {
k.err = err
return nil
}
// Apply each tombstone to the block
for _, ts := range k.blocks[i].tombstones {
v = FloatValues(v).Exclude(ts.Min, ts.Max)
}
k.blocks[i].markRead(k.blocks[i].minTime, k.blocks[i].maxTime)
k.mergedFloatValues = k.mergedFloatValues.Merge(v)
i++
// Allow other goroutines to run
runtime.Gosched()
}
k.blocks = k.blocks[i:]
return k.chunkFloat(chunked)
}
}
func (k *tsmKeyIterator) chunkFloat(dst blocks) blocks {
if len(k.mergedFloatValues) > k.size {
values := k.mergedFloatValues[:k.size]
cb, err := FloatValues(values).Encode(nil)
if err != nil {
k.err = err
return nil
}
dst = append(dst, &block{
minTime: values[0].UnixNano(),
maxTime: values[len(values)-1].UnixNano(),
key: k.key,
b: cb,
})
k.mergedFloatValues = k.mergedFloatValues[k.size:]
return dst
}
// Re-encode the remaining values into the last block
if len(k.mergedFloatValues) > 0 {
cb, err := FloatValues(k.mergedFloatValues).Encode(nil)
if err != nil {
k.err = err
return nil
}
dst = append(dst, &block{
minTime: k.mergedFloatValues[0].UnixNano(),
maxTime: k.mergedFloatValues[len(k.mergedFloatValues)-1].UnixNano(),
key: k.key,
b: cb,
})
k.mergedFloatValues = k.mergedFloatValues[:0]
}
return dst
}
// merge combines the next set of blocks into merged blocks.
func (k *tsmKeyIterator) mergeInteger() {
// No blocks left and no pending merged values; we're done
if len(k.blocks) == 0 && len(k.merged) == 0 && len(k.mergedIntegerValues) == 0 {
return
}
dedup := len(k.mergedIntegerValues) != 0
if len(k.blocks) > 0 && !dedup {
// If we have more than one block or any partially tombstoned blocks, we may need to dedup
dedup = len(k.blocks[0].tombstones) > 0 || k.blocks[0].partiallyRead()
// Quickly scan each block to see if any overlap with the prior block, if they overlap then
// we need to dedup as there may be duplicate points now
for i := 1; !dedup && i < len(k.blocks); i++ {
if k.blocks[i].partiallyRead() {
dedup = true
break
}
if k.blocks[i].minTime <= k.blocks[i-1].maxTime || len(k.blocks[i].tombstones) > 0 {
dedup = true
break
}
}
}
k.merged = k.combineInteger(dedup)
}
// combine returns a new set of blocks using the current blocks in the buffers. If dedup
// is true, all the blocks will be decoded, deduplicated and sorted in order. If dedup is false,
// only blocks that are smaller than the chunk size will be decoded and combined.
func (k *tsmKeyIterator) combineInteger(dedup bool) blocks {
if dedup {
for len(k.mergedIntegerValues) < k.size && len(k.blocks) > 0 {
for len(k.blocks) > 0 && k.blocks[0].read() {
k.blocks = k.blocks[1:]
}
if len(k.blocks) == 0 {
break
}
first := k.blocks[0]
minTime := first.minTime
maxTime := first.maxTime
// Adjust the min and max times to account for any overlapping blocks.
for i := 0; i < len(k.blocks); i++ {
if k.blocks[i].overlapsTimeRange(minTime, maxTime) && !k.blocks[i].read() {
if k.blocks[i].minTime < minTime {
minTime = k.blocks[i].minTime
}
if k.blocks[i].maxTime > minTime && k.blocks[i].maxTime < maxTime {
maxTime = k.blocks[i].maxTime
}
}
}
// We have some overlapping blocks so decode all, append in order and then dedup
for i := 0; i < len(k.blocks); i++ {
if !k.blocks[i].overlapsTimeRange(minTime, maxTime) || k.blocks[i].read() {
continue
}
v, err := DecodeIntegerBlock(k.blocks[i].b, &[]IntegerValue{})
if err != nil {
k.err = err
return nil
}
// Remove values we already read
v = IntegerValues(v).Exclude(k.blocks[i].readMin, k.blocks[i].readMax)
// Keep only the values within the overlapping time range
v = IntegerValues(v).Include(minTime, maxTime)
if len(v) > 0 {
// Record that we read a subset of the block
k.blocks[i].markRead(v[0].UnixNano(), v[len(v)-1].UnixNano())
}
// Apply each tombstone to the block
for _, ts := range k.blocks[i].tombstones {
v = IntegerValues(v).Exclude(ts.Min, ts.Max)
}
k.mergedIntegerValues = k.mergedIntegerValues.Merge(v)
// Allow other goroutines to run
runtime.Gosched()
}
}
// Since we combined multiple blocks, we could have more values than we should put into
// a single block. We need to chunk them up into groups and re-encode them.
return k.chunkInteger(nil)
} else {
var chunked blocks
var i int
for i < len(k.blocks) {
// skip this block if its values were already read
if k.blocks[i].read() {
i++
continue
}
// If this block is already full, just add it as is
if BlockCount(k.blocks[i].b) >= k.size {
chunked = append(chunked, k.blocks[i])
} else {
break
}
i++
// Allow other goroutines to run
runtime.Gosched()
}
if k.fast {
for i < len(k.blocks) {
// skip this block if its values were already read
if k.blocks[i].read() {
i++
continue
}
chunked = append(chunked, k.blocks[i])
i++
// Allow other goroutines to run
runtime.Gosched()
}
}
// If we only have one block left, just append it as is and avoid decoding/re-encoding
if i == len(k.blocks)-1 {
if !k.blocks[i].read() {
chunked = append(chunked, k.blocks[i])
}
i++
}
// The remaining blocks can be combined and we know that they do not overlap and
// so we can just append each, sort and re-encode.
for i < len(k.blocks) && len(k.mergedIntegerValues) < k.size {
if k.blocks[i].read() {
i++
continue
}
v, err := DecodeIntegerBlock(k.blocks[i].b, &[]IntegerValue{})
if err != nil {
k.err = err
return nil
}
// Apply each tombstone to the block
for _, ts := range k.blocks[i].tombstones {
v = IntegerValues(v).Exclude(ts.Min, ts.Max)
}
k.blocks[i].markRead(k.blocks[i].minTime, k.blocks[i].maxTime)
k.mergedIntegerValues = k.mergedIntegerValues.Merge(v)
i++
// Allow other goroutines to run
runtime.Gosched()
}
k.blocks = k.blocks[i:]
return k.chunkInteger(chunked)
}
}
func (k *tsmKeyIterator) chunkInteger(dst blocks) blocks {
if len(k.mergedIntegerValues) > k.size {
values := k.mergedIntegerValues[:k.size]
cb, err := IntegerValues(values).Encode(nil)
if err != nil {
k.err = err
return nil
}
dst = append(dst, &block{
minTime: values[0].UnixNano(),
maxTime: values[len(values)-1].UnixNano(),
key: k.key,
b: cb,
})
k.mergedIntegerValues = k.mergedIntegerValues[k.size:]
return dst
}
// Re-encode the remaining values into the last block
if len(k.mergedIntegerValues) > 0 {
cb, err := IntegerValues(k.mergedIntegerValues).Encode(nil)
if err != nil {
k.err = err
return nil
}
dst = append(dst, &block{
minTime: k.mergedIntegerValues[0].UnixNano(),
maxTime: k.mergedIntegerValues[len(k.mergedIntegerValues)-1].UnixNano(),
key: k.key,
b: cb,
})
k.mergedIntegerValues = k.mergedIntegerValues[:0]
}
return dst
}
// merge combines the next set of blocks into merged blocks.
func (k *tsmKeyIterator) mergeString() {
// No blocks left and no pending merged values; we're done
if len(k.blocks) == 0 && len(k.merged) == 0 && len(k.mergedStringValues) == 0 {
return
}
dedup := len(k.mergedStringValues) != 0
if len(k.blocks) > 0 && !dedup {
// If we have more than one block or any partially tombstoned blocks, we may need to dedup
dedup = len(k.blocks[0].tombstones) > 0 || k.blocks[0].partiallyRead()
// Quickly scan each block to see if any overlap with the prior block, if they overlap then
// we need to dedup as there may be duplicate points now
for i := 1; !dedup && i < len(k.blocks); i++ {
if k.blocks[i].partiallyRead() {
dedup = true
break
}
if k.blocks[i].minTime <= k.blocks[i-1].maxTime || len(k.blocks[i].tombstones) > 0 {
dedup = true
break
}
}
}
k.merged = k.combineString(dedup)
}
// combine returns a new set of blocks using the current blocks in the buffers. If dedup
// is true, all the blocks will be decoded, deduplicated and sorted in order. If dedup is false,
// only blocks that are smaller than the chunk size will be decoded and combined.
func (k *tsmKeyIterator) combineString(dedup bool) blocks {
if dedup {
for len(k.mergedStringValues) < k.size && len(k.blocks) > 0 {
for len(k.blocks) > 0 && k.blocks[0].read() {
k.blocks = k.blocks[1:]
}
if len(k.blocks) == 0 {
break
}
first := k.blocks[0]
minTime := first.minTime
maxTime := first.maxTime
// Adjust the min and max times to account for any overlapping blocks.
for i := 0; i < len(k.blocks); i++ {
if k.blocks[i].overlapsTimeRange(minTime, maxTime) && !k.blocks[i].read() {
if k.blocks[i].minTime < minTime {
minTime = k.blocks[i].minTime
}
if k.blocks[i].maxTime > minTime && k.blocks[i].maxTime < maxTime {
maxTime = k.blocks[i].maxTime
}
}
}
// We have some overlapping blocks so decode all, append in order and then dedup
for i := 0; i < len(k.blocks); i++ {
if !k.blocks[i].overlapsTimeRange(minTime, maxTime) || k.blocks[i].read() {
continue
}
v, err := DecodeStringBlock(k.blocks[i].b, &[]StringValue{})
if err != nil {
k.err = err
return nil
}
// Remove values we already read
v = StringValues(v).Exclude(k.blocks[i].readMin, k.blocks[i].readMax)
// Keep only the values within the overlapping time range
v = StringValues(v).Include(minTime, maxTime)
if len(v) > 0 {
// Record that we read a subset of the block
k.blocks[i].markRead(v[0].UnixNano(), v[len(v)-1].UnixNano())
}
// Apply each tombstone to the block
for _, ts := range k.blocks[i].tombstones {
v = StringValues(v).Exclude(ts.Min, ts.Max)
}
k.mergedStringValues = k.mergedStringValues.Merge(v)
// Allow other goroutines to run
runtime.Gosched()
}
}
// Since we combined multiple blocks, we could have more values than we should put into
// a single block. We need to chunk them up into groups and re-encode them.
return k.chunkString(nil)
} else {
var chunked blocks
var i int
for i < len(k.blocks) {
// skip this block if its values were already read
if k.blocks[i].read() {
i++
continue
}
// If this block is already full, just add it as is
if BlockCount(k.blocks[i].b) >= k.size {
chunked = append(chunked, k.blocks[i])
} else {
break
}
i++
// Allow other goroutines to run
runtime.Gosched()
}
if k.fast {
for i < len(k.blocks) {
// skip this block if its values were already read
if k.blocks[i].read() {
i++
continue
}
chunked = append(chunked, k.blocks[i])
i++
// Allow other goroutines to run
runtime.Gosched()
}
}
// If we only have one block left, just append it as is and avoid decoding/re-encoding
if i == len(k.blocks)-1 {
if !k.blocks[i].read() {
chunked = append(chunked, k.blocks[i])
}
i++
}
// The remaining blocks can be combined and we know that they do not overlap and
// so we can just append each, sort and re-encode.
for i < len(k.blocks) && len(k.mergedStringValues) < k.size {
if k.blocks[i].read() {
i++
continue
}
v, err := DecodeStringBlock(k.blocks[i].b, &[]StringValue{})
if err != nil {
k.err = err
return nil
}
// Apply each tombstone to the block
for _, ts := range k.blocks[i].tombstones {
v = StringValues(v).Exclude(ts.Min, ts.Max)
}
k.blocks[i].markRead(k.blocks[i].minTime, k.blocks[i].maxTime)
k.mergedStringValues = k.mergedStringValues.Merge(v)
i++
// Allow other goroutines to run
runtime.Gosched()
}
k.blocks = k.blocks[i:]
return k.chunkString(chunked)
}
}
func (k *tsmKeyIterator) chunkString(dst blocks) blocks {
if len(k.mergedStringValues) > k.size {
values := k.mergedStringValues[:k.size]
cb, err := StringValues(values).Encode(nil)
if err != nil {
k.err = err
return nil
}
dst = append(dst, &block{
minTime: values[0].UnixNano(),
maxTime: values[len(values)-1].UnixNano(),
key: k.key,
b: cb,
})
k.mergedStringValues = k.mergedStringValues[k.size:]
return dst
}
// Re-encode the remaining values into the last block
if len(k.mergedStringValues) > 0 {
cb, err := StringValues(k.mergedStringValues).Encode(nil)
if err != nil {
k.err = err
return nil
}
dst = append(dst, &block{
minTime: k.mergedStringValues[0].UnixNano(),
maxTime: k.mergedStringValues[len(k.mergedStringValues)-1].UnixNano(),
key: k.key,
b: cb,
})
k.mergedStringValues = k.mergedStringValues[:0]
}
return dst
}
// merge combines the next set of blocks into merged blocks.
func (k *tsmKeyIterator) mergeBoolean() {
// No blocks left and no pending merged values; we're done
if len(k.blocks) == 0 && len(k.merged) == 0 && len(k.mergedBooleanValues) == 0 {
return
}
dedup := len(k.mergedBooleanValues) != 0
if len(k.blocks) > 0 && !dedup {
// If we have more than one block or any partially tombstoned blocks, we may need to dedup
dedup = len(k.blocks[0].tombstones) > 0 || k.blocks[0].partiallyRead()
// Quickly scan each block to see if any overlap with the prior block, if they overlap then
// we need to dedup as there may be duplicate points now
for i := 1; !dedup && i < len(k.blocks); i++ {
if k.blocks[i].partiallyRead() {
dedup = true
break
}
if k.blocks[i].minTime <= k.blocks[i-1].maxTime || len(k.blocks[i].tombstones) > 0 {
dedup = true
break
}
}
}
k.merged = k.combineBoolean(dedup)
}
// combine returns a new set of blocks using the current blocks in the buffers. If dedup
// is true, all the blocks will be decoded, deduplicated and sorted in order. If dedup is false,
// only blocks that are smaller than the chunk size will be decoded and combined.
func (k *tsmKeyIterator) combineBoolean(dedup bool) blocks {
if dedup {
for len(k.mergedBooleanValues) < k.size && len(k.blocks) > 0 {
for len(k.blocks) > 0 && k.blocks[0].read() {
k.blocks = k.blocks[1:]
}
if len(k.blocks) == 0 {
break
}
first := k.blocks[0]
minTime := first.minTime
maxTime := first.maxTime
// Adjust the min and max times to account for any overlapping blocks.
for i := 0; i < len(k.blocks); i++ {
if k.blocks[i].overlapsTimeRange(minTime, maxTime) && !k.blocks[i].read() {
if k.blocks[i].minTime < minTime {
minTime = k.blocks[i].minTime
}
if k.blocks[i].maxTime > minTime && k.blocks[i].maxTime < maxTime {
maxTime = k.blocks[i].maxTime
}
}
}
// We have some overlapping blocks so decode all, append in order and then dedup
for i := 0; i < len(k.blocks); i++ {
if !k.blocks[i].overlapsTimeRange(minTime, maxTime) || k.blocks[i].read() {
continue
}
v, err := DecodeBooleanBlock(k.blocks[i].b, &[]BooleanValue{})
if err != nil {
k.err = err
return nil
}
// Remove values we already read
v = BooleanValues(v).Exclude(k.blocks[i].readMin, k.blocks[i].readMax)
// Keep only the values within the overlapping time range
v = BooleanValues(v).Include(minTime, maxTime)
if len(v) > 0 {
// Record that we read a subset of the block
k.blocks[i].markRead(v[0].UnixNano(), v[len(v)-1].UnixNano())
}
// Apply each tombstone to the block
for _, ts := range k.blocks[i].tombstones {
v = BooleanValues(v).Exclude(ts.Min, ts.Max)
}
k.mergedBooleanValues = k.mergedBooleanValues.Merge(v)
// Allow other goroutines to run
runtime.Gosched()
}
}
// Since we combined multiple blocks, we could have more values than we should put into
// a single block. We need to chunk them up into groups and re-encode them.
return k.chunkBoolean(nil)
} else {
var chunked blocks
var i int
for i < len(k.blocks) {
// skip this block if its values were already read
if k.blocks[i].read() {
i++
continue
}
// If this block is already full, just add it as is
if BlockCount(k.blocks[i].b) >= k.size {
chunked = append(chunked, k.blocks[i])
} else {
break
}
i++
// Allow other goroutines to run
runtime.Gosched()
}
if k.fast {
for i < len(k.blocks) {
// skip this block if its values were already read
if k.blocks[i].read() {
i++
continue
}
chunked = append(chunked, k.blocks[i])
i++
// Allow other goroutines to run
runtime.Gosched()
}
}
// If we only have one block left, just append it as is and avoid decoding/re-encoding
if i == len(k.blocks)-1 {
if !k.blocks[i].read() {
chunked = append(chunked, k.blocks[i])
}
i++
}
// The remaining blocks can be combined and we know that they do not overlap and
// so we can just append each, sort and re-encode.
for i < len(k.blocks) && len(k.mergedBooleanValues) < k.size {
if k.blocks[i].read() {
i++
continue
}
v, err := DecodeBooleanBlock(k.blocks[i].b, &[]BooleanValue{})
if err != nil {
k.err = err
return nil
}
// Apply each tombstone to the block
for _, ts := range k.blocks[i].tombstones {
v = BooleanValues(v).Exclude(ts.Min, ts.Max)
}
k.blocks[i].markRead(k.blocks[i].minTime, k.blocks[i].maxTime)
k.mergedBooleanValues = k.mergedBooleanValues.Merge(v)
i++
// Allow other goroutines to run
runtime.Gosched()
}
k.blocks = k.blocks[i:]
return k.chunkBoolean(chunked)
}
}
func (k *tsmKeyIterator) chunkBoolean(dst blocks) blocks {
if len(k.mergedBooleanValues) > k.size {
values := k.mergedBooleanValues[:k.size]
cb, err := BooleanValues(values).Encode(nil)
if err != nil {
k.err = err
return nil
}
dst = append(dst, &block{
minTime: values[0].UnixNano(),
maxTime: values[len(values)-1].UnixNano(),
key: k.key,
b: cb,
})
k.mergedBooleanValues = k.mergedBooleanValues[k.size:]
return dst
}
// Re-encode the remaining values into the last block
if len(k.mergedBooleanValues) > 0 {
cb, err := BooleanValues(k.mergedBooleanValues).Encode(nil)
if err != nil {
k.err = err
return nil
}
dst = append(dst, &block{
minTime: k.mergedBooleanValues[0].UnixNano(),
maxTime: k.mergedBooleanValues[len(k.mergedBooleanValues)-1].UnixNano(),
key: k.key,
b: cb,
})
k.mergedBooleanValues = k.mergedBooleanValues[:0]
}
return dst
}
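// Illustrative sketch (not part of the generated file): the dedup decision in
// the merge* functions above reduces to an overlap scan over sorted time
// ranges. The span type and needsDedup helper are hypothetical stand-ins for
// the unexported block type; tombstones and partially read blocks, which also
// force a dedup, are omitted here.
type span struct{ minTime, maxTime int64 }

func needsDedup(spans []span) bool {
	for i := 1; i < len(spans); i++ {
		// Mirrors the check "minTime <= prior maxTime" used above.
		if spans[i].minTime <= spans[i-1].maxTime {
			return true
		}
	}
	return false
}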

View File

@@ -0,0 +1,223 @@
package tsm1
import (
"runtime"
)
{{range .}}
// merge combines the next set of blocks into merged blocks.
func (k *tsmKeyIterator) merge{{.Name}}() {
// No blocks left and no pending merged values; we're done
if len(k.blocks) == 0 && len(k.merged) == 0 && len(k.merged{{.Name}}Values) == 0 {
return
}
dedup := len(k.merged{{.Name}}Values) != 0
if len(k.blocks) > 0 && !dedup {
// If we have more than one block or any partially tombstoned blocks, we may need to dedup
dedup = len(k.blocks[0].tombstones) > 0 || k.blocks[0].partiallyRead()
// Quickly scan each block to see if any overlap with the prior block, if they overlap then
// we need to dedup as there may be duplicate points now
for i := 1; !dedup && i < len(k.blocks); i++ {
if k.blocks[i].partiallyRead() {
dedup = true
break
}
if k.blocks[i].minTime <= k.blocks[i-1].maxTime || len(k.blocks[i].tombstones) > 0 {
dedup = true
break
}
}
}
k.merged = k.combine{{.Name}}(dedup)
}
// combine returns a new set of blocks using the current blocks in the buffers. If dedup
// is true, all the blocks will be decoded, deduplicated and sorted in order. If dedup is false,
// only blocks that are smaller than the chunk size will be decoded and combined.
func (k *tsmKeyIterator) combine{{.Name}}(dedup bool) blocks {
if dedup {
for len(k.merged{{.Name}}Values) < k.size && len(k.blocks) > 0 {
for len(k.blocks) > 0 && k.blocks[0].read() {
k.blocks = k.blocks[1:]
}
if len(k.blocks) == 0 {
break
}
first := k.blocks[0]
minTime := first.minTime
maxTime := first.maxTime
// Adjust the min and max times to account for any overlapping blocks.
for i := 0; i < len(k.blocks); i++ {
if k.blocks[i].overlapsTimeRange(minTime, maxTime) && !k.blocks[i].read() {
if k.blocks[i].minTime < minTime {
minTime = k.blocks[i].minTime
}
if k.blocks[i].maxTime > minTime && k.blocks[i].maxTime < maxTime {
maxTime = k.blocks[i].maxTime
}
}
}
// We have some overlapping blocks so decode all, append in order and then dedup
for i := 0; i < len(k.blocks); i++ {
if !k.blocks[i].overlapsTimeRange(minTime, maxTime) || k.blocks[i].read() {
continue
}
v, err := Decode{{.Name}}Block(k.blocks[i].b, &[]{{.Name}}Value{})
if err != nil {
k.err = err
return nil
}
// Remove values we already read
v = {{.Name}}Values(v).Exclude(k.blocks[i].readMin, k.blocks[i].readMax)
// Keep only the values within the overlapping time range
v = {{.Name}}Values(v).Include(minTime, maxTime)
if len(v) > 0 {
// Record that we read a subset of the block
k.blocks[i].markRead(v[0].UnixNano(), v[len(v)-1].UnixNano())
}
// Apply each tombstone to the block
for _, ts := range k.blocks[i].tombstones {
v = {{.Name}}Values(v).Exclude(ts.Min, ts.Max)
}
k.merged{{.Name}}Values = k.merged{{.Name}}Values.Merge(v)
// Allow other goroutines to run
runtime.Gosched()
}
}
// Since we combined multiple blocks, we could have more values than we should put into
// a single block. We need to chunk them up into groups and re-encode them.
return k.chunk{{.Name}}(nil)
} else {
var chunked blocks
var i int
for i < len(k.blocks) {
// skip this block if its values were already read
if k.blocks[i].read() {
i++
continue
}
// If this block is already full, just add it as is
if BlockCount(k.blocks[i].b) >= k.size {
chunked = append(chunked, k.blocks[i])
} else {
break
}
i++
// Allow other goroutines to run
runtime.Gosched()
}
if k.fast {
for i < len(k.blocks) {
// skip this block if its values were already read
if k.blocks[i].read() {
i++
continue
}
chunked = append(chunked, k.blocks[i])
i++
// Allow other goroutines to run
runtime.Gosched()
}
}
// If we only have one block left, just append it as is and avoid decoding/re-encoding
if i == len(k.blocks)-1 {
if !k.blocks[i].read() {
chunked = append(chunked, k.blocks[i])
}
i++
}
// The remaining blocks can be combined and we know that they do not overlap and
// so we can just append each, sort and re-encode.
for i < len(k.blocks) && len(k.merged{{.Name}}Values) < k.size {
if k.blocks[i].read() {
i++
continue
}
v, err := Decode{{.Name}}Block(k.blocks[i].b, &[]{{.Name}}Value{})
if err != nil {
k.err = err
return nil
}
// Apply each tombstone to the block
for _, ts := range k.blocks[i].tombstones {
v = {{.Name}}Values(v).Exclude(ts.Min, ts.Max)
}
k.blocks[i].markRead(k.blocks[i].minTime, k.blocks[i].maxTime)
k.merged{{.Name}}Values = k.merged{{.Name}}Values.Merge(v)
i++
// Allow other goroutines to run
runtime.Gosched()
}
k.blocks = k.blocks[i:]
return k.chunk{{.Name}}(chunked)
}
}
func (k *tsmKeyIterator) chunk{{.Name}}(dst blocks) blocks {
if len(k.merged{{.Name}}Values) > k.size {
values := k.merged{{.Name}}Values[:k.size]
cb, err := {{.Name}}Values(values).Encode(nil)
if err != nil {
k.err = err
return nil
}
dst = append(dst, &block{
minTime: values[0].UnixNano(),
maxTime: values[len(values)-1].UnixNano(),
key: k.key,
b: cb,
})
k.merged{{.Name}}Values = k.merged{{.Name}}Values[k.size:]
return dst
}
// Re-encode the remaining values into the last block
if len(k.merged{{.Name}}Values) > 0 {
cb, err := {{.Name}}Values(k.merged{{.Name}}Values).Encode(nil)
if err != nil {
k.err = err
return nil
}
dst = append(dst, &block{
minTime: k.merged{{.Name}}Values[0].UnixNano(),
maxTime: k.merged{{.Name}}Values[len(k.merged{{.Name}}Values)-1].UnixNano(),
key: k.key,
b: cb,
})
k.merged{{.Name}}Values = k.merged{{.Name}}Values[:0]
}
return dst
}
{{ end }}

View File

@@ -0,0 +1,18 @@
[
{
"Name":"Float",
"name":"float"
},
{
"Name":"Integer",
"name":"integer"
},
{
"Name":"String",
"name":"string"
},
{
"Name":"Boolean",
"name":"boolean"
}
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,77 @@
package tsm1
import (
"math"
"github.com/influxdata/influxdb/tsdb"
)
// multiFieldCursor wraps cursors for multiple fields on the same series
// key. Instead of returning a plain interface value in the call for Next(),
// it returns a map[string]interface{} for the field values
type multiFieldCursor struct {
fields []string
cursors []tsdb.Cursor
ascending bool
keyBuffer []int64
valueBuffer []interface{}
}
// NewMultiFieldCursor returns an instance of Cursor that joins the results of cursors.
func NewMultiFieldCursor(fields []string, cursors []tsdb.Cursor, ascending bool) tsdb.Cursor {
return &multiFieldCursor{
fields: fields,
cursors: cursors,
ascending: ascending,
keyBuffer: make([]int64, len(cursors)),
valueBuffer: make([]interface{}, len(cursors)),
}
}
func (m *multiFieldCursor) SeekTo(seek int64) (key int64, value interface{}) {
for i, c := range m.cursors {
m.keyBuffer[i], m.valueBuffer[i] = c.SeekTo(seek)
}
return m.read()
}
func (m *multiFieldCursor) Next() (int64, interface{}) {
return m.read()
}
func (m *multiFieldCursor) Ascending() bool {
return m.ascending
}
func (m *multiFieldCursor) read() (int64, interface{}) {
t := int64(math.MaxInt64)
if !m.ascending {
t = int64(math.MinInt64)
}
// find the time we need to combine all fields
for _, k := range m.keyBuffer {
if k == tsdb.EOF {
continue
}
if m.ascending && t > k {
t = k
} else if !m.ascending && t < k {
t = k
}
}
// get the value and advance each of the cursors that have the matching time
if t == math.MinInt64 || t == math.MaxInt64 {
return tsdb.EOF, nil
}
mm := make(map[string]interface{})
for i, k := range m.keyBuffer {
if k == t {
mm[m.fields[i]] = m.valueBuffer[i]
m.keyBuffer[i], m.valueBuffer[i] = m.cursors[i].Next()
}
}
return t, mm
}
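// Illustrative sketch (not part of the original file): a minimal in-memory
// cursor joined through NewMultiFieldCursor. sliceCursor is hypothetical; it
// implements only the three methods of tsdb.Cursor that are used here.
type sliceCursor struct {
	keys []int64
	vals []interface{}
	pos  int
}

func (c *sliceCursor) SeekTo(seek int64) (int64, interface{}) {
	// Position at the first key at or after seek, then emit it.
	for c.pos = 0; c.pos < len(c.keys) && c.keys[c.pos] < seek; c.pos++ {
	}
	return c.Next()
}

func (c *sliceCursor) Next() (int64, interface{}) {
	if c.pos >= len(c.keys) {
		return tsdb.EOF, nil
	}
	k, v := c.keys[c.pos], c.vals[c.pos]
	c.pos++
	return k, v
}

func (c *sliceCursor) Ascending() bool { return true }

// Each Next on the joined cursor then yields a map of field name to value for
// the smallest pending timestamp, e.g. map[string]interface{}{"value": 1.0}.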

View File

@@ -0,0 +1,938 @@
// Generated by tmpl
// https://github.com/benbjohnson/tmpl
//
// DO NOT EDIT!
// Source: encoding.gen.go.tmpl
package tsm1
import (
"fmt"
"sort"
)
// Values represents a slice of values.
type Values []Value
func (a Values) MinTime() int64 {
return a[0].UnixNano()
}
func (a Values) MaxTime() int64 {
return a[len(a)-1].UnixNano()
}
func (a Values) Size() int {
sz := 0
for _, v := range a {
sz += v.Size()
}
return sz
}
func (a Values) ordered() bool {
if len(a) <= 1 {
return true
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
return false
}
}
return true
}
func (a Values) assertOrdered() {
if len(a) <= 1 {
return
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
panic(fmt.Sprintf("not ordered: %d %d >= %d", i, av, ab))
}
}
}
// Deduplicate returns a new slice with any values that have the same timestamp removed.
// The Value that appears last in the slice is the one that is kept.
func (a Values) Deduplicate() Values {
if len(a) == 0 {
return a
}
// See if we're already sorted and deduped
var needSort bool
for i := 1; i < len(a); i++ {
if a[i-1].UnixNano() >= a[i].UnixNano() {
needSort = true
break
}
}
if !needSort {
return a
}
sort.Stable(a)
var i int
for j := 1; j < len(a); j++ {
v := a[j]
if v.UnixNano() != a[i].UnixNano() {
i++
}
a[i] = v
}
return a[:i+1]
}
// Exclude returns the subset of values not in [min, max]
func (a Values) Exclude(min, max int64) Values {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() >= min && a[j].UnixNano() <= max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Include returns the subset of values between min and max inclusive.
func (a Values) Include(min, max int64) Values {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() < min || a[j].UnixNano() > max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Merge overlays b on top of a. If two values conflict with
// the same timestamp, b is used. Both a and b must be sorted
// in ascending order.
func (a Values) Merge(b Values) Values {
if len(a) == 0 {
return b
}
if len(b) == 0 {
return a
}
// Normally, both a and b should not contain duplicates. Due to a bug in older versions, it's
// possible stored blocks might contain duplicate values. Remove them if they exist before
// merging.
a = a.Deduplicate()
b = b.Deduplicate()
if a[len(a)-1].UnixNano() < b[0].UnixNano() {
return append(a, b...)
}
if b[len(b)-1].UnixNano() < a[0].UnixNano() {
return append(b, a...)
}
out := make(Values, 0, len(a)+len(b))
for len(a) > 0 && len(b) > 0 {
if a[0].UnixNano() < b[0].UnixNano() {
out, a = append(out, a[0]), a[1:]
} else if len(b) > 0 && a[0].UnixNano() == b[0].UnixNano() {
a = a[1:]
} else {
out, b = append(out, b[0]), b[1:]
}
}
if len(a) > 0 {
return append(out, a...)
}
return append(out, b...)
}
// Sort methods
func (a Values) Len() int { return len(a) }
func (a Values) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a Values) Less(i, j int) bool { return a[i].UnixNano() < a[j].UnixNano() }
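// Illustrative sketch (not part of the generated file): when timestamps
// collide, Merge keeps the value from b, per the contract documented above.
func exampleMergeValues() Values {
	a := Values{NewValue(1, 1.0), NewValue(2, 2.0)}
	b := Values{NewValue(2, 9.0), NewValue(3, 3.0)}
	// Result: timestamps 1, 2, 3 carrying 1.0, 9.0 (b wins), and 3.0.
	return a.Merge(b)
}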
// FloatValues represents a slice of Float values.
type FloatValues []FloatValue
func (a FloatValues) MinTime() int64 {
return a[0].UnixNano()
}
func (a FloatValues) MaxTime() int64 {
return a[len(a)-1].UnixNano()
}
func (a FloatValues) Size() int {
sz := 0
for _, v := range a {
sz += v.Size()
}
return sz
}
func (a FloatValues) ordered() bool {
if len(a) <= 1 {
return true
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
return false
}
}
return true
}
func (a FloatValues) assertOrdered() {
if len(a) <= 1 {
return
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
panic(fmt.Sprintf("not ordered: %d %d >= %d", i, av, ab))
}
}
}
// Deduplicate returns a new slice with any values that have the same timestamp removed.
// The Value that appears last in the slice is the one that is kept.
func (a FloatValues) Deduplicate() FloatValues {
if len(a) == 0 {
return a
}
// See if we're already sorted and deduped
var needSort bool
for i := 1; i < len(a); i++ {
if a[i-1].UnixNano() >= a[i].UnixNano() {
needSort = true
break
}
}
if !needSort {
return a
}
sort.Stable(a)
var i int
for j := 1; j < len(a); j++ {
v := a[j]
if v.UnixNano() != a[i].UnixNano() {
i++
}
a[i] = v
}
return a[:i+1]
}
// Exclude returns the subset of values not in [min, max]
func (a FloatValues) Exclude(min, max int64) FloatValues {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() >= min && a[j].UnixNano() <= max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Include returns the subset of values between min and max inclusive.
func (a FloatValues) Include(min, max int64) FloatValues {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() < min || a[j].UnixNano() > max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Merge overlays b on top of a. If two values conflict with
// the same timestamp, b is used. Both a and b must be sorted
// in ascending order.
func (a FloatValues) Merge(b FloatValues) FloatValues {
if len(a) == 0 {
return b
}
if len(b) == 0 {
return a
}
// Normally, both a and b should not contain duplicates. Due to a bug in older versions, it's
// possible stored blocks might contain duplicate values. Remove them if they exist before
// merging.
a = a.Deduplicate()
b = b.Deduplicate()
if a[len(a)-1].UnixNano() < b[0].UnixNano() {
return append(a, b...)
}
if b[len(b)-1].UnixNano() < a[0].UnixNano() {
return append(b, a...)
}
out := make(FloatValues, 0, len(a)+len(b))
for len(a) > 0 && len(b) > 0 {
if a[0].UnixNano() < b[0].UnixNano() {
out, a = append(out, a[0]), a[1:]
} else if len(b) > 0 && a[0].UnixNano() == b[0].UnixNano() {
a = a[1:]
} else {
out, b = append(out, b[0]), b[1:]
}
}
if len(a) > 0 {
return append(out, a...)
}
return append(out, b...)
}
func (a FloatValues) Encode(buf []byte) ([]byte, error) {
return encodeFloatValuesBlock(buf, a)
}
func encodeFloatValuesBlock(buf []byte, values []FloatValue) ([]byte, error) {
if len(values) == 0 {
return nil, nil
}
venc := getFloatEncoder(len(values))
tsenc := getTimeEncoder(len(values))
var b []byte
err := func() error {
for _, v := range values {
tsenc.Write(v.unixnano)
venc.Write(v.value)
}
venc.Flush()
// Encoded timestamp values
tb, err := tsenc.Bytes()
if err != nil {
return err
}
// Encoded values
vb, err := venc.Bytes()
if err != nil {
return err
}
// Pack the block type byte, the encoded timestamps, and the
// encoded values into a single block
b = packBlock(buf, BlockFloat64, tb, vb)
return nil
}()
putTimeEncoder(tsenc)
putFloatEncoder(venc)
return b, err
}
// Sort methods
func (a FloatValues) Len() int { return len(a) }
func (a FloatValues) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a FloatValues) Less(i, j int) bool { return a[i].UnixNano() < a[j].UnixNano() }
// IntegerValues represents a slice of Integer values.
type IntegerValues []IntegerValue
func (a IntegerValues) MinTime() int64 {
return a[0].UnixNano()
}
func (a IntegerValues) MaxTime() int64 {
return a[len(a)-1].UnixNano()
}
func (a IntegerValues) Size() int {
sz := 0
for _, v := range a {
sz += v.Size()
}
return sz
}
func (a IntegerValues) ordered() bool {
if len(a) <= 1 {
return true
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
return false
}
}
return true
}
func (a IntegerValues) assertOrdered() {
if len(a) <= 1 {
return
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
panic(fmt.Sprintf("not ordered: %d %d >= %d", i, av, ab))
}
}
}
// Deduplicate returns a new slice with any values that have the same timestamp removed.
// The Value that appears last in the slice is the one that is kept.
func (a IntegerValues) Deduplicate() IntegerValues {
if len(a) == 0 {
return a
}
// See if we're already sorted and deduped
var needSort bool
for i := 1; i < len(a); i++ {
if a[i-1].UnixNano() >= a[i].UnixNano() {
needSort = true
break
}
}
if !needSort {
return a
}
sort.Stable(a)
var i int
for j := 1; j < len(a); j++ {
v := a[j]
if v.UnixNano() != a[i].UnixNano() {
i++
}
a[i] = v
}
return a[:i+1]
}
// Exclude returns the subset of values not in [min, max]
func (a IntegerValues) Exclude(min, max int64) IntegerValues {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() >= min && a[j].UnixNano() <= max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Include returns the subset of values between min and max inclusive.
func (a IntegerValues) Include(min, max int64) IntegerValues {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() < min || a[j].UnixNano() > max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Merge overlays b on top of a. If two values conflict with
// the same timestamp, b is used. Both a and b must be sorted
// in ascending order.
func (a IntegerValues) Merge(b IntegerValues) IntegerValues {
if len(a) == 0 {
return b
}
if len(b) == 0 {
return a
}
// Normally, both a and b should not contain duplicates. Due to a bug in older versions, it's
// possible stored blocks might contain duplicate values. Remove them if they exist before
// merging.
a = a.Deduplicate()
b = b.Deduplicate()
if a[len(a)-1].UnixNano() < b[0].UnixNano() {
return append(a, b...)
}
if b[len(b)-1].UnixNano() < a[0].UnixNano() {
return append(b, a...)
}
out := make(IntegerValues, 0, len(a)+len(b))
for len(a) > 0 && len(b) > 0 {
if a[0].UnixNano() < b[0].UnixNano() {
out, a = append(out, a[0]), a[1:]
} else if len(b) > 0 && a[0].UnixNano() == b[0].UnixNano() {
a = a[1:]
} else {
out, b = append(out, b[0]), b[1:]
}
}
if len(a) > 0 {
return append(out, a...)
}
return append(out, b...)
}
func (a IntegerValues) Encode(buf []byte) ([]byte, error) {
return encodeIntegerValuesBlock(buf, a)
}
func encodeIntegerValuesBlock(buf []byte, values []IntegerValue) ([]byte, error) {
if len(values) == 0 {
return nil, nil
}
venc := getIntegerEncoder(len(values))
tsenc := getTimeEncoder(len(values))
var b []byte
err := func() error {
for _, v := range values {
tsenc.Write(v.unixnano)
venc.Write(v.value)
}
venc.Flush()
// Encoded timestamp values
tb, err := tsenc.Bytes()
if err != nil {
return err
}
// Encoded values
vb, err := venc.Bytes()
if err != nil {
return err
}
// Pack the block type byte, the encoded timestamps, and the
// encoded values into a single block
b = packBlock(buf, BlockInteger, tb, vb)
return nil
}()
putTimeEncoder(tsenc)
putIntegerEncoder(venc)
return b, err
}
// Sort methods
func (a IntegerValues) Len() int { return len(a) }
func (a IntegerValues) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a IntegerValues) Less(i, j int) bool { return a[i].UnixNano() < a[j].UnixNano() }
// StringValues represents a slice of String values.
type StringValues []StringValue
func (a StringValues) MinTime() int64 {
return a[0].UnixNano()
}
func (a StringValues) MaxTime() int64 {
return a[len(a)-1].UnixNano()
}
func (a StringValues) Size() int {
sz := 0
for _, v := range a {
sz += v.Size()
}
return sz
}
func (a StringValues) ordered() bool {
if len(a) <= 1 {
return true
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
return false
}
}
return true
}
func (a StringValues) assertOrdered() {
if len(a) <= 1 {
return
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
panic(fmt.Sprintf("not ordered: %d %d >= %d", i, av, ab))
}
}
}
// Deduplicate returns a new slice with any values that have the same timestamp removed.
// The Value that appears last in the slice is the one that is kept.
func (a StringValues) Deduplicate() StringValues {
if len(a) == 0 {
return a
}
// See if we're already sorted and deduped
var needSort bool
for i := 1; i < len(a); i++ {
if a[i-1].UnixNano() >= a[i].UnixNano() {
needSort = true
break
}
}
if !needSort {
return a
}
sort.Stable(a)
var i int
for j := 1; j < len(a); j++ {
v := a[j]
if v.UnixNano() != a[i].UnixNano() {
i++
}
a[i] = v
}
return a[:i+1]
}
// Exclude returns the subset of values not in [min, max]
func (a StringValues) Exclude(min, max int64) StringValues {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() >= min && a[j].UnixNano() <= max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Include returns the subset of values between min and max inclusive.
func (a StringValues) Include(min, max int64) StringValues {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() < min || a[j].UnixNano() > max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Merge overlays b on top of a. If two values conflict with
// the same timestamp, b is used. Both a and b must be sorted
// in ascending order.
func (a StringValues) Merge(b StringValues) StringValues {
if len(a) == 0 {
return b
}
if len(b) == 0 {
return a
}
// Normally, both a and b should not contain duplicates. Due to a bug in older versions, it's
// possible stored blocks might contain duplicate values. Remove them if they exist before
// merging.
a = a.Deduplicate()
b = b.Deduplicate()
if a[len(a)-1].UnixNano() < b[0].UnixNano() {
return append(a, b...)
}
if b[len(b)-1].UnixNano() < a[0].UnixNano() {
return append(b, a...)
}
out := make(StringValues, 0, len(a)+len(b))
for len(a) > 0 && len(b) > 0 {
if a[0].UnixNano() < b[0].UnixNano() {
out, a = append(out, a[0]), a[1:]
} else if len(b) > 0 && a[0].UnixNano() == b[0].UnixNano() {
a = a[1:]
} else {
out, b = append(out, b[0]), b[1:]
}
}
if len(a) > 0 {
return append(out, a...)
}
return append(out, b...)
}
func (a StringValues) Encode(buf []byte) ([]byte, error) {
return encodeStringValuesBlock(buf, a)
}
func encodeStringValuesBlock(buf []byte, values []StringValue) ([]byte, error) {
if len(values) == 0 {
return nil, nil
}
venc := getStringEncoder(len(values))
tsenc := getTimeEncoder(len(values))
var b []byte
err := func() error {
for _, v := range values {
tsenc.Write(v.unixnano)
venc.Write(v.value)
}
venc.Flush()
// Encoded timestamp values
tb, err := tsenc.Bytes()
if err != nil {
return err
}
// Encoded values
vb, err := venc.Bytes()
if err != nil {
return err
}
// Pack the block type byte, the encoded timestamps, and the
// encoded values into a single block
b = packBlock(buf, BlockString, tb, vb)
return nil
}()
putTimeEncoder(tsenc)
putStringEncoder(venc)
return b, err
}
// Sort methods
func (a StringValues) Len() int { return len(a) }
func (a StringValues) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a StringValues) Less(i, j int) bool { return a[i].UnixNano() < a[j].UnixNano() }
// BooleanValues represents a slice of Boolean values.
type BooleanValues []BooleanValue
func (a BooleanValues) MinTime() int64 {
return a[0].UnixNano()
}
func (a BooleanValues) MaxTime() int64 {
return a[len(a)-1].UnixNano()
}
func (a BooleanValues) Size() int {
sz := 0
for _, v := range a {
sz += v.Size()
}
return sz
}
func (a BooleanValues) ordered() bool {
if len(a) <= 1 {
return true
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
return false
}
}
return true
}
func (a BooleanValues) assertOrdered() {
if len(a) <= 1 {
return
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
panic(fmt.Sprintf("not ordered: %d %d >= %d", i, av, ab))
}
}
}
// Deduplicate returns a new slice with any values that have the same timestamp removed.
// The Value that appears last in the slice is the one that is kept.
func (a BooleanValues) Deduplicate() BooleanValues {
if len(a) == 0 {
return a
}
// See if we're already sorted and deduped
var needSort bool
for i := 1; i < len(a); i++ {
if a[i-1].UnixNano() >= a[i].UnixNano() {
needSort = true
break
}
}
if !needSort {
return a
}
sort.Stable(a)
var i int
for j := 1; j < len(a); j++ {
v := a[j]
if v.UnixNano() != a[i].UnixNano() {
i++
}
a[i] = v
}
return a[:i+1]
}
// Exclude returns the subset of values not in [min, max]
func (a BooleanValues) Exclude(min, max int64) BooleanValues {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() >= min && a[j].UnixNano() <= max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Include returns the subset of values between min and max inclusive.
func (a BooleanValues) Include(min, max int64) BooleanValues {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() < min || a[j].UnixNano() > max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Merge overlays b on top of a. If two values conflict with
// the same timestamp, b is used. Both a and b must be sorted
// in ascending order.
func (a BooleanValues) Merge(b BooleanValues) BooleanValues {
if len(a) == 0 {
return b
}
if len(b) == 0 {
return a
}
// Normally, both a and b should not contain duplicates. Due to a bug in older versions, it's
// possible stored blocks might contain duplicate values. Remove them if they exist before
// merging.
a = a.Deduplicate()
b = b.Deduplicate()
if a[len(a)-1].UnixNano() < b[0].UnixNano() {
return append(a, b...)
}
if b[len(b)-1].UnixNano() < a[0].UnixNano() {
return append(b, a...)
}
out := make(BooleanValues, 0, len(a)+len(b))
for len(a) > 0 && len(b) > 0 {
if a[0].UnixNano() < b[0].UnixNano() {
out, a = append(out, a[0]), a[1:]
} else if len(b) > 0 && a[0].UnixNano() == b[0].UnixNano() {
a = a[1:]
} else {
out, b = append(out, b[0]), b[1:]
}
}
if len(a) > 0 {
return append(out, a...)
}
return append(out, b...)
}
func (a BooleanValues) Encode(buf []byte) ([]byte, error) {
return encodeBooleanValuesBlock(buf, a)
}
func encodeBooleanValuesBlock(buf []byte, values []BooleanValue) ([]byte, error) {
if len(values) == 0 {
return nil, nil
}
venc := getBooleanEncoder(len(values))
tsenc := getTimeEncoder(len(values))
var b []byte
err := func() error {
for _, v := range values {
tsenc.Write(v.unixnano)
venc.Write(v.value)
}
venc.Flush()
// Encoded timestamp values
tb, err := tsenc.Bytes()
if err != nil {
return err
}
// Encoded values
vb, err := venc.Bytes()
if err != nil {
return err
}
// Pack the block type byte, the encoded timestamps, and the
// encoded values into a single block
b = packBlock(buf, BlockBoolean, tb, vb)
return nil
}()
putTimeEncoder(tsenc)
putBooleanEncoder(venc)
return b, err
}
// Sort methods
func (a BooleanValues) Len() int { return len(a) }
func (a BooleanValues) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a BooleanValues) Less(i, j int) bool { return a[i].UnixNano() < a[j].UnixNano() }
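// Illustrative sketch (not part of the generated file): Exclude drops the
// closed range [min, max] while Include keeps only that range; both filter in
// place and reuse the receiver's backing array, so fresh slices are built here.
func exampleExcludeInclude() (FloatValues, FloatValues) {
	build := func() FloatValues {
		return FloatValues{
			NewValue(10, 1.0).(FloatValue),
			NewValue(20, 2.0).(FloatValue),
			NewValue(30, 3.0).(FloatValue),
		}
	}
	ex := build().Exclude(15, 25) // timestamps 10 and 30 remain
	in := build().Include(15, 25) // only timestamp 20 remains
	return ex, in
}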

View File

@@ -0,0 +1,209 @@
package tsm1
import (
"fmt"
"sort"
)
{{range .}}
// {{.Name}}Values represents a slice of {{.Name}} values.
type {{.Name}}Values []{{.Name}}Value
func (a {{.Name}}Values) MinTime() int64 {
return a[0].UnixNano()
}
func (a {{.Name}}Values) MaxTime() int64 {
return a[len(a)-1].UnixNano()
}
func (a {{.Name}}Values) Size() int {
sz := 0
for _, v := range a {
sz += v.Size()
}
return sz
}
func (a {{.Name}}Values) ordered() bool {
if len(a) <= 1 {
return true
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
return false
}
}
return true
}
func (a {{.Name}}Values) assertOrdered() {
if len(a) <= 1 {
return
}
for i := 1; i < len(a); i++ {
if av, ab := a[i-1].UnixNano(), a[i].UnixNano(); av >= ab {
panic(fmt.Sprintf("not ordered: %d %d >= %d", i, av, ab))
}
}
}
// Deduplicate returns a new slice with any values that have the same timestamp removed.
// The Value that appears last in the slice is the one that is kept.
func (a {{.Name}}Values) Deduplicate() {{.Name}}Values {
if len(a) == 0 {
return a
}
// See if we're already sorted and deduped
var needSort bool
for i := 1; i < len(a); i++ {
if a[i-1].UnixNano() >= a[i].UnixNano() {
needSort = true
break
}
}
if !needSort {
return a
}
sort.Stable(a)
var i int
for j := 1; j < len(a); j++ {
v := a[j]
if v.UnixNano() != a[i].UnixNano() {
i++
}
a[i] = v
}
return a[:i+1]
}
// Exclude returns the subset of values not in [min, max]
func (a {{.Name}}Values) Exclude(min, max int64) {{.Name}}Values {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() >= min && a[j].UnixNano() <= max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Include returns the subset of values between min and max inclusive.
func (a {{.Name}}Values) Include(min, max int64) {{.Name}}Values {
var i int
for j := 0; j < len(a); j++ {
if a[j].UnixNano() < min || a[j].UnixNano() > max {
continue
}
a[i] = a[j]
i++
}
return a[:i]
}
// Merge overlays b on top of a. If two values conflict with
// the same timestamp, b is used. Both a and b must be sorted
// in ascending order.
func (a {{.Name}}Values) Merge(b {{.Name}}Values) {{.Name}}Values {
if len(a) == 0 {
return b
}
if len(b) == 0 {
return a
}
// Normally, both a and b should not contain duplicates. Due to a bug in older versions, it's
// possible stored blocks might contain duplicate values. Remove them if they exist before
// merging.
a = a.Deduplicate()
b = b.Deduplicate()
if a[len(a)-1].UnixNano() < b[0].UnixNano() {
return append(a, b...)
}
if b[len(b)-1].UnixNano() < a[0].UnixNano() {
return append(b, a...)
}
out := make({{.Name}}Values, 0, len(a)+len(b))
for len(a) > 0 && len(b) > 0 {
if a[0].UnixNano() < b[0].UnixNano() {
out, a = append(out, a[0]), a[1:]
} else if len(b) > 0 && a[0].UnixNano() == b[0].UnixNano() {
a = a[1:]
} else {
out, b = append(out, b[0]), b[1:]
}
}
if len(a) > 0 {
return append(out, a...)
}
return append(out, b...)
}
{{ if ne .Name "" }}
func (a {{.Name}}Values) Encode(buf []byte) ([]byte, error) {
return encode{{.Name}}ValuesBlock(buf, a)
}
func encode{{ .Name }}ValuesBlock(buf []byte, values []{{.Name}}Value) ([]byte, error) {
if len(values) == 0 {
return nil, nil
}
venc := get{{ .Name }}Encoder(len(values))
tsenc := getTimeEncoder(len(values))
var b []byte
err := func() error {
for _, v := range values {
tsenc.Write(v.unixnano)
venc.Write(v.value)
}
venc.Flush()
// Encoded timestamp values
tb, err := tsenc.Bytes()
if err != nil {
return err
}
// Encoded values
vb, err := venc.Bytes()
if err != nil {
return err
}
// Pack the block type byte, the encoded timestamps, and the
// encoded values into a single block
b = packBlock(buf, {{ .Type }}, tb, vb)
return nil
}()
putTimeEncoder(tsenc)
put{{.Name}}Encoder(venc)
return b, err
}
{{ end }}
// Sort methods
func (a {{.Name}}Values) Len() int { return len(a) }
func (a {{.Name}}Values) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a {{.Name}}Values) Less(i, j int) bool { return a[i].UnixNano() < a[j].UnixNano() }
{{ end }}

View File

@@ -0,0 +1,27 @@
[
{
"Name":"",
"name":"",
"Type":""
},
{
"Name":"Float",
"name":"float",
"Type":"BlockFloat64"
},
{
"Name":"Integer",
"name":"integer",
"Type":"BlockInteger"
},
{
"Name":"String",
"name":"string",
"Type":"BlockString"
},
{
"Name":"Boolean",
"name":"boolean",
"Type":"BlockBoolean"
}
]

View File

@@ -0,0 +1,880 @@
package tsm1
import (
"encoding/binary"
"fmt"
"runtime"
"time"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/pkg/pool"
"github.com/influxdata/influxdb/tsdb"
)
const (
// BlockFloat64 designates a block encodes float64 values.
BlockFloat64 = byte(0)
// BlockInteger designates a block encodes int64 values.
BlockInteger = byte(1)
// BlockBoolean designates a block encodes boolean values.
BlockBoolean = byte(2)
// BlockString designates a block encodes string values.
BlockString = byte(3)
// encodedBlockHeaderSize is the size of the header for an encoded block. There is one
// byte encoding the type of the block.
encodedBlockHeaderSize = 1
)
func init() {
// Prime the pools with one encoder/decoder for each available CPU.
vals := make([]interface{}, 0, runtime.NumCPU())
for _, p := range []*pool.Generic{
timeEncoderPool, timeDecoderPool,
integerEncoderPool, integerDecoderPool,
floatEncoderPool, floatDecoderPool,
stringEncoderPool, stringDecoderPool,
booleanEncoderPool, booleanDecoderPool,
} {
vals = vals[:0]
// Check one out to force the allocation now and hold onto it
for i := 0; i < runtime.NumCPU(); i++ {
v := p.Get(tsdb.DefaultMaxPointsPerBlock)
vals = append(vals, v)
}
// Add them all back
for _, v := range vals {
p.Put(v)
}
}
}
var (
// encoder pools
timeEncoderPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
return NewTimeEncoder(sz)
})
integerEncoderPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
return NewIntegerEncoder(sz)
})
floatEncoderPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
return NewFloatEncoder()
})
stringEncoderPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
return NewStringEncoder(sz)
})
booleanEncoderPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
return NewBooleanEncoder(sz)
})
// decoder pools
timeDecoderPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
return &TimeDecoder{}
})
integerDecoderPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
return &IntegerDecoder{}
})
floatDecoderPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
return &FloatDecoder{}
})
stringDecoderPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
return &StringDecoder{}
})
booleanDecoderPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
return &BooleanDecoder{}
})
)
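// Illustrative sketch (not part of the original file): the checkout/return
// pattern the pools above support, using only the pool.Generic surface already
// visible in init. bufPool and withPooledBuf are hypothetical.
var bufPool = pool.NewGeneric(runtime.NumCPU(), func(sz int) interface{} {
	return make([]byte, 0, sz)
})

func withPooledBuf(sz int, fn func(buf []byte)) {
	v := bufPool.Get(sz) // check a buffer out, sized for the workload
	fn(v.([]byte)[:0])
	bufPool.Put(v) // return it for reuse by other goroutines
}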
// Value represents a TSM-encoded value.
type Value interface {
// UnixNano returns the timestamp of the value in nanoseconds since unix epoch.
UnixNano() int64
// Value returns the underlying value.
Value() interface{}
// Size returns the number of bytes necessary to represent the value and its timestamp.
Size() int
// String returns the string representation of the value and its timestamp.
String() string
// internalOnly is unexported to ensure implementations of Value
// can only originate in this package.
internalOnly()
}
// NewValue returns a new Value with the underlying type dependent on value.
func NewValue(t int64, value interface{}) Value {
switch v := value.(type) {
case int64:
return IntegerValue{unixnano: t, value: v}
case float64:
return FloatValue{unixnano: t, value: v}
case bool:
return BooleanValue{unixnano: t, value: v}
case string:
return StringValue{unixnano: t, value: v}
}
return EmptyValue{}
}
// NewIntegerValue returns a new integer value.
func NewIntegerValue(t int64, v int64) Value {
return IntegerValue{unixnano: t, value: v}
}
// NewFloatValue returns a new float value.
func NewFloatValue(t int64, v float64) Value {
return FloatValue{unixnano: t, value: v}
}
// NewBooleanValue returns a new boolean value.
func NewBooleanValue(t int64, v bool) Value {
return BooleanValue{unixnano: t, value: v}
}
// NewStringValue returns a new string value.
func NewStringValue(t int64, v string) Value {
return StringValue{unixnano: t, value: v}
}
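// Illustrative sketch (not part of the original file): NewValue dispatches on
// the dynamic type of its argument; unsupported types fall through to EmptyValue.
func exampleNewValueDispatch() []Value {
	return []Value{
		NewValue(1, int64(7)),  // IntegerValue
		NewValue(2, 7.5),       // FloatValue
		NewValue(3, true),      // BooleanValue
		NewValue(4, "seven"),   // StringValue
		NewValue(5, uint32(7)), // EmptyValue: uint32 is not a supported type
	}
}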
// EmptyValue is used when there is no appropriate other value.
type EmptyValue struct{}
// UnixNano returns tsdb.EOF.
func (e EmptyValue) UnixNano() int64 { return tsdb.EOF }
// Value returns nil.
func (e EmptyValue) Value() interface{} { return nil }
// Size returns 0.
func (e EmptyValue) Size() int { return 0 }
// String returns the empty string.
func (e EmptyValue) String() string { return "" }
func (_ EmptyValue) internalOnly() {}
func (_ StringValue) internalOnly() {}
func (_ IntegerValue) internalOnly() {}
func (_ BooleanValue) internalOnly() {}
func (_ FloatValue) internalOnly() {}
// Encode converts the values to a byte slice. If there are no values,
// this function panics.
func (a Values) Encode(buf []byte) ([]byte, error) {
if len(a) == 0 {
panic("unable to encode block type")
}
switch a[0].(type) {
case FloatValue:
return encodeFloatBlock(buf, a)
case IntegerValue:
return encodeIntegerBlock(buf, a)
case BooleanValue:
return encodeBooleanBlock(buf, a)
case StringValue:
return encodeStringBlock(buf, a)
}
return nil, fmt.Errorf("unsupported value type %T", a[0])
}
// InfluxQLType returns the influxql.DataType the values map to.
func (a Values) InfluxQLType() (influxql.DataType, error) {
if len(a) == 0 {
return influxql.Unknown, fmt.Errorf("no values to infer type")
}
switch a[0].(type) {
case FloatValue:
return influxql.Float, nil
case IntegerValue:
return influxql.Integer, nil
case BooleanValue:
return influxql.Boolean, nil
case StringValue:
return influxql.String, nil
}
return influxql.Unknown, fmt.Errorf("unsupported value type %T", a[0])
}
// BlockType returns the type of value encoded in a block or an error
// if the block type is unknown.
func BlockType(block []byte) (byte, error) {
blockType := block[0]
switch blockType {
case BlockFloat64, BlockInteger, BlockBoolean, BlockString:
return blockType, nil
default:
return 0, fmt.Errorf("unknown block type: %d", blockType)
}
}
// BlockCount returns the number of timestamps encoded in block.
func BlockCount(block []byte) int {
if len(block) <= encodedBlockHeaderSize {
panic(fmt.Sprintf("count of short block: got %v, exp %v", len(block), encodedBlockHeaderSize))
}
// first byte is the block type
tb, _, err := unpackBlock(block[1:])
if err != nil {
panic(fmt.Sprintf("BlockCount: error unpacking block: %s", err.Error()))
}
return CountTimestamps(tb)
}
// DecodeBlock takes a byte slice and decodes it into values of the appropriate type
// based on the block.
func DecodeBlock(block []byte, vals []Value) ([]Value, error) {
if len(block) <= encodedBlockHeaderSize {
panic(fmt.Sprintf("decode of short block: got %v, exp %v", len(block), encodedBlockHeaderSize))
}
blockType, err := BlockType(block)
if err != nil {
return nil, err
}
switch blockType {
case BlockFloat64:
var buf []FloatValue
decoded, err := DecodeFloatBlock(block, &buf)
if len(vals) < len(decoded) {
vals = make([]Value, len(decoded))
}
for i := range decoded {
vals[i] = decoded[i]
}
return vals[:len(decoded)], err
case BlockInteger:
var buf []IntegerValue
decoded, err := DecodeIntegerBlock(block, &buf)
if len(vals) < len(decoded) {
vals = make([]Value, len(decoded))
}
for i := range decoded {
vals[i] = decoded[i]
}
return vals[:len(decoded)], err
case BlockBoolean:
var buf []BooleanValue
decoded, err := DecodeBooleanBlock(block, &buf)
if len(vals) < len(decoded) {
vals = make([]Value, len(decoded))
}
for i := range decoded {
vals[i] = decoded[i]
}
return vals[:len(decoded)], err
case BlockString:
var buf []StringValue
decoded, err := DecodeStringBlock(block, &buf)
if len(vals) < len(decoded) {
vals = make([]Value, len(decoded))
}
for i := range decoded {
vals[i] = decoded[i]
}
return vals[:len(decoded)], err
default:
panic(fmt.Sprintf("unknown block type: %d", blockType))
}
}
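// Hedged round-trip sketch (assumes the Values slice type declared elsewhere
// in this package): a block produced by Encode decodes back to the same
// timestamps and values.
//
//	in := Values{NewFloatValue(1000, 1.5), NewFloatValue(2000, 3.0)}
//	block, err := in.Encode(nil)
//	if err != nil {
//		// handle error
//	}
//	out, err := DecodeBlock(block, nil)
//	// out[0].UnixNano() == 1000 and out[0].Value() == 1.5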
// FloatValue represents a float64 value.
type FloatValue struct {
unixnano int64
value float64
}
// UnixNano returns the timestamp of the value.
func (v FloatValue) UnixNano() int64 {
return v.unixnano
}
// Value returns the underlying float64 value.
func (v FloatValue) Value() interface{} {
return v.value
}
// Size returns the number of bytes necessary to represent the value and its timestamp.
func (v FloatValue) Size() int {
return 16
}
// String returns the string representation of the value and its timestamp.
func (v FloatValue) String() string {
return fmt.Sprintf("%v %v", time.Unix(0, v.unixnano), v.value)
}
func encodeFloatBlock(buf []byte, values []Value) ([]byte, error) {
if len(values) == 0 {
return nil, nil
}
// A float block is encoded using different compression strategies
// for timestamps and values.
// Encode values using Gorilla float compression
venc := getFloatEncoder(len(values))
// Encode timestamps using an adaptive encoder that uses delta-encoding,
// frame-of-reference, and run-length encoding.
tsenc := getTimeEncoder(len(values))
var b []byte
err := func() error {
for _, v := range values {
vv := v.(FloatValue)
tsenc.Write(vv.unixnano)
venc.Write(vv.value)
}
venc.Flush()
// Encoded timestamp values
tb, err := tsenc.Bytes()
if err != nil {
return err
}
// Encoded float values
vb, err := venc.Bytes()
if err != nil {
return err
}
// Pack the block: the type byte first, then the varint-encoded length of
// the timestamp block, the timestamp block itself, and finally the values.
b = packBlock(buf, BlockFloat64, tb, vb)
return nil
}()
putTimeEncoder(tsenc)
putFloatEncoder(venc)
return b, err
}
// DecodeFloatBlock decodes the float block from the byte slice
// and appends the float values to a.
func DecodeFloatBlock(block []byte, a *[]FloatValue) ([]FloatValue, error) {
// Block type is the first byte; make sure we actually have a float block
blockType := block[0]
if blockType != BlockFloat64 {
return nil, fmt.Errorf("invalid block type: exp %d, got %d", BlockFloat64, blockType)
}
block = block[1:]
tb, vb, err := unpackBlock(block)
if err != nil {
return nil, err
}
tdec := timeDecoderPool.Get(0).(*TimeDecoder)
vdec := floatDecoderPool.Get(0).(*FloatDecoder)
var i int
err = func() error {
// Set up our timestamp and value decoders
tdec.Init(tb)
err = vdec.SetBytes(vb)
if err != nil {
return err
}
// Decode both a timestamp and value
for tdec.Next() && vdec.Next() {
ts := tdec.Read()
v := vdec.Values()
if i < len(*a) {
elem := &(*a)[i]
elem.unixnano = ts
elem.value = v
} else {
*a = append(*a, FloatValue{ts, v})
}
i++
}
// Did timestamp decoding have an error?
err = tdec.Error()
if err != nil {
return err
}
// Did float decoding have an error?
err = vdec.Error()
if err != nil {
return err
}
return nil
}()
timeDecoderPool.Put(tdec)
floatDecoderPool.Put(vdec)
return (*a)[:i], err
}
// BooleanValue represents a boolean value.
type BooleanValue struct {
unixnano int64
value bool
}
// Size returns the number of bytes necessary to represent the value and its timestamp.
func (v BooleanValue) Size() int {
return 9
}
// UnixNano returns the timestamp of the value in nanoseconds since unix epoch.
func (v BooleanValue) UnixNano() int64 {
return v.unixnano
}
// Value returns the underlying boolean value.
func (v BooleanValue) Value() interface{} {
return v.value
}
// String returns the string representation of the value and its timestamp.
func (v BooleanValue) String() string {
return fmt.Sprintf("%v %v", time.Unix(0, v.unixnano), v.Value())
}
func encodeBooleanBlock(buf []byte, values []Value) ([]byte, error) {
if len(values) == 0 {
return nil, nil
}
// A boolean block is encoded using different compression strategies
// for timestamps and values.
venc := getBooleanEncoder(len(values))
// Encode timestamps using an adaptive encoder
tsenc := getTimeEncoder(len(values))
var b []byte
err := func() error {
for _, v := range values {
vv := v.(BooleanValue)
tsenc.Write(vv.unixnano)
venc.Write(vv.value)
}
// Encoded timestamp values
tb, err := tsenc.Bytes()
if err != nil {
return err
}
// Encoded boolean values
vb, err := venc.Bytes()
if err != nil {
return err
}
// Pack the block: the type byte first, then the varint-encoded length of
// the timestamp block, the timestamp block itself, and finally the values.
b = packBlock(buf, BlockBoolean, tb, vb)
return nil
}()
putTimeEncoder(tsenc)
putBooleanEncoder(venc)
return b, err
}
// DecodeBooleanBlock decodes the boolean block from the byte slice
// and appends the boolean values to a.
func DecodeBooleanBlock(block []byte, a *[]BooleanValue) ([]BooleanValue, error) {
// Block type is the first byte; make sure we actually have a boolean block
blockType := block[0]
if blockType != BlockBoolean {
return nil, fmt.Errorf("invalid block type: exp %d, got %d", BlockBoolean, blockType)
}
block = block[1:]
tb, vb, err := unpackBlock(block)
if err != nil {
return nil, err
}
tdec := timeDecoderPool.Get(0).(*TimeDecoder)
vdec := booleanDecoderPool.Get(0).(*BooleanDecoder)
var i int
err = func() error {
// Set up our timestamp and value decoders
tdec.Init(tb)
vdec.SetBytes(vb)
// Decode both a timestamp and value
for tdec.Next() && vdec.Next() {
ts := tdec.Read()
v := vdec.Read()
if i < len(*a) {
elem := &(*a)[i]
elem.unixnano = ts
elem.value = v
} else {
*a = append(*a, BooleanValue{ts, v})
}
i++
}
// Did timestamp decoding have an error?
err = tdec.Error()
if err != nil {
return err
}
// Did boolean decoding have an error?
err = vdec.Error()
if err != nil {
return err
}
return nil
}()
timeDecoderPool.Put(tdec)
booleanDecoderPool.Put(vdec)
return (*a)[:i], err
}
// IntegerValue represents an int64 value.
type IntegerValue struct {
unixnano int64
value int64
}
// Value returns the underlying int64 value.
func (v IntegerValue) Value() interface{} {
return v.value
}
// UnixNano returns the timestamp of the value.
func (v IntegerValue) UnixNano() int64 {
return v.unixnano
}
// Size returns the number of bytes necessary to represent the value and its timestamp.
func (v IntegerValue) Size() int {
return 16
}
// String returns the string representation of the value and its timestamp.
func (v IntegerValue) String() string {
return fmt.Sprintf("%v %v", time.Unix(0, v.unixnano), v.Value())
}
func encodeIntegerBlock(buf []byte, values []Value) ([]byte, error) {
tsEnc := getTimeEncoder(len(values))
vEnc := getIntegerEncoder(len(values))
var b []byte
err := func() error {
for _, v := range values {
vv := v.(IntegerValue)
tsEnc.Write(vv.unixnano)
vEnc.Write(vv.value)
}
// Encoded timestamp values
tb, err := tsEnc.Bytes()
if err != nil {
return err
}
// Encoded int64 values
vb, err := vEnc.Bytes()
if err != nil {
return err
}
// Pack the block: the type byte first, then the varint-encoded length of
// the timestamp block, the timestamp block itself, and finally the values.
b = packBlock(buf, BlockInteger, tb, vb)
return nil
}()
putTimeEncoder(tsEnc)
putIntegerEncoder(vEnc)
return b, err
}
// DecodeIntegerBlock decodes the integer block from the byte slice
// and appends the integer values to a.
func DecodeIntegerBlock(block []byte, a *[]IntegerValue) ([]IntegerValue, error) {
blockType := block[0]
if blockType != BlockInteger {
return nil, fmt.Errorf("invalid block type: exp %d, got %d", BlockInteger, blockType)
}
block = block[1:]
// The block begins with the varint-encoded length of the timestamp block
tb, vb, err := unpackBlock(block)
if err != nil {
return nil, err
}
tdec := timeDecoderPool.Get(0).(*TimeDecoder)
vdec := integerDecoderPool.Get(0).(*IntegerDecoder)
var i int
err = func() error {
// Set up our timestamp and value decoders
tdec.Init(tb)
vdec.SetBytes(vb)
// Decode both a timestamp and value
for tdec.Next() && vdec.Next() {
ts := tdec.Read()
v := vdec.Read()
if i < len(*a) {
elem := &(*a)[i]
elem.unixnano = ts
elem.value = v
} else {
*a = append(*a, IntegerValue{ts, v})
}
i++
}
// Did timestamp decoding have an error?
err = tdec.Error()
if err != nil {
return err
}
// Did int64 decoding have an error?
err = vdec.Error()
if err != nil {
return err
}
return nil
}()
timeDecoderPool.Put(tdec)
integerDecoderPool.Put(vdec)
return (*a)[:i], err
}
// StringValue represents a string value.
type StringValue struct {
unixnano int64
value string
}
// Value returns the underlying string value.
func (v StringValue) Value() interface{} {
return v.value
}
// UnixNano returns the timestamp of the value.
func (v StringValue) UnixNano() int64 {
return v.unixnano
}
// Size returns the number of bytes necessary to represent the value and its timestamp.
func (v StringValue) Size() int {
return 8 + len(v.value)
}
// String returns the string representation of the value and its timestamp.
func (v StringValue) String() string {
return fmt.Sprintf("%v %v", time.Unix(0, v.unixnano), v.Value())
}
func encodeStringBlock(buf []byte, values []Value) ([]byte, error) {
tsEnc := getTimeEncoder(len(values))
// Size the string encoder using the first value's length as a rough
// per-value estimate.
vEnc := getStringEncoder(len(values) * len(values[0].(StringValue).value))
var b []byte
err := func() error {
for _, v := range values {
vv := v.(StringValue)
tsEnc.Write(vv.unixnano)
vEnc.Write(vv.value)
}
// Encoded timestamp values
tb, err := tsEnc.Bytes()
if err != nil {
return err
}
// Encoded string values
vb, err := vEnc.Bytes()
if err != nil {
return err
}
// Pack the block: the type byte first, then the varint-encoded length of
// the timestamp block, the timestamp block itself, and finally the values.
b = packBlock(buf, BlockString, tb, vb)
return nil
}()
putTimeEncoder(tsEnc)
putStringEncoder(vEnc)
return b, err
}
// DecodeStringBlock decodes the string block from the byte slice
// and appends the string values to a.
func DecodeStringBlock(block []byte, a *[]StringValue) ([]StringValue, error) {
blockType := block[0]
if blockType != BlockString {
return nil, fmt.Errorf("invalid block type: exp %d, got %d", BlockString, blockType)
}
block = block[1:]
// The block begins with the varint-encoded length of the timestamp block
tb, vb, err := unpackBlock(block)
if err != nil {
return nil, err
}
tdec := timeDecoderPool.Get(0).(*TimeDecoder)
vdec := stringDecoderPool.Get(0).(*StringDecoder)
var i int
err = func() error {
// Set up our timestamp and value decoders
tdec.Init(tb)
err = vdec.SetBytes(vb)
if err != nil {
return err
}
// Decode both a timestamp and value
for tdec.Next() && vdec.Next() {
ts := tdec.Read()
v := vdec.Read()
if i < len(*a) {
elem := &(*a)[i]
elem.unixnano = ts
elem.value = v
} else {
*a = append(*a, StringValue{ts, v})
}
i++
}
// Did timestamp decoding have an error?
err = tdec.Error()
if err != nil {
return err
}
// Did string decoding have an error?
err = vdec.Error()
if err != nil {
return err
}
return nil
}()
timeDecoderPool.Put(tdec)
stringDecoderPool.Put(vdec)
return (*a)[:i], err
}
func packBlock(buf []byte, typ byte, ts []byte, values []byte) []byte {
// We encode the length of the timestamp block using a variable byte encoding.
// This allows small byte slices to take up 1 byte while larger ones use 2 or more.
sz := 1 + binary.MaxVarintLen64 + len(ts) + len(values)
if cap(buf) < sz {
buf = make([]byte, sz)
}
b := buf[:sz]
b[0] = typ
i := binary.PutUvarint(b[1:1+binary.MaxVarintLen64], uint64(len(ts)))
i++ // account for the type byte at b[0]; i is now an offset into b
// block is <len timestamp bytes>, <ts bytes>, <value bytes>
copy(b[i:], ts)
// We don't encode the value length because we know it's the rest of the block after
// the timestamp block.
copy(b[i+len(ts):], values)
return b[:i+len(ts)+len(values)]
}
func unpackBlock(buf []byte) (ts, values []byte, err error) {
// Unpack the timestamp block length
tsLen, i := binary.Uvarint(buf)
if i <= 0 {
err = fmt.Errorf("unpackBlock: unable to read timestamp block length")
return
}
// Unpack the timestamp bytes
tsIdx := int(i) + int(tsLen)
if tsIdx > len(buf) {
err = fmt.Errorf("unpackBlock: not enough data for timestamp")
return
}
ts = buf[int(i):tsIdx]
// Unpack the value bytes
values = buf[tsIdx:]
return
}
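// For reference, the block layout produced by packBlock and consumed by
// unpackBlock is (derived from the code above):
//
//	+------+------------------+-----------------+-------------+
//	| type | uvarint len(ts)  | timestamp block | value block |
//	| 1 B  | 1-10 B           | len(ts) bytes   | remainder   |
//	+------+------------------+-----------------+-------------+
//
// The value block's length is implicit: it is whatever remains after the
// timestamp block.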
// ZigZagEncode converts an int64 to a uint64 by zig zagging negative and positive values
// across even and odd numbers, e.g. [0,-1,1,-2] becomes [0, 1, 2, 3].
func ZigZagEncode(x int64) uint64 {
return uint64(x<<1) ^ uint64(x>>63)
}
// ZigZagDecode converts a previously zigzag-encoded uint64 back to an int64.
func ZigZagDecode(v uint64) int64 {
return int64((v >> 1) ^ uint64((int64(v&1)<<63)>>63))
}
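// A quick worked example of the zig zag mapping, easy to verify by hand:
// 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, so small magnitudes of either
// sign stay small as unsigned integers.
//
//	for _, x := range []int64{0, -1, 1, -2, 2} {
//		fmt.Println(x, ZigZagEncode(x), ZigZagDecode(ZigZagEncode(x)))
//	}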
func getTimeEncoder(sz int) TimeEncoder {
x := timeEncoderPool.Get(sz).(TimeEncoder)
x.Reset()
return x
}
func putTimeEncoder(enc TimeEncoder) { timeEncoderPool.Put(enc) }
func getIntegerEncoder(sz int) IntegerEncoder {
x := integerEncoderPool.Get(sz).(IntegerEncoder)
x.Reset()
return x
}
func putIntegerEncoder(enc IntegerEncoder) { integerEncoderPool.Put(enc) }
func getFloatEncoder(sz int) *FloatEncoder {
x := floatEncoderPool.Get(sz).(*FloatEncoder)
x.Reset()
return x
}
func putFloatEncoder(enc *FloatEncoder) { floatEncoderPool.Put(enc) }
func getStringEncoder(sz int) StringEncoder {
x := stringEncoderPool.Get(sz).(StringEncoder)
x.Reset()
return x
}
func putStringEncoder(enc StringEncoder) { stringEncoderPool.Put(enc) }
func getBooleanEncoder(sz int) BooleanEncoder {
x := booleanEncoderPool.Get(sz).(BooleanEncoder)
x.Reset()
return x
}
func putBooleanEncoder(enc BooleanEncoder) { booleanEncoderPool.Put(enc) }

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,659 @@
// Generated by tmpl
// https://github.com/benbjohnson/tmpl
//
// DO NOT EDIT!
// Source: file_store.gen.go.tmpl
package tsm1
// ReadFloatBlock reads the next block as a set of float values.
func (c *KeyCursor) ReadFloatBlock(buf *[]FloatValue) ([]FloatValue, error) {
// No matching blocks to decode
if len(c.current) == 0 {
return nil, nil
}
// First block is the oldest block containing the points we're searching for.
first := c.current[0]
*buf = (*buf)[:0]
values, err := first.r.ReadFloatBlockAt(&first.entry, buf)
if err != nil {
return nil, err
}
// Remove values we already read
values = FloatValues(values).Exclude(first.readMin, first.readMax)
// Remove any tombstones
tombstones := first.r.TombstoneRange(c.key)
values = c.filterFloatValues(tombstones, values)
// Check we have remaining values.
if len(values) == 0 {
return nil, nil
}
// Only one block with this key and time range so return it
if len(c.current) == 1 {
if len(values) > 0 {
first.markRead(values[0].UnixNano(), values[len(values)-1].UnixNano())
}
return values, nil
}
// Use the current block time range as our overlapping window
minT, maxT := first.readMin, first.readMax
if len(values) > 0 {
minT, maxT = values[0].UnixNano(), values[len(values)-1].UnixNano()
}
if c.ascending {
// Blocks are ordered by generation. Later blocks may contain values that
// precede this window; if so, expand the window to include the min time
// range to ensure values are returned in ascending order.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.MinTime < minT && !cur.read() {
minT = cur.entry.MinTime
}
}
// Find first block that overlaps our window
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.OverlapsTimeRange(minT, maxT) && !cur.read() {
// Shrink our window so it's the intersection of the first overlapping block and the
// first block. We do this to minimize the region that overlaps and needs to
// be merged.
if cur.entry.MaxTime > maxT {
maxT = cur.entry.MaxTime
}
values = FloatValues(values).Include(minT, maxT)
break
}
}
// Search the remaining blocks that overlap our window and append their values so we can
// merge them.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
// Skip this block if it doesn't contain points we're looking for or they have already been read
if !cur.entry.OverlapsTimeRange(minT, maxT) || cur.read() {
cur.markRead(minT, maxT)
continue
}
tombstones := cur.r.TombstoneRange(c.key)
var a []FloatValue
v, err := cur.r.ReadFloatBlockAt(&cur.entry, &a)
if err != nil {
return nil, err
}
// Remove any tombstoned values
v = c.filterFloatValues(tombstones, v)
// Remove values we already read
v = FloatValues(v).Exclude(cur.readMin, cur.readMax)
if len(v) > 0 {
// Only use values in the overlapping window
v = FloatValues(v).Include(minT, maxT)
// Merge the remaining values with the existing
values = FloatValues(values).Merge(v)
}
cur.markRead(minT, maxT)
}
} else {
// Blocks are ordered by generation. Later blocks may contain values beyond
// this window; if so, expand the window to include the max time range to
// ensure values are returned in descending order.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.MaxTime > maxT && !cur.read() {
maxT = cur.entry.MaxTime
}
}
// Find first block that overlaps our window
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.OverlapsTimeRange(minT, maxT) && !cur.read() {
// Shrink our window so it's the intersection of the first overlapping block and the
// first block. We do this to minimize the region that overlaps and needs to
// be merged.
if cur.entry.MinTime < minT {
minT = cur.entry.MinTime
}
values = FloatValues(values).Include(minT, maxT)
break
}
}
// Search the remaining blocks that overlap our window and append their values so we can
// merge them.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
// Skip this block if it doesn't contain points we're looking for or they have already been read
if !cur.entry.OverlapsTimeRange(minT, maxT) || cur.read() {
cur.markRead(minT, maxT)
continue
}
tombstones := cur.r.TombstoneRange(c.key)
var a []FloatValue
v, err := cur.r.ReadFloatBlockAt(&cur.entry, &a)
if err != nil {
return nil, err
}
// Remove any tombstoned values
v = c.filterFloatValues(tombstones, v)
// Remove values we already read
v = FloatValues(v).Exclude(cur.readMin, cur.readMax)
// If the block we decoded should have all of its values included, mark it as read so we
// don't use it again.
if len(v) > 0 {
v = FloatValues(v).Include(minT, maxT)
// Merge the remaining values with the existing
values = FloatValues(v).Merge(values)
}
cur.markRead(minT, maxT)
}
}
first.markRead(minT, maxT)
return values, err
}
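// Worked sketch of the overlap handling above (illustrative, assumed data):
// if block A (oldest generation) covers t=[10,20] and block B covers
// t=[15,30], reading ascending starts with window [10,20]; B overlaps, so
// maxT grows to 30, B's values are decoded, tombstone-filtered, clipped to
// the window, and merged into A's, yielding a single ascending run before
// both blocks are marked read.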
// ReadIntegerBlock reads the next block as a set of integer values.
func (c *KeyCursor) ReadIntegerBlock(buf *[]IntegerValue) ([]IntegerValue, error) {
// No matching blocks to decode
if len(c.current) == 0 {
return nil, nil
}
// First block is the oldest block containing the points we're searching for.
first := c.current[0]
*buf = (*buf)[:0]
values, err := first.r.ReadIntegerBlockAt(&first.entry, buf)
if err != nil {
return nil, err
}
// Remove values we already read
values = IntegerValues(values).Exclude(first.readMin, first.readMax)
// Remove any tombstones
tombstones := first.r.TombstoneRange(c.key)
values = c.filterIntegerValues(tombstones, values)
// Check we have remaining values.
if len(values) == 0 {
return nil, nil
}
// Only one block with this key and time range so return it
if len(c.current) == 1 {
if len(values) > 0 {
first.markRead(values[0].UnixNano(), values[len(values)-1].UnixNano())
}
return values, nil
}
// Use the current block time range as our overlapping window
minT, maxT := first.readMin, first.readMax
if len(values) > 0 {
minT, maxT = values[0].UnixNano(), values[len(values)-1].UnixNano()
}
if c.ascending {
// Blocks are ordered by generation. Later blocks may contain values that
// precede this window; if so, expand the window to include the min time
// range to ensure values are returned in ascending order.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.MinTime < minT && !cur.read() {
minT = cur.entry.MinTime
}
}
// Find first block that overlaps our window
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.OverlapsTimeRange(minT, maxT) && !cur.read() {
// Shrink our window so it's the intersection of the first overlapping block and the
// first block. We do this to minimize the region that overlaps and needs to
// be merged.
if cur.entry.MaxTime > maxT {
maxT = cur.entry.MaxTime
}
values = IntegerValues(values).Include(minT, maxT)
break
}
}
// Search the remaining blocks that overlap our window and append their values so we can
// merge them.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
// Skip this block if it doesn't contain points we're looking for or they have already been read
if !cur.entry.OverlapsTimeRange(minT, maxT) || cur.read() {
cur.markRead(minT, maxT)
continue
}
tombstones := cur.r.TombstoneRange(c.key)
var a []IntegerValue
v, err := cur.r.ReadIntegerBlockAt(&cur.entry, &a)
if err != nil {
return nil, err
}
// Remove any tombstoned values
v = c.filterIntegerValues(tombstones, v)
// Remove values we already read
v = IntegerValues(v).Exclude(cur.readMin, cur.readMax)
if len(v) > 0 {
// Only use values in the overlapping window
v = IntegerValues(v).Include(minT, maxT)
// Merge the remaining values with the existing
values = IntegerValues(values).Merge(v)
}
cur.markRead(minT, maxT)
}
} else {
// Blocks are ordered by generation. Later blocks may contain values beyond
// this window; if so, expand the window to include the max time range to
// ensure values are returned in descending order.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.MaxTime > maxT && !cur.read() {
maxT = cur.entry.MaxTime
}
}
// Find first block that overlaps our window
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.OverlapsTimeRange(minT, maxT) && !cur.read() {
// Shrink our window so it's the intersection of the first overlapping block and the
// first block. We do this to minimize the region that overlaps and needs to
// be merged.
if cur.entry.MinTime < minT {
minT = cur.entry.MinTime
}
values = IntegerValues(values).Include(minT, maxT)
break
}
}
// Search the remaining blocks that overlap our window and append their values so we can
// merge them.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
// Skip this block if it doesn't contain points we're looking for or they have already been read
if !cur.entry.OverlapsTimeRange(minT, maxT) || cur.read() {
cur.markRead(minT, maxT)
continue
}
tombstones := cur.r.TombstoneRange(c.key)
var a []IntegerValue
v, err := cur.r.ReadIntegerBlockAt(&cur.entry, &a)
if err != nil {
return nil, err
}
// Remove any tombstoned values
v = c.filterIntegerValues(tombstones, v)
// Remove values we already read
v = IntegerValues(v).Exclude(cur.readMin, cur.readMax)
// If the block we decoded should have all of its values included, mark it as read so we
// don't use it again.
if len(v) > 0 {
v = IntegerValues(v).Include(minT, maxT)
// Merge the remaining values with the existing
values = IntegerValues(v).Merge(values)
}
cur.markRead(minT, maxT)
}
}
first.markRead(minT, maxT)
return values, err
}
// ReadStringBlock reads the next block as a set of string values.
func (c *KeyCursor) ReadStringBlock(buf *[]StringValue) ([]StringValue, error) {
// No matching blocks to decode
if len(c.current) == 0 {
return nil, nil
}
// First block is the oldest block containing the points we're searching for.
first := c.current[0]
*buf = (*buf)[:0]
values, err := first.r.ReadStringBlockAt(&first.entry, buf)
if err != nil {
return nil, err
}
// Remove values we already read
values = StringValues(values).Exclude(first.readMin, first.readMax)
// Remove any tombstones
tombstones := first.r.TombstoneRange(c.key)
values = c.filterStringValues(tombstones, values)
// Check we have remaining values.
if len(values) == 0 {
return nil, nil
}
// Only one block with this key and time range so return it
if len(c.current) == 1 {
if len(values) > 0 {
first.markRead(values[0].UnixNano(), values[len(values)-1].UnixNano())
}
return values, nil
}
// Use the current block time range as our overlapping window
minT, maxT := first.readMin, first.readMax
if len(values) > 0 {
minT, maxT = values[0].UnixNano(), values[len(values)-1].UnixNano()
}
if c.ascending {
// Blocks are ordered by generation. Later blocks may contain values that
// precede this window; if so, expand the window to include the min time
// range to ensure values are returned in ascending order.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.MinTime < minT && !cur.read() {
minT = cur.entry.MinTime
}
}
// Find first block that overlaps our window
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.OverlapsTimeRange(minT, maxT) && !cur.read() {
// Shrink our window so it's the intersection of the first overlapping block and the
// first block. We do this to minimize the region that overlaps and needs to
// be merged.
if cur.entry.MaxTime > maxT {
maxT = cur.entry.MaxTime
}
values = StringValues(values).Include(minT, maxT)
break
}
}
// Search the remaining blocks that overlap our window and append their values so we can
// merge them.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
// Skip this block if it doesn't contain points we're looking for or they have already been read
if !cur.entry.OverlapsTimeRange(minT, maxT) || cur.read() {
cur.markRead(minT, maxT)
continue
}
tombstones := cur.r.TombstoneRange(c.key)
var a []StringValue
v, err := cur.r.ReadStringBlockAt(&cur.entry, &a)
if err != nil {
return nil, err
}
// Remove any tombstoned values
v = c.filterStringValues(tombstones, v)
// Remove values we already read
v = StringValues(v).Exclude(cur.readMin, cur.readMax)
if len(v) > 0 {
// Only use values in the overlapping window
v = StringValues(v).Include(minT, maxT)
// Merge the remaining values with the existing
values = StringValues(values).Merge(v)
}
cur.markRead(minT, maxT)
}
} else {
// Blocks are ordered by generation. Later blocks may contain values beyond
// this window; if so, expand the window to include the max time range to
// ensure values are returned in descending order.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.MaxTime > maxT && !cur.read() {
maxT = cur.entry.MaxTime
}
}
// Find first block that overlaps our window
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.OverlapsTimeRange(minT, maxT) && !cur.read() {
// Shrink our window so it's the intersection of the first overlapping block and the
// first block. We do this to minimize the region that overlaps and needs to
// be merged.
if cur.entry.MinTime < minT {
minT = cur.entry.MinTime
}
values = StringValues(values).Include(minT, maxT)
break
}
}
// Search the remaining blocks that overlap our window and append their values so we can
// merge them.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
// Skip this block if it doesn't contain points we're looking for or they have already been read
if !cur.entry.OverlapsTimeRange(minT, maxT) || cur.read() {
cur.markRead(minT, maxT)
continue
}
tombstones := cur.r.TombstoneRange(c.key)
var a []StringValue
v, err := cur.r.ReadStringBlockAt(&cur.entry, &a)
if err != nil {
return nil, err
}
// Remove any tombstoned values
v = c.filterStringValues(tombstones, v)
// Remove values we already read
v = StringValues(v).Exclude(cur.readMin, cur.readMax)
// If the block we decoded should have all of its values included, mark it as read so we
// don't use it again.
if len(v) > 0 {
v = StringValues(v).Include(minT, maxT)
// Merge the remaining values with the existing
values = StringValues(v).Merge(values)
}
cur.markRead(minT, maxT)
}
}
first.markRead(minT, maxT)
return values, err
}
// ReadBooleanBlock reads the next block as a set of boolean values.
func (c *KeyCursor) ReadBooleanBlock(buf *[]BooleanValue) ([]BooleanValue, error) {
// No matching blocks to decode
if len(c.current) == 0 {
return nil, nil
}
// First block is the oldest block containing the points we're searching for.
first := c.current[0]
*buf = (*buf)[:0]
values, err := first.r.ReadBooleanBlockAt(&first.entry, buf)
if err != nil {
return nil, err
}
// Remove values we already read
values = BooleanValues(values).Exclude(first.readMin, first.readMax)
// Remove any tombstones
tombstones := first.r.TombstoneRange(c.key)
values = c.filterBooleanValues(tombstones, values)
// Check we have remaining values.
if len(values) == 0 {
return nil, nil
}
// Only one block with this key and time range so return it
if len(c.current) == 1 {
if len(values) > 0 {
first.markRead(values[0].UnixNano(), values[len(values)-1].UnixNano())
}
return values, nil
}
// Use the current block time range as our overlapping window
minT, maxT := first.readMin, first.readMax
if len(values) > 0 {
minT, maxT = values[0].UnixNano(), values[len(values)-1].UnixNano()
}
if c.ascending {
// Blocks are ordered by generation. Later blocks may contain values that
// precede this window; if so, expand the window to include the min time
// range to ensure values are returned in ascending order.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.MinTime < minT && !cur.read() {
minT = cur.entry.MinTime
}
}
// Find first block that overlaps our window
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.OverlapsTimeRange(minT, maxT) && !cur.read() {
// Shrink our window so it's the intersection of the first overlapping block and the
// first block. We do this to minimize the region that overlaps and needs to
// be merged.
if cur.entry.MaxTime > maxT {
maxT = cur.entry.MaxTime
}
values = BooleanValues(values).Include(minT, maxT)
break
}
}
// Search the remaining blocks that overlap our window and append their values so we can
// merge them.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
// Skip this block if it doesn't contain points we're looking for or they have already been read
if !cur.entry.OverlapsTimeRange(minT, maxT) || cur.read() {
cur.markRead(minT, maxT)
continue
}
tombstones := cur.r.TombstoneRange(c.key)
var a []BooleanValue
v, err := cur.r.ReadBooleanBlockAt(&cur.entry, &a)
if err != nil {
return nil, err
}
// Remove any tombstoned values
v = c.filterBooleanValues(tombstones, v)
// Remove values we already read
v = BooleanValues(v).Exclude(cur.readMin, cur.readMax)
if len(v) > 0 {
// Only use values in the overlapping window
v = BooleanValues(v).Include(minT, maxT)
// Merge the remaining values with the existing
values = BooleanValues(values).Merge(v)
}
cur.markRead(minT, maxT)
}
} else {
// Blocks are ordered by generation. Later blocks may contain values beyond
// this window; if so, expand the window to include the max time range to
// ensure values are returned in descending order.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.MaxTime > maxT && !cur.read() {
maxT = cur.entry.MaxTime
}
}
// Find first block that overlaps our window
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.OverlapsTimeRange(minT, maxT) && !cur.read() {
// Shrink our window so it's the intersection of the first overlapping block and the
// first block. We do this to minimize the region that overlaps and needs to
// be merged.
if cur.entry.MinTime < minT {
minT = cur.entry.MinTime
}
values = BooleanValues(values).Include(minT, maxT)
break
}
}
// Search the remaining blocks that overlap our window and append their values so we can
// merge them.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
// Skip this block if it doesn't contain points we're looking for or they have already been read
if !cur.entry.OverlapsTimeRange(minT, maxT) || cur.read() {
cur.markRead(minT, maxT)
continue
}
tombstones := cur.r.TombstoneRange(c.key)
var a []BooleanValue
v, err := cur.r.ReadBooleanBlockAt(&cur.entry, &a)
if err != nil {
return nil, err
}
// Remove any tombstoned values
v = c.filterBooleanValues(tombstones, v)
// Remove values we already read
v = BooleanValues(v).Exclude(cur.readMin, cur.readMax)
// If the block we decoded should have all of its values included, mark it as read so we
// don't use it again.
if len(v) > 0 {
v = BooleanValues(v).Include(minT, maxT)
// Merge the remaining values with the existing
values = BooleanValues(v).Merge(values)
}
cur.markRead(minT, maxT)
}
}
first.markRead(minT, maxT)
return values, err
}

View File

@@ -0,0 +1,168 @@
package tsm1
{{range .}}
// Read{{.Name}}Block reads the next block as a set of {{.name}} values.
func (c *KeyCursor) Read{{.Name}}Block(buf *[]{{.Name}}Value) ([]{{.Name}}Value, error) {
// No matching blocks to decode
if len(c.current) == 0 {
return nil, nil
}
// First block is the oldest block containing the points we're searching for.
first := c.current[0]
*buf = (*buf)[:0]
values, err := first.r.Read{{.Name}}BlockAt(&first.entry, buf)
if err != nil {
return nil, err
}
// Remove values we already read
values = {{.Name}}Values(values).Exclude(first.readMin, first.readMax)
// Remove any tombstones
tombstones := first.r.TombstoneRange(c.key)
values = c.filter{{.Name}}Values(tombstones, values)
// Check we have remaining values.
if len(values) == 0 {
return nil, nil
}
// Only one block with this key and time range so return it
if len(c.current) == 1 {
if len(values) > 0 {
first.markRead(values[0].UnixNano(), values[len(values)-1].UnixNano())
}
return values, nil
}
// Use the current block time range as our overlapping window
minT, maxT := first.readMin, first.readMax
if len(values) > 0 {
minT, maxT = values[0].UnixNano(), values[len(values)-1].UnixNano()
}
if c.ascending {
// Blocks are ordered by generation. Later blocks may contain values that
// precede this window; if so, expand the window to include the min time
// range to ensure values are returned in ascending order.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.MinTime < minT && !cur.read() {
minT = cur.entry.MinTime
}
}
// Find first block that overlaps our window
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.OverlapsTimeRange(minT, maxT) && !cur.read() {
// Shrink our window so it's the intersection of the first overlapping block and the
// first block. We do this to minimize the region that overlaps and needs to
// be merged.
if cur.entry.MaxTime > maxT {
maxT = cur.entry.MaxTime
}
values = {{.Name}}Values(values).Include(minT, maxT)
break
}
}
// Search the remaining blocks that overlap our window and append their values so we can
// merge them.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
// Skip this block if it doesn't contain points we're looking for or they have already been read
if !cur.entry.OverlapsTimeRange(minT, maxT) || cur.read() {
cur.markRead(minT, maxT)
continue
}
tombstones := cur.r.TombstoneRange(c.key)
var a []{{.Name}}Value
v, err := cur.r.Read{{.Name}}BlockAt(&cur.entry, &a)
if err != nil {
return nil, err
}
// Remove any tombstoned values
v = c.filter{{.Name}}Values(tombstones, v)
// Remove values we already read
v = {{.Name}}Values(v).Exclude(cur.readMin, cur.readMax)
if len(v) > 0 {
// Only use values in the overlapping window
v = {{.Name}}Values(v).Include(minT, maxT)
// Merge the remaining values with the existing
values = {{.Name}}Values(values).Merge(v)
}
cur.markRead(minT, maxT)
}
} else {
// Blocks are ordered by generation. Later blocks may contain values beyond
// this window; if so, expand the window to include the max time range to
// ensure values are returned in descending order.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.MaxTime > maxT && !cur.read() {
maxT = cur.entry.MaxTime
}
}
// Find first block that overlaps our window
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
if cur.entry.OverlapsTimeRange(minT, maxT) && !cur.read() {
// Shrink our window so it's the intersection of the first overlapping block and the
// first block. We do this to minimize the region that overlaps and needs to
// be merged.
if cur.entry.MinTime < minT {
minT = cur.entry.MinTime
}
values = {{.Name}}Values(values).Include(minT, maxT)
break
}
}
// Search the remaining blocks that overlap our window and append their values so we can
// merge them.
for i := 1; i < len(c.current); i++ {
cur := c.current[i]
// Skip this block if it doesn't contain points we're looking for or they have already been read
if !cur.entry.OverlapsTimeRange(minT, maxT) || cur.read() {
cur.markRead(minT, maxT)
continue
}
tombstones := cur.r.TombstoneRange(c.key)
var a []{{.Name}}Value
v, err := cur.r.Read{{.Name}}BlockAt(&cur.entry, &a)
if err != nil {
return nil, err
}
// Remove any tombstoned values
v = c.filter{{.Name}}Values(tombstones, v)
// Remove values we already read
v = {{.Name}}Values(v).Exclude(cur.readMin, cur.readMax)
// If the block we decoded should have all of its values included, mark it as read so we
// don't use it again.
if len(v) > 0 {
v = {{.Name}}Values(v).Include(minT, maxT)
// Merge the remaining values with the existing
values = {{.Name}}Values(v).Merge(values)
}
cur.markRead(minT, maxT)
}
}
first.markRead(minT, maxT)
return values, err
}
{{ end }}

View File

@@ -0,0 +1,18 @@
[
{
"Name":"Float",
"name":"float"
},
{
"Name":"Integer",
"name":"integer"
},
{
"Name":"String",
"name":"string"
},
{
"Name":"Boolean",
"name":"boolean"
}
]

File diff suppressed because it is too large

View File

@@ -0,0 +1,249 @@
package tsm1
import (
"bytes"
"fmt"
"testing"
)
func TestMergeSeriesKey_Single(t *testing.T) {
a := make(chan seriesKey, 5)
for i := 0; i < cap(a); i++ {
a <- seriesKey{key: []byte(fmt.Sprintf("%d", i))}
}
merged := merge(a)
close(a)
exp := []string{"0", "1", "2", "3", "4"}
for v := range merged {
if got, exp := v, exp[0]; !bytes.Equal(got.key, []byte(exp)) {
t.Fatalf("value mismatch: got %v, exp %v", got, exp)
}
exp = exp[1:]
}
if len(exp) > 0 {
t.Fatalf("missed values: %v", exp)
}
}
func TestMergeSeriesKey_Nil(t *testing.T) {
merged := merge(nil)
for v := range merged {
t.Fatalf("value mismatch: got %v, exp nil", v)
}
merged = merge(nil, nil)
for v := range merged {
t.Fatalf("value mismatch: got %v, exp nil", v)
}
}
func TestMergeSeriesKey_Duplicates(t *testing.T) {
a := make(chan seriesKey, 5)
b := make(chan seriesKey, 5)
for i := 0; i < cap(a); i++ {
a <- seriesKey{key: []byte(fmt.Sprintf("%d", i))}
b <- seriesKey{key: []byte(fmt.Sprintf("%d", i))}
}
merged := merge(a, b)
close(a)
close(b)
exp := []string{"0", "1", "2", "3", "4"}
for v := range merged {
if len(exp) == 0 {
t.Fatalf("more values than expected: got %v", v)
}
if got, exp := v, exp[0]; !bytes.Equal(got.key, []byte(exp)) {
t.Fatalf("value mismatch: got %v, exp %v", got, exp)
}
exp = exp[1:]
}
if len(exp) > 0 {
t.Fatalf("missed values: %v", exp)
}
}
func TestMergeSeriesKey_Alternating(t *testing.T) {
a := make(chan seriesKey, 2)
b := make(chan seriesKey, 2)
for i := 0; i < cap(a); i++ {
a <- seriesKey{key: []byte(fmt.Sprintf("%d", i*2))}
b <- seriesKey{key: []byte(fmt.Sprintf("%d", i*2+1))}
}
merged := merge(a, b)
close(a)
close(b)
exp := []string{"0", "1", "2", "3"}
for v := range merged {
if len(exp) == 0 {
t.Fatalf("more values than expected: got %v", v)
}
if got, exp := v, exp[0]; !bytes.Equal(got.key, []byte(exp)) {
t.Fatalf("value mismatch: got %v, exp %v", string(got.key), exp)
}
exp = exp[1:]
}
if len(exp) > 0 {
t.Fatalf("missed values: %v", exp)
}
}
func TestMergeSeriesKey_AlternatingDuplicates(t *testing.T) {
a := make(chan seriesKey, 2)
b := make(chan seriesKey, 2)
c := make(chan seriesKey, 2)
for i := 0; i < cap(a); i++ {
a <- seriesKey{key: []byte(fmt.Sprintf("%d", i*2))}
b <- seriesKey{key: []byte(fmt.Sprintf("%d", i*2+1))}
c <- seriesKey{key: []byte(fmt.Sprintf("%d", i*2))}
}
merged := merge(a, b, c)
close(a)
close(b)
close(c)
exp := []string{"0", "1", "2", "3"}
for v := range merged {
if len(exp) == 0 {
t.Fatalf("more values than expected: got %v", v)
}
if got, exp := v, exp[0]; !bytes.Equal(got.key, []byte(exp)) {
t.Fatalf("value mismatch: got %v, exp %v", string(got.key), exp)
}
exp = exp[1:]
}
if len(exp) > 0 {
t.Fatalf("missed values: %v", exp)
}
}
func TestMergeSeriesKey_Unbuffered(t *testing.T) {
a := make(chan seriesKey)
b := make(chan seriesKey)
go func() {
for i := 0; i < 2; i++ {
a <- seriesKey{key: []byte(fmt.Sprintf("%d", i*2))}
}
close(a)
}()
go func() {
for i := 0; i < 2; i++ {
b <- seriesKey{key: []byte(fmt.Sprintf("%d", i*2+1))}
}
close(b)
}()
merged := merge(a, b)
exp := []string{"0", "1", "2", "3"}
for v := range merged {
if len(exp) == 0 {
t.Fatalf("more values than expected: got %v", v)
}
if got, exp := v, exp[0]; !bytes.Equal(got.key, []byte(exp)) {
t.Fatalf("value mismatch: got %v, exp %v", string(got.key), exp)
}
exp = exp[1:]
}
if len(exp) > 0 {
t.Fatalf("missed values: %v", exp)
}
}
func TestMergeSeriesKey_OneEmpty(t *testing.T) {
a := make(chan seriesKey)
b := make(chan seriesKey)
go func() {
for i := 0; i < 2; i++ {
a <- seriesKey{key: []byte(fmt.Sprintf("%d", i*2))}
}
close(a)
}()
close(b)
merged := merge(a, b)
exp := []string{"0", "2"}
for v := range merged {
if len(exp) == 0 {
t.Fatalf("more values than expected: got %v", v)
}
if got, exp := v, exp[0]; !bytes.Equal(got.key, []byte(exp)) {
t.Fatalf("value mismatch: got %v, exp %v", got, exp)
}
exp = exp[1:]
}
if len(exp) > 0 {
t.Fatalf("missed values: %v", exp)
}
}
func TestMergeSeriesKey_Overlapping(t *testing.T) {
a := make(chan seriesKey)
b := make(chan seriesKey)
c := make(chan seriesKey)
go func() {
for i := 0; i < 3; i++ {
a <- seriesKey{key: []byte(fmt.Sprintf("%d", i))}
}
close(a)
}()
go func() {
for i := 4; i < 7; i++ {
b <- seriesKey{key: []byte(fmt.Sprintf("%d", i))}
}
close(b)
}()
go func() {
for i := 0; i < 9; i++ {
c <- seriesKey{key: []byte(fmt.Sprintf("%d", i))}
}
close(c)
}()
merged := merge(a, b, c)
exp := []string{"0", "1", "2", "3", "4", "5", "6", "7", "8"}
for v := range merged {
if len(exp) == 0 {
t.Fatalf("more values than expected: got %v", v)
}
if got, exp := v, exp[0]; !bytes.Equal(got.key, []byte(exp)) {
t.Fatalf("value mismatch: got %v, exp %v", string(got.key), exp)
}
exp = exp[1:]
}
if len(exp) > 0 {
t.Fatalf("missed values: %v", exp)
}
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,20 @@
// +build !windows
package tsm1
import "os"
func syncDir(dirName string) error {
// fsync the dir to flush the rename
dir, err := os.OpenFile(dirName, os.O_RDONLY, os.ModeDir)
if err != nil {
return err
}
defer dir.Close()
return dir.Sync()
}
// renameFile renames oldpath to newpath using os.Rename.
func renameFile(oldpath, newpath string) error {
return os.Rename(oldpath, newpath)
}

View File

@@ -0,0 +1,18 @@
package tsm1
import "os"
func syncDir(dirName string) error {
return nil
}
// renameFile renames oldpath to newpath using os.Rename. If newpath exists it will be removed before renaming.
func renameFile(oldpath, newpath string) error {
if _, err := os.Stat(newpath); err == nil {
if err = os.Remove(newpath); err != nil {
return err
}
}
return os.Rename(oldpath, newpath)
}
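// Design note (an assumption inferred from the remove-then-rename above):
// os.Rename on Windows has historically failed when the target exists, so
// this variant removes the target first. The stat/remove/rename sequence is
// not atomic, so a crash in between can leave the target deleted before the
// source is renamed.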

View File

@@ -0,0 +1,285 @@
package tsm1
/*
This code is originally from: https://github.com/dgryski/go-tsz and has been modified to remove
the timestamp compression functionality.
It implements the float compression as presented in: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf.
This implementation uses a sentinel value of NaN which means that float64 NaN cannot be stored using
this version.
*/
import (
"bytes"
"fmt"
"math"
"github.com/dgryski/go-bits"
"github.com/dgryski/go-bitstream"
)
const (
// floatUncompressed is an uncompressed format using 8 bytes per value.
// Not yet implemented.
floatUncompressed = 0
// floatCompressedGorilla is a compressed format using the gorilla paper encoding
floatCompressedGorilla = 1
)
// uvnan is the constant returned from math.NaN().
const uvnan = 0x7FF8000000000001
// FloatEncoder encodes multiple float64s into a byte slice.
type FloatEncoder struct {
val float64
err error
leading uint64
trailing uint64
buf bytes.Buffer
bw *bitstream.BitWriter
first bool
finished bool
}
// NewFloatEncoder returns a new FloatEncoder.
func NewFloatEncoder() *FloatEncoder {
s := FloatEncoder{
first: true,
leading: ^uint64(0),
}
s.bw = bitstream.NewWriter(&s.buf)
s.buf.WriteByte(floatCompressedGorilla << 4)
return &s
}
// Reset sets the encoder back to its initial state.
func (s *FloatEncoder) Reset() {
s.val = 0
s.err = nil
s.leading = ^uint64(0)
s.trailing = 0
s.buf.Reset()
s.buf.WriteByte(floatCompressedGorilla << 4)
s.bw.Resume(0x0, 8)
s.finished = false
s.first = true
}
// Bytes returns a copy of the underlying byte buffer used in the encoder.
func (s *FloatEncoder) Bytes() ([]byte, error) {
return s.buf.Bytes(), s.err
}
// Flush indicates there are no more values to encode.
func (s *FloatEncoder) Flush() {
if !s.finished {
// write an end-of-stream record
s.finished = true
s.Write(math.NaN())
s.bw.Flush(bitstream.Zero)
}
}
// Write encodes v to the underlying buffer.
func (s *FloatEncoder) Write(v float64) {
// Only allow NaN as a sentinel value
if math.IsNaN(v) && !s.finished {
s.err = fmt.Errorf("unsupported value: NaN")
return
}
if s.first {
// first point
s.val = v
s.first = false
s.bw.WriteBits(math.Float64bits(v), 64)
return
}
vDelta := math.Float64bits(v) ^ math.Float64bits(s.val)
if vDelta == 0 {
s.bw.WriteBit(bitstream.Zero)
} else {
s.bw.WriteBit(bitstream.One)
leading := bits.Clz(vDelta)
trailing := bits.Ctz(vDelta)
// Clamp number of leading zeros to avoid overflow when encoding
leading &= 0x1F
if leading >= 32 {
leading = 31
}
// TODO(dgryski): check if it's 'cheaper' to reset the leading/trailing bits instead
if s.leading != ^uint64(0) && leading >= s.leading && trailing >= s.trailing {
s.bw.WriteBit(bitstream.Zero)
s.bw.WriteBits(vDelta>>s.trailing, 64-int(s.leading)-int(s.trailing))
} else {
s.leading, s.trailing = leading, trailing
s.bw.WriteBit(bitstream.One)
s.bw.WriteBits(leading, 5)
// Note that if leading == trailing == 0, then sigbits == 64. But that
// value doesn't actually fit into the 6 bits we have.
// Luckily, we never need to encode 0 significant bits, since that would
// put us in the other case (vdelta == 0). So instead we write out a 0 and
// adjust it back to 64 on unpacking.
sigbits := 64 - leading - trailing
s.bw.WriteBits(sigbits, 6)
s.bw.WriteBits(vDelta>>trailing, int(sigbits))
}
}
s.val = v
}
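// Worked bit-level sketch, hand-checked against the code above: writing 12
// then 24 first emits the raw 64-bit pattern of 12.0. For 24, the XOR with
// 12 is 0x0010000000000000 (11 leading and 52 trailing zero bits), so the
// stream gets control bits 1,1 (changed value, new window), then 01011
// (leading=11 in 5 bits), 000001 (sigbits=1 in 6 bits), and the single
// significant bit 1.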
// FloatDecoder decodes a byte slice into multiple float64 values.
type FloatDecoder struct {
val uint64
leading uint64
trailing uint64
br BitReader
b []byte
first bool
finished bool
err error
}
// SetBytes initializes the decoder with b. Must call before calling Next().
func (it *FloatDecoder) SetBytes(b []byte) error {
var v uint64
if len(b) == 0 {
v = uvnan
} else {
// The first byte is the compression type; currently only gorilla
// compression is implemented.
it.br.Reset(b[1:])
var err error
v, err = it.br.ReadBits(64)
if err != nil {
return err
}
}
// Reset all fields.
it.val = v
it.leading = 0
it.trailing = 0
it.b = b
it.first = true
it.finished = false
it.err = nil
return nil
}
// Next returns true if there are remaining values to read.
func (it *FloatDecoder) Next() bool {
if it.err != nil || it.finished {
return false
}
if it.first {
it.first = false
// mark as finished if there were no values.
if it.val == uvnan { // IsNaN
it.finished = true
return false
}
return true
}
// read compressed value
var bit bool
if it.br.CanReadBitFast() {
bit = it.br.ReadBitFast()
} else if v, err := it.br.ReadBit(); err != nil {
it.err = err
return false
} else {
bit = v
}
if !bit {
// it.val = it.val
} else {
var bit bool
if it.br.CanReadBitFast() {
bit = it.br.ReadBitFast()
} else if v, err := it.br.ReadBit(); err != nil {
it.err = err
return false
} else {
bit = v
}
if !bit {
// reuse leading/trailing zero bits
// it.leading, it.trailing = it.leading, it.trailing
} else {
bits, err := it.br.ReadBits(5)
if err != nil {
it.err = err
return false
}
it.leading = bits
bits, err = it.br.ReadBits(6)
if err != nil {
it.err = err
return false
}
mbits := bits
// 0 significant bits here means we overflowed and we actually need 64; see comment in encoder
if mbits == 0 {
mbits = 64
}
it.trailing = 64 - it.leading - mbits
}
mbits := uint(64 - it.leading - it.trailing)
bits, err := it.br.ReadBits(mbits)
if err != nil {
it.err = err
return false
}
vbits := it.val
vbits ^= (bits << it.trailing)
if vbits == uvnan { // IsNaN
it.finished = true
return false
}
it.val = vbits
}
return true
}
// Values returns the current float64 value.
func (it *FloatDecoder) Values() float64 {
return math.Float64frombits(it.val)
}
// Error returns the current decoding error.
func (it *FloatDecoder) Error() error {
return it.err
}

View File

@@ -0,0 +1,286 @@
package tsm1_test
import (
"math"
"reflect"
"testing"
"testing/quick"
"github.com/influxdata/influxdb/tsdb/engine/tsm1"
)
func TestFloatEncoder_Simple(t *testing.T) {
// Example from the paper
s := tsm1.NewFloatEncoder()
s.Write(12)
s.Write(12)
s.Write(24)
// extra tests
// floating point masking/shifting bug
s.Write(13)
s.Write(24)
// delta-of-delta sizes
s.Write(24)
s.Write(24)
s.Write(24)
s.Flush()
b, err := s.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var it tsm1.FloatDecoder
if err := it.SetBytes(b); err != nil {
t.Fatalf("unexpected error creating float decoder: %v", err)
}
want := []float64{
12,
12,
24,
13,
24,
24,
24,
24,
}
for _, w := range want {
if !it.Next() {
t.Fatalf("Next()=false, want true")
}
vv := it.Values()
if w != vv {
t.Errorf("Values()=(%v), want (%v)\n", vv, w)
}
}
if it.Next() {
t.Fatalf("Next()=true, want false")
}
if err := it.Error(); err != nil {
t.Errorf("it.Error()=%v, want nil", err)
}
}
func TestFloatEncoder_SimilarFloats(t *testing.T) {
s := tsm1.NewFloatEncoder()
want := []float64{
6.00065e+06,
6.000656e+06,
6.000657e+06,
6.000659e+06,
6.000661e+06,
}
for _, v := range want {
s.Write(v)
}
s.Flush()
b, err := s.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var it tsm1.FloatDecoder
if err := it.SetBytes(b); err != nil {
t.Fatalf("unexpected error creating float decoder: %v", err)
}
for _, w := range want {
if !it.Next() {
t.Fatalf("Next()=false, want true")
}
vv := it.Values()
if w != vv {
t.Errorf("Values()=(%v), want (%v)\n", vv, w)
}
}
if it.Next() {
t.Fatalf("Next()=true, want false")
}
if err := it.Error(); err != nil {
t.Errorf("it.Error()=%v, want nil", err)
}
}
var TwoHoursData = []struct {
v float64
}{
// 2h of data
{761}, {727}, {763}, {706}, {700},
{679}, {757}, {708}, {739}, {707},
{699}, {740}, {729}, {766}, {730},
{715}, {705}, {693}, {765}, {724},
{799}, {761}, {737}, {766}, {756},
{719}, {722}, {801}, {747}, {731},
{742}, {744}, {791}, {750}, {759},
{809}, {751}, {705}, {770}, {792},
{727}, {762}, {772}, {721}, {748},
{753}, {744}, {716}, {776}, {659},
{789}, {766}, {758}, {690}, {795},
{770}, {758}, {723}, {767}, {765},
{693}, {706}, {681}, {727}, {724},
{780}, {678}, {696}, {758}, {740},
{735}, {700}, {742}, {747}, {752},
{734}, {743}, {732}, {746}, {770},
{780}, {710}, {731}, {712}, {712},
{741}, {770}, {770}, {754}, {718},
{670}, {775}, {749}, {795}, {756},
{741}, {787}, {721}, {745}, {782},
{765}, {780}, {811}, {790}, {836},
{743}, {858}, {739}, {762}, {770},
{752}, {763}, {795}, {792}, {746},
{786}, {785}, {774}, {786}, {718},
}
func TestFloatEncoder_Roundtrip(t *testing.T) {
s := tsm1.NewFloatEncoder()
for _, p := range TwoHoursData {
s.Write(p.v)
}
s.Flush()
b, err := s.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var it tsm1.FloatDecoder
if err := it.SetBytes(b); err != nil {
t.Fatalf("unexpected error creating float decoder: %v", err)
}
for _, w := range TwoHoursData {
if !it.Next() {
t.Fatalf("Next()=false, want true")
}
vv := it.Values()
// t.Logf("it.Values()=(%+v, %+v)\n", time.Unix(int64(tt), 0), vv)
if w.v != vv {
t.Errorf("Values()=(%v), want (%v)\n", vv, w.v)
}
}
if it.Next() {
t.Fatalf("Next()=true, want false")
}
if err := it.Error(); err != nil {
t.Errorf("it.Error()=%v, want nil", err)
}
}
func TestFloatEncoder_Roundtrip_NaN(t *testing.T) {
s := tsm1.NewFloatEncoder()
s.Write(1.0)
s.Write(math.NaN())
s.Write(2.0)
s.Flush()
_, err := s.Bytes()
if err == nil {
t.Fatalf("expected error. got nil")
}
}
func Test_FloatEncoder_Quick(t *testing.T) {
quick.Check(func(values []float64) bool {
expected := values
if values == nil {
expected = []float64{}
}
// Write values to encoder.
enc := tsm1.NewFloatEncoder()
for _, v := range values {
enc.Write(v)
}
enc.Flush()
// Read values out of decoder.
got := make([]float64, 0, len(values))
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var dec tsm1.FloatDecoder
if err := dec.SetBytes(b); err != nil {
t.Fatal(err)
}
for dec.Next() {
got = append(got, dec.Values())
}
// Verify that input and output values match.
if !reflect.DeepEqual(expected, got) {
t.Fatalf("mismatch:\n\nexp=%#v\n\ngot=%#v\n\n", expected, got)
}
return true
}, nil)
}
func TestFloatDecoder_Empty(t *testing.T) {
var dec tsm1.FloatDecoder
if err := dec.SetBytes([]byte{}); err != nil {
t.Fatalf("unexpected error: %v", err)
}
if dec.Next() {
t.Fatalf("exp next == false, got true")
}
}
func BenchmarkFloatEncoder(b *testing.B) {
for i := 0; i < b.N; i++ {
s := tsm1.NewFloatEncoder()
for _, tt := range TwoHoursData {
s.Write(tt.v)
}
s.Flush()
}
}
func BenchmarkFloatDecoder(b *testing.B) {
s := tsm1.NewFloatEncoder()
for _, tt := range TwoHoursData {
s.Write(tt.v)
}
s.Flush()
bytes, err := s.Bytes()
if err != nil {
b.Fatalf("unexpected error: %v", err)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
var it tsm1.FloatDecoder
if err := it.SetBytes(bytes); err != nil {
b.Fatalf("unexpected error creating float decoder: %v", err)
}
for j := 0; j < len(TwoHoursData); it.Next() {
j++
}
}
}

View File

@@ -0,0 +1,324 @@
package tsm1
// Integer encoding uses two different strategies depending on the range of values in
// the uncompressed data. Values are first encoded using zig zag encoding.
// This interleaves positive and negative integers across a range of positive integers.
//
// For example, [-2,-1,0,1] becomes [3,1,0,2]. See
// https://developers.google.com/protocol-buffers/docs/encoding?hl=en#signed-integers
// for more information.
//
// If all the zig zag encoded values are less than 1 << 60 - 1, they are compressed using
// simple8b encoding. If any value is larger than 1 << 60 - 1, the values are stored uncompressed.
//
// Each encoded byte slice contains a 1 byte header followed by multiple 8 byte packed integers
// or 8 byte uncompressed integers. The 4 high bits of the first byte indicate the encoding type
// for the remaining bytes.
//
// There are currently three encoding types that can be used with room for 16 total. These additional
// encoding slots are reserved for future use. One improvement to be made is to use a patched
// encoding such as PFOR if only a small number of values exceed the max compressed value range. This
// should improve compression ratios with very large integers near the ends of the int64 range.
import (
"encoding/binary"
"fmt"
"github.com/jwilder/encoding/simple8b"
)
const (
// intUncompressed is an uncompressed format using 8 bytes per point
intUncompressed = 0
// intCompressedSimple is a bit-packed format using simple8b encoding
intCompressedSimple = 1
// intCompressedRLE is a run-length encoding format
intCompressedRLE = 2
)
// IntegerEncoder encodes int64s into byte slices.
type IntegerEncoder struct {
prev int64
rle bool
values []uint64
}
// NewIntegerEncoder returns a new integer encoder with an initial buffer of values sized at sz.
func NewIntegerEncoder(sz int) IntegerEncoder {
return IntegerEncoder{
rle: true,
values: make([]uint64, 0, sz),
}
}
// Flush is a no-op.
func (e *IntegerEncoder) Flush() {}
// Reset sets the encoder back to its initial state.
func (e *IntegerEncoder) Reset() {
e.prev = 0
e.rle = true
e.values = e.values[:0]
}
// Write encodes v to the underlying buffers.
func (e *IntegerEncoder) Write(v int64) {
// Delta-encode each value as it's written. This happens before
// ZigZagEncoding because the deltas could be negative.
delta := v - e.prev
e.prev = v
enc := ZigZagEncode(delta)
if len(e.values) > 1 {
e.rle = e.rle && e.values[len(e.values)-1] == enc
}
e.values = append(e.values, enc)
}
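// For example (a small worked sketch): writing 10, 12, 14 produces the deltas
// 10, 2, 2, which zig zag encode to 20, 4, 4. Because the trailing encoded
// deltas repeat, the block remains eligible for RLE when Bytes is called.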
// Bytes returns a copy of the underlying buffer.
func (e *IntegerEncoder) Bytes() ([]byte, error) {
// Only run-length encode if it could reduce storage size.
if e.rle && len(e.values) > 2 {
return e.encodeRLE()
}
for _, v := range e.values {
// Value is too large to encode using packed format
if v > simple8b.MaxValue {
return e.encodeUncompressed()
}
}
return e.encodePacked()
}
func (e *IntegerEncoder) encodeRLE() ([]byte, error) {
// Large varints can take up to 10 bytes. We size the buffer for 3 of them
// plus a 1 byte type header (the first value is actually stored as a fixed 8 bytes).
var b [31]byte
// 4 high bits used for the encoding type
b[0] = byte(intCompressedRLE) << 4
i := 1
// The first value
binary.BigEndian.PutUint64(b[i:], e.values[0])
i += 8
// The first delta
i += binary.PutUvarint(b[i:], e.values[1])
// The number of times the delta is repeated
i += binary.PutUvarint(b[i:], uint64(len(e.values)-1))
return b[:i], nil
}
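// The resulting layout, as a sketch:
//
//	1 byte type header | first value (8 bytes) | delta (uvarint) | repeat count (uvarint)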
func (e *IntegerEncoder) encodePacked() ([]byte, error) {
if len(e.values) == 0 {
return nil, nil
}
// Encode all but the first value. The first value is written unencoded
// using 8 bytes.
encoded, err := simple8b.EncodeAll(e.values[1:])
if err != nil {
return nil, err
}
b := make([]byte, 1+(len(encoded)+1)*8)
// 4 high bits of first byte store the encoding type for the block
b[0] = byte(intCompressedSimple) << 4
// Write the first value since it's not part of the encoded values
binary.BigEndian.PutUint64(b[1:9], e.values[0])
// Write the encoded values
for i, v := range encoded {
binary.BigEndian.PutUint64(b[9+i*8:9+i*8+8], v)
}
return b, nil
}
func (e *IntegerEncoder) encodeUncompressed() ([]byte, error) {
if len(e.values) == 0 {
return nil, nil
}
b := make([]byte, 1+len(e.values)*8)
// 4 high bits of first byte store the encoding type for the block
b[0] = byte(intUncompressed) << 4
for i, v := range e.values {
binary.BigEndian.PutUint64(b[1+i*8:1+i*8+8], v)
}
return b, nil
}
// IntegerDecoder decodes a byte slice into int64s.
type IntegerDecoder struct {
// 240 is the maximum number of values that can be encoded into a single uint64 using simple8b
values [240]uint64
bytes []byte
i int
n int
prev int64
first bool
// The first value for a run-length encoded byte slice
rleFirst uint64
// The delta value for a run-length encoded byte slice
rleDelta uint64
encoding byte
err error
}
// SetBytes sets the underlying byte slice of the decoder.
func (d *IntegerDecoder) SetBytes(b []byte) {
if len(b) > 0 {
d.encoding = b[0] >> 4
d.bytes = b[1:]
} else {
d.encoding = 0
d.bytes = nil
}
d.i = 0
d.n = 0
d.prev = 0
d.first = true
d.rleFirst = 0
d.rleDelta = 0
d.err = nil
}
// Next returns true if there are any values remaining to be decoded.
func (d *IntegerDecoder) Next() bool {
if d.i >= d.n && len(d.bytes) == 0 {
return false
}
d.i++
if d.i >= d.n {
switch d.encoding {
case intUncompressed:
d.decodeUncompressed()
case intCompressedSimple:
d.decodePacked()
case intCompressedRLE:
d.decodeRLE()
default:
d.err = fmt.Errorf("unknown encoding %v", d.encoding)
}
}
return d.err == nil && d.i < d.n
}
// Error returns the last error encountered by the decoder.
func (d *IntegerDecoder) Error() error {
return d.err
}
// Read returns the next value from the decoder.
func (d *IntegerDecoder) Read() int64 {
switch d.encoding {
case intCompressedRLE:
return ZigZagDecode(d.rleFirst) + int64(d.i)*ZigZagDecode(d.rleDelta)
default:
v := ZigZagDecode(d.values[d.i])
// v is the delta encoded value, we need to add the prior value to get the original
v = v + d.prev
d.prev = v
return v
}
}
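// As a concrete sketch of the RLE branch above: for a block encoding
// 100, 105, 110, rleFirst zig zag decodes to 100 and rleDelta to 5, so Read
// returns 100 + d.i*5 for d.i = 0, 1, 2.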
func (d *IntegerDecoder) decodeRLE() {
if len(d.bytes) == 0 {
return
}
if len(d.bytes) < 8 {
d.err = fmt.Errorf("IntegerDecoder: not enough data to decode RLE starting value")
return
}
var i, n int
// Next 8 bytes is the starting value
first := binary.BigEndian.Uint64(d.bytes[i : i+8])
i += 8
// Next 1-10 bytes is the delta value
value, n := binary.Uvarint(d.bytes[i:])
if n <= 0 {
d.err = fmt.Errorf("IntegerDecoder: invalid RLE delta value")
return
}
i += n
// Last 1-10 bytes is how many times the value repeats
count, n := binary.Uvarint(d.bytes[i:])
if n <= 0 {
d.err = fmt.Errorf("IntegerDecoder: invalid RLE repeat value")
return
}
// Store the first value and delta value so we do not need to allocate
// a large values slice. We can compute the value at position d.i on
// demand.
d.rleFirst = first
d.rleDelta = value
d.n = int(count) + 1
d.i = 0
// We've processed all the bytes.
d.bytes = nil
}
func (d *IntegerDecoder) decodePacked() {
if len(d.bytes) == 0 {
return
}
if len(d.bytes) < 8 {
d.err = fmt.Errorf("IntegerDecoder: not enough data to decode packed value")
return
}
v := binary.BigEndian.Uint64(d.bytes[0:8])
// The first value is always unencoded
if d.first {
d.first = false
d.n = 1
d.values[0] = v
} else {
n, err := simple8b.Decode(&d.values, v)
if err != nil {
// Should never happen; the only error that could be returned is if the value to be decoded was not
// actually encoded by simple8b encoder.
d.err = fmt.Errorf("failed to decode value %v: %v", v, err)
}
d.n = n
}
d.i = 0
d.bytes = d.bytes[8:]
}
func (d *IntegerDecoder) decodeUncompressed() {
if len(d.bytes) == 0 {
return
}
if len(d.bytes) < 8 {
d.err = fmt.Errorf("IntegerDecoder: not enough data to decode uncompressed value")
return
}
d.values[0] = binary.BigEndian.Uint64(d.bytes[0:8])
d.i = 0
d.n = 1
d.bytes = d.bytes[8:]
}
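// A typical decode loop over an encoded block looks like this (sketch):
//
//	var dec IntegerDecoder
//	dec.SetBytes(b)
//	for dec.Next() {
//		v := dec.Read()
//		_ = v
//	}
//	if err := dec.Error(); err != nil {
//		// the block was corrupt or used an unknown encoding
//	}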

View File

@@ -0,0 +1,646 @@
package tsm1
import (
"math"
"math/rand"
"reflect"
"testing"
"testing/quick"
)
func Test_IntegerEncoder_NoValues(t *testing.T) {
enc := NewIntegerEncoder(0)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(b) > 0 {
t.Fatalf("unexpected length: exp 0, got %v", len(b))
}
var dec IntegerDecoder
dec.SetBytes(b)
if dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
}
func Test_IntegerEncoder_One(t *testing.T) {
enc := NewIntegerEncoder(1)
v1 := int64(1)
enc.Write(v1)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; intCompressedSimple != got {
t.Fatalf("encoding type mismatch: exp compressed simple, got %v", got)
}
var dec IntegerDecoder
dec.SetBytes(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v1 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v1)
}
}
func Test_IntegerEncoder_Two(t *testing.T) {
enc := NewIntegerEncoder(2)
var v1, v2 int64 = 1, 2
enc.Write(v1)
enc.Write(v2)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; intCompressedSimple != got {
t.Fatalf("encoding type mismatch: exp compressed simple, got %v", got)
}
var dec IntegerDecoder
dec.SetBytes(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v1 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v1)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v2 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v2)
}
}
func Test_IntegerEncoder_Negative(t *testing.T) {
enc := NewIntegerEncoder(3)
var v1, v2, v3 int64 = -2, 0, 1
enc.Write(v1)
enc.Write(v2)
enc.Write(v3)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; intCompressedSimple != got {
t.Fatalf("encoding type mismatch: exp compressed simple, got %v", got)
}
var dec IntegerDecoder
dec.SetBytes(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v1 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v1)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v2 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v2)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v3 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v3)
}
}
func Test_IntegerEncoder_Large_Range(t *testing.T) {
enc := NewIntegerEncoder(2)
var v1, v2 int64 = math.MinInt64, math.MaxInt64
enc.Write(v1)
enc.Write(v2)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; intUncompressed != got {
t.Fatalf("encoding type mismatch: exp uncompressed, got %v", got)
}
var dec IntegerDecoder
dec.SetBytes(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v1 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v1)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v2 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v2)
}
}
func Test_IntegerEncoder_Uncompressed(t *testing.T) {
enc := NewIntegerEncoder(3)
var v1, v2, v3 int64 = 0, 1, 1 << 60
enc.Write(v1)
enc.Write(v2)
enc.Write(v3)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
// 1 byte header + 3 * 8 byte values
if exp := 25; len(b) != exp {
t.Fatalf("length mismatch: got %v, exp %v", len(b), exp)
}
if got := b[0] >> 4; intUncompressed != got {
t.Fatalf("encoding type mismatch: exp uncompressed, got %v", got)
}
var dec IntegerDecoder
dec.SetBytes(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v1 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v1)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v2 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v2)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v3 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), v3)
}
}
func Test_IntegerEncoder_NegativeUncompressed(t *testing.T) {
values := []int64{
-2352281900722994752, 1438442655375607923, -4110452567888190110,
-1221292455668011702, -1941700286034261841, -2836753127140407751,
1432686216250034552, 3663244026151507025, -3068113732684750258,
-1949953187327444488, 3713374280993588804, 3226153669854871355,
-2093273755080502606, 1006087192578600616, -2272122301622271655,
2533238229511593671, -4450454445568858273, 2647789901083530435,
2761419461769776844, -1324397441074946198, -680758138988210958,
94468846694902125, -2394093124890745254, -2682139311758778198,
}
enc := NewIntegerEncoder(256)
for _, v := range values {
enc.Write(v)
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; intUncompressed != got {
t.Fatalf("encoding type mismatch: exp uncompressed, got %v", got)
}
var dec IntegerDecoder
dec.SetBytes(b)
i := 0
for dec.Next() {
if i >= len(values) {
t.Fatalf("read too many values: got %v, exp %v", i, len(values))
}
if values[i] != dec.Read() {
t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), values[i])
}
i++
}
if i != len(values) {
t.Fatalf("failed to read enough values: got %v, exp %v", i, len(values))
}
}
func Test_IntegerEncoder_AllNegative(t *testing.T) {
enc := NewIntegerEncoder(3)
values := []int64{
-10, -5, -1,
}
for _, v := range values {
enc.Write(v)
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; intCompressedSimple != got {
t.Fatalf("encoding type mismatch: exp compressed simple, got %v", got)
}
var dec IntegerDecoder
dec.SetBytes(b)
i := 0
for dec.Next() {
if i >= len(values) {
t.Fatalf("read too many values: got %v, exp %v", i, len(values))
}
if values[i] != dec.Read() {
t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), values[i])
}
i++
}
if i != len(values) {
t.Fatalf("failed to read enough values: got %v, exp %v", i, len(values))
}
}
func Test_IntegerEncoder_CounterPacked(t *testing.T) {
enc := NewIntegerEncoder(16)
values := []int64{
1e15, 1e15 + 1, 1e15 + 2, 1e15 + 3, 1e15 + 4, 1e15 + 6,
}
for _, v := range values {
enc.Write(v)
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if b[0]>>4 != intCompressedSimple {
t.Fatalf("unexpected encoding format: expected simple, got %v", b[0]>>4)
}
// Should use 1 header byte + two 8-byte words if delta-encoding is used based
// on the value sizes. Without delta-encoding, we'd get 49 bytes.
if exp := 17; len(b) != exp {
t.Fatalf("encoded length mismatch: got %v, exp %v", len(b), exp)
}
var dec IntegerDecoder
dec.SetBytes(b)
i := 0
for dec.Next() {
if i >= len(values) {
t.Fatalf("read too many values: got %v, exp %v", i, len(values))
}
if values[i] != dec.Read() {
t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), values[i])
}
i++
}
if i != len(values) {
t.Fatalf("failed to read enough values: got %v, exp %v", i, len(values))
}
}
func Test_IntegerEncoder_CounterRLE(t *testing.T) {
enc := NewIntegerEncoder(16)
values := []int64{
1e15, 1e15 + 1, 1e15 + 2, 1e15 + 3, 1e15 + 4, 1e15 + 5,
}
for _, v := range values {
enc.Write(v)
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if b[0]>>4 != intCompressedRLE {
t.Fatalf("unexpected encoding format: expected RLE, got %v", b[0]>>4)
}
// Should use 1 header byte, 8 byte first value, 1 var-byte for delta and 1 var-byte for
// count of deltas in this particular RLE.
if exp := 11; len(b) != exp {
t.Fatalf("encoded length mismatch: got %v, exp %v", len(b), exp)
}
var dec IntegerDecoder
dec.SetBytes(b)
i := 0
for dec.Next() {
if i >= len(values) {
t.Fatalf("read too many values: got %v, exp %v", i, len(values))
}
if values[i] != dec.Read() {
t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), values[i])
}
i++
}
if i != len(values) {
t.Fatalf("failed to read enough values: got %v, exp %v", i, len(values))
}
}
func Test_IntegerEncoder_Descending(t *testing.T) {
enc := NewIntegerEncoder(16)
values := []int64{
7094, 4472, 1850,
}
for _, v := range values {
enc.Write(v)
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if b[0]>>4 != intCompressedRLE {
t.Fatalf("unexpected encoding format: expected RLE, got %v", b[0]>>4)
}
// Should use 1 header byte, 8 byte first value, 1 var-byte for delta and 1 var-byte for
// count of deltas in this particular RLE.
if exp := 12; len(b) != exp {
t.Fatalf("encoded length mismatch: got %v, exp %v", len(b), exp)
}
var dec IntegerDecoder
dec.SetBytes(b)
i := 0
for dec.Next() {
if i >= len(values) {
t.Fatalf("read too many values: got %v, exp %v", i, len(values))
}
if values[i] != dec.Read() {
t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), values[i])
}
i++
}
if i != len(values) {
t.Fatalf("failed to read enough values: got %v, exp %v", i, len(values))
}
}
func Test_IntegerEncoder_Flat(t *testing.T) {
enc := NewIntegerEncoder(16)
values := []int64{
1, 1, 1, 1,
}
for _, v := range values {
enc.Write(v)
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if b[0]>>4 != intCompressedRLE {
t.Fatalf("unexpected encoding format: expected RLE, got %v", b[0]>>4)
}
// Should use 1 header byte, 8 byte first value, 1 var-byte for delta and 1 var-byte for
// count of deltas in this particular RLE.
if exp := 11; len(b) != exp {
t.Fatalf("encoded length mismatch: got %v, exp %v", len(b), exp)
}
var dec IntegerDecoder
dec.SetBytes(b)
i := 0
for dec.Next() {
if i >= len(values) {
t.Fatalf("read too many values: got %v, exp %v", i, len(values))
}
if values[i] != dec.Read() {
t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), values[i])
}
i++
}
if i != len(values) {
t.Fatalf("failed to read enough values: got %v, exp %v", i, len(values))
}
}
func Test_IntegerEncoder_MinMax(t *testing.T) {
enc := NewIntegerEncoder(2)
values := []int64{
math.MinInt64, math.MaxInt64,
}
for _, v := range values {
enc.Write(v)
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if b[0]>>4 != intUncompressed {
t.Fatalf("unexpected encoding format: expected uncompressed, got %v", b[0]>>4)
}
if exp := 17; len(b) != exp {
t.Fatalf("encoded length mismatch: got %v, exp %v", len(b), exp)
}
var dec IntegerDecoder
dec.SetBytes(b)
i := 0
for dec.Next() {
if i >= len(values) {
t.Fatalf("read too many values: got %v, exp %v", i, len(values))
}
if values[i] != dec.Read() {
t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), values[i])
}
i++
}
if i != len(values) {
t.Fatalf("failed to read enough values: got %v, exp %v", i, len(values))
}
}
func Test_IntegerEncoder_Quick(t *testing.T) {
quick.Check(func(values []int64) bool {
expected := values
if values == nil {
expected = []int64{} // is this really expected?
}
// Write values to encoder.
enc := NewIntegerEncoder(1024)
for _, v := range values {
enc.Write(v)
}
// Retrieve encoded bytes from encoder.
buf, err := enc.Bytes()
if err != nil {
t.Fatal(err)
}
// Read values out of decoder.
got := make([]int64, 0, len(values))
var dec IntegerDecoder
dec.SetBytes(buf)
for dec.Next() {
if err := dec.Error(); err != nil {
t.Fatal(err)
}
got = append(got, dec.Read())
}
// Verify that input and output values match.
if !reflect.DeepEqual(expected, got) {
t.Fatalf("mismatch:\n\nexp=%#v\n\ngot=%#v\n\n", expected, got)
}
return true
}, nil)
}
func Test_IntegerDecoder_Corrupt(t *testing.T) {
cases := []string{
"", // Empty
"\x00abc", // Uncompressed: less than 8 bytes
"\x10abc", // Packed: less than 8 bytes
"\x20abc", // RLE: less than 8 bytes
"\x2012345678\x90", // RLE: valid starting value but invalid delta value
"\x2012345678\x01\x90", // RLE: valid starting, valid delta value, invalid repeat value
}
for _, c := range cases {
var dec IntegerDecoder
dec.SetBytes([]byte(c))
if dec.Next() {
t.Fatalf("exp next == false, got true")
}
}
}
func BenchmarkIntegerEncoderRLE(b *testing.B) {
enc := NewIntegerEncoder(1024)
x := make([]int64, 1024)
for i := 0; i < len(x); i++ {
x[i] = int64(i)
enc.Write(x[i])
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
enc.Bytes()
}
}
func BenchmarkIntegerEncoderPackedSimple(b *testing.B) {
enc := NewIntegerEncoder(1024)
x := make([]int64, 1024)
for i := 0; i < len(x); i++ {
// Small amount of randomness prevents RLE from being used
x[i] = int64(i) + int64(rand.Intn(10))
enc.Write(x[i])
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
enc.Bytes()
enc.Reset()
for i := 0; i < len(x); i++ {
enc.Write(x[i])
}
}
}
func BenchmarkIntegerDecoderPackedSimple(b *testing.B) {
x := make([]int64, 1024)
enc := NewIntegerEncoder(1024)
for i := 0; i < len(x); i++ {
// Small amount of randomness prevents RLE from being used
x[i] = int64(i) + int64(rand.Intn(10))
enc.Write(x[i])
}
bytes, _ := enc.Bytes()
b.ResetTimer()
var dec IntegerDecoder
for i := 0; i < b.N; i++ {
dec.SetBytes(bytes)
for dec.Next() {
}
}
}
func BenchmarkIntegerDecoderRLE(b *testing.B) {
x := make([]int64, 1024)
enc := NewIntegerEncoder(1024)
for i := 0; i < len(x); i++ {
x[i] = int64(i)
enc.Write(x[i])
}
bytes, _ := enc.Bytes()
b.ResetTimer()
var dec IntegerDecoder
dec.SetBytes(bytes)
for i := 0; i < b.N; i++ {
dec.SetBytes(bytes)
for dec.Next() {
}
}
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,578 @@
package tsm1
import (
"fmt"
"runtime"
"sort"
"sync"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/tsdb"
"github.com/uber-go/zap"
)
type cursor interface {
close() error
next() (t int64, v interface{})
}
// cursorAt provides a buffered cursor interface.
// This is required for literal value cursors, which don't have a time value.
type cursorAt interface {
close() error
peek() (k int64, v interface{})
nextAt(seek int64) interface{}
}
type nilCursor struct{}
func (nilCursor) next() (int64, interface{}) { return tsdb.EOF, nil }
// bufCursor implements a buffered cursor.
type bufCursor struct {
cur cursor
buf struct {
key int64
value interface{}
filled bool
}
ascending bool
}
// newBufCursor returns a buffered wrapper for cur.
func newBufCursor(cur cursor, ascending bool) *bufCursor {
return &bufCursor{cur: cur, ascending: ascending}
}
func (c *bufCursor) close() error {
err := c.cur.close()
c.cur = nil
return err
}
// next returns the buffer, if filled. Otherwise returns the next key/value from the cursor.
func (c *bufCursor) next() (int64, interface{}) {
if c.buf.filled {
k, v := c.buf.key, c.buf.value
c.buf.filled = false
return k, v
}
return c.cur.next()
}
// unread pushes k and v onto the buffer.
func (c *bufCursor) unread(k int64, v interface{}) {
c.buf.key, c.buf.value = k, v
c.buf.filled = true
}
// peek reads the next key/value without removing them from the cursor.
func (c *bufCursor) peek() (k int64, v interface{}) {
k, v = c.next()
c.unread(k, v)
return
}
// nextAt returns the next value where key is equal to seek.
// Skips over any keys that are less than seek.
// If the key doesn't exist then a nil value is returned instead.
func (c *bufCursor) nextAt(seek int64) interface{} {
for {
k, v := c.next()
if k != tsdb.EOF {
if k == seek {
return v
} else if c.ascending && k < seek {
continue
} else if !c.ascending && k > seek {
continue
}
c.unread(k, v)
}
// Return "nil" value for type.
switch c.cur.(type) {
case floatCursor:
return (*float64)(nil)
case integerCursor:
return (*int64)(nil)
case stringCursor:
return (*string)(nil)
case booleanCursor:
return (*bool)(nil)
default:
panic("unreachable")
}
}
}
// statsBufferCopyIntervalN is the number of points that are read before
// copying the stats buffer to the iterator's stats field. This is used to
// amortize the cost of using a mutex when updating stats.
const statsBufferCopyIntervalN = 100
{{range .}}
type {{.name}}FinalizerIterator struct {
influxql.{{.Name}}Iterator
logger zap.Logger
}
func new{{.Name}}FinalizerIterator(inner influxql.{{.Name}}Iterator, logger zap.Logger) *{{.name}}FinalizerIterator {
itr := &{{.name}}FinalizerIterator{ {{.Name}}Iterator: inner, logger: logger}
runtime.SetFinalizer(itr, (*{{.name}}FinalizerIterator).closeGC)
return itr
}
func (itr *{{.name}}FinalizerIterator) closeGC() {
runtime.SetFinalizer(itr, nil)
itr.logger.Error("{{.Name}}Iterator finalized by GC")
itr.Close()
}
func (itr *{{.name}}FinalizerIterator) Close() error {
runtime.SetFinalizer(itr, nil)
return itr.{{.Name}}Iterator.Close()
}
type {{.name}}Iterator struct {
cur {{.name}}Cursor
aux []cursorAt
conds struct {
names []string
curs []cursorAt
}
opt influxql.IteratorOptions
m map[string]interface{} // map used for condition evaluation
point influxql.{{.Name}}Point // reusable buffer
statsLock sync.Mutex
stats influxql.IteratorStats
statsBuf influxql.IteratorStats
}
func new{{.Name}}Iterator(name string, tags influxql.Tags, opt influxql.IteratorOptions, cur {{.name}}Cursor, aux []cursorAt, conds []cursorAt, condNames []string) *{{.name}}Iterator {
itr := &{{.name}}Iterator{
cur: cur,
aux: aux,
opt: opt,
point: influxql.{{.Name}}Point{
Name: name,
Tags: tags,
},
statsBuf: influxql.IteratorStats{
SeriesN: 1,
},
}
itr.stats = itr.statsBuf
if len(aux) > 0 {
itr.point.Aux = make([]interface{}, len(aux))
}
if opt.Condition != nil {
itr.m = make(map[string]interface{}, len(aux)+len(conds))
}
itr.conds.names = condNames
itr.conds.curs = conds
return itr
}
// Next returns the next point from the iterator.
func (itr *{{.name}}Iterator) Next() (*influxql.{{.Name}}Point, error) {
for {
seek := tsdb.EOF
if itr.cur != nil {
// Read from the main cursor if we have one.
itr.point.Time, itr.point.Value = itr.cur.next{{.Name}}()
seek = itr.point.Time
} else {
// Otherwise find lowest aux timestamp.
for i := range itr.aux {
if k, _ := itr.aux[i].peek(); k != tsdb.EOF {
if seek == tsdb.EOF || (itr.opt.Ascending && k < seek) || (!itr.opt.Ascending && k > seek) {
seek = k
}
}
}
itr.point.Time = seek
}
// Exit if we have no more points or we are outside our time range.
if itr.point.Time == tsdb.EOF {
itr.copyStats()
return nil, nil
} else if itr.opt.Ascending && itr.point.Time > itr.opt.EndTime {
itr.copyStats()
return nil, nil
} else if !itr.opt.Ascending && itr.point.Time < itr.opt.StartTime {
itr.copyStats()
return nil, nil
}
// Read from each auxiliary cursor.
for i := range itr.opt.Aux {
itr.point.Aux[i] = itr.aux[i].nextAt(seek)
}
// Read from condition field cursors.
for i := range itr.conds.curs {
itr.m[itr.conds.names[i]] = itr.conds.curs[i].nextAt(seek)
}
// Evaluate condition, if one exists. Retry if it fails.
if itr.opt.Condition != nil && !influxql.EvalBool(itr.opt.Condition, itr.m) {
continue
}
// Track points returned.
itr.statsBuf.PointN++
// Copy buffer to stats periodically.
if itr.statsBuf.PointN % statsBufferCopyIntervalN == 0 {
itr.copyStats()
}
return &itr.point, nil
}
}
// copyStats copies from the itr stats buffer to the stats under lock.
func (itr *{{.name}}Iterator) copyStats() {
itr.statsLock.Lock()
itr.stats = itr.statsBuf
itr.statsLock.Unlock()
}
// Stats returns stats on the points processed.
func (itr *{{.name}}Iterator) Stats() influxql.IteratorStats {
itr.statsLock.Lock()
stats := itr.stats
itr.statsLock.Unlock()
return stats
}
// Close closes the iterator.
func (itr *{{.name}}Iterator) Close() error {
cursorsAt(itr.aux).close()
itr.aux = nil
cursorsAt(itr.conds.curs).close()
itr.conds.curs = nil
if itr.cur != nil {
err := itr.cur.close()
itr.cur = nil
return err
}
return nil
}
// {{.name}}LimitIterator
type {{.name}}LimitIterator struct {
input influxql.{{.Name}}Iterator
opt influxql.IteratorOptions
n int
}
func new{{.Name}}LimitIterator(input influxql.{{.Name}}Iterator, opt influxql.IteratorOptions) *{{.name}}LimitIterator {
return &{{.name}}LimitIterator{
input: input,
opt: opt,
}
}
func (itr *{{.name}}LimitIterator) Stats() influxql.IteratorStats { return itr.input.Stats() }
func (itr *{{.name}}LimitIterator) Close() error { return itr.input.Close() }
func (itr *{{.name}}LimitIterator) Next() (*influxql.{{.Name}}Point, error) {
// Check if we are beyond the limit.
if (itr.n-itr.opt.Offset) > itr.opt.Limit {
return nil, nil
}
// Read the next point.
p, err := itr.input.Next()
if p == nil || err != nil {
return nil, err
}
// Increment counter.
itr.n++
// Offsets are handled by a higher level iterator so return all points.
return p, nil
}
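// For example (sketch): with Offset = 10 and Limit = 5, Next keeps returning
// points until itr.n-itr.opt.Offset exceeds 5, i.e. it stops after 16 points
// have been read; the first 10 of those are discarded by the higher-level
// offset iterator.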
// {{.name}}Cursor represents an object for iterating over a single {{.name}} field.
type {{.name}}Cursor interface {
cursor
next{{.Name}}() (t int64, v {{.Type}})
}
func new{{.Name}}Cursor(seek int64, ascending bool, cacheValues Values, tsmKeyCursor *KeyCursor) {{.name}}Cursor {
if ascending {
return new{{.Name}}AscendingCursor(seek, cacheValues, tsmKeyCursor)
}
return new{{.Name}}DescendingCursor(seek, cacheValues, tsmKeyCursor)
}
type {{.name}}AscendingCursor struct {
cache struct {
values Values
pos int
}
tsm struct {
buf []{{.Name}}Value
values []{{.Name}}Value
pos int
keyCursor *KeyCursor
}
}
func new{{.Name}}AscendingCursor(seek int64, cacheValues Values, tsmKeyCursor *KeyCursor) *{{.name}}AscendingCursor {
c := &{{.name}}AscendingCursor{}
c.cache.values = cacheValues
c.cache.pos = sort.Search(len(c.cache.values), func(i int) bool {
return c.cache.values[i].UnixNano() >= seek
})
c.tsm.keyCursor = tsmKeyCursor
c.tsm.buf = make([]{{.Name}}Value, 10)
c.tsm.values, _ = c.tsm.keyCursor.Read{{.Name}}Block(&c.tsm.buf)
c.tsm.pos = sort.Search(len(c.tsm.values), func(i int) bool {
return c.tsm.values[i].UnixNano() >= seek
})
return c
}
// peekCache returns the current time/value from the cache.
func (c *{{.name}}AscendingCursor) peekCache() (t int64, v {{.Type}}) {
if c.cache.pos >= len(c.cache.values) {
return tsdb.EOF, {{.Nil}}
}
item := c.cache.values[c.cache.pos]
return item.UnixNano(), item.({{.ValueType}}).value
}
// peekTSM returns the current time/value from tsm.
func (c *{{.name}}AscendingCursor) peekTSM() (t int64, v {{.Type}}) {
if c.tsm.pos < 0 || c.tsm.pos >= len(c.tsm.values) {
return tsdb.EOF, {{.Nil}}
}
item := c.tsm.values[c.tsm.pos]
return item.UnixNano(), item.value
}
// close closes the cursor and any dependent cursors.
func (c *{{.name}}AscendingCursor) close() error {
c.tsm.keyCursor.Close()
c.tsm.keyCursor = nil
c.tsm.buf = nil
c.cache.values = nil
c.tsm.values = nil
return nil
}
// next returns the next key/value for the cursor.
func (c *{{.name}}AscendingCursor) next() (int64, interface{}) { return c.next{{.Name}}() }
// next{{.Name}} returns the next key/value for the cursor.
func (c *{{.name}}AscendingCursor) next{{.Name}}() (int64, {{.Type}}) {
ckey, cvalue := c.peekCache()
tkey, tvalue := c.peekTSM()
// No more data in cache or in TSM files.
if ckey == tsdb.EOF && tkey == tsdb.EOF {
return tsdb.EOF, {{.Nil}}
}
// Both cache and tsm files have the same key, cache takes precedence.
if ckey == tkey {
c.nextCache()
c.nextTSM()
return ckey, cvalue
}
// Buffered cache key precedes that in TSM file.
if ckey != tsdb.EOF && (ckey < tkey || tkey == tsdb.EOF) {
c.nextCache()
return ckey, cvalue
}
// Buffered TSM key precedes that in cache.
c.nextTSM()
return tkey, tvalue
}
// nextCache returns the next value from the cache.
func (c *{{.name}}AscendingCursor) nextCache() {
if c.cache.pos >= len(c.cache.values) {
return
}
c.cache.pos++
}
// nextTSM returns the next value from the TSM files.
func (c *{{.name}}AscendingCursor) nextTSM() {
c.tsm.pos++
if c.tsm.pos >= len(c.tsm.values) {
c.tsm.keyCursor.Next()
c.tsm.values, _ = c.tsm.keyCursor.Read{{.Name}}Block(&c.tsm.buf)
if len(c.tsm.values) == 0 {
return
}
c.tsm.pos = 0
}
}
type {{.name}}DescendingCursor struct {
cache struct {
values Values
pos int
}
tsm struct {
buf []{{.Name}}Value
values []{{.Name}}Value
pos int
keyCursor *KeyCursor
}
}
func new{{.Name}}DescendingCursor(seek int64, cacheValues Values, tsmKeyCursor *KeyCursor) *{{.name}}DescendingCursor {
c := &{{.name}}DescendingCursor{}
c.cache.values = cacheValues
c.cache.pos = sort.Search(len(c.cache.values), func(i int) bool {
return c.cache.values[i].UnixNano() >= seek
})
if t, _ := c.peekCache(); t != seek {
c.cache.pos--
}
c.tsm.keyCursor = tsmKeyCursor
c.tsm.buf = make([]{{.Name}}Value, 10)
c.tsm.values, _ = c.tsm.keyCursor.Read{{.Name}}Block(&c.tsm.buf)
c.tsm.pos = sort.Search(len(c.tsm.values), func(i int) bool {
return c.tsm.values[i].UnixNano() >= seek
})
if t, _ := c.peekTSM(); t != seek {
c.tsm.pos--
}
return c
}
// peekCache returns the current time/value from the cache.
func (c *{{.name}}DescendingCursor) peekCache() (t int64, v {{.Type}}) {
if c.cache.pos < 0 || c.cache.pos >= len(c.cache.values) {
return tsdb.EOF, {{.Nil}}
}
item := c.cache.values[c.cache.pos]
return item.UnixNano(), item.({{.ValueType}}).value
}
// peekTSM returns the current time/value from tsm.
func (c *{{.name}}DescendingCursor) peekTSM() (t int64, v {{.Type}}) {
if c.tsm.pos < 0 || c.tsm.pos >= len(c.tsm.values) {
return tsdb.EOF, {{.Nil}}
}
item := c.tsm.values[c.tsm.pos]
return item.UnixNano(), item.value
}
// close closes the cursor and any dependent cursors.
func (c *{{.name}}DescendingCursor) close() error {
c.tsm.keyCursor.Close()
c.tsm.keyCursor = nil
c.tsm.buf = nil
c.cache.values = nil
c.tsm.values = nil
return nil
}
// next returns the next key/value for the cursor.
func (c *{{.name}}DescendingCursor) next() (int64, interface{}) { return c.next{{.Name}}() }
// next{{.Name}} returns the next key/value for the cursor.
func (c *{{.name}}DescendingCursor) next{{.Name}}() (int64, {{.Type}}) {
ckey, cvalue := c.peekCache()
tkey, tvalue := c.peekTSM()
// No more data in cache or in TSM files.
if ckey == tsdb.EOF && tkey == tsdb.EOF {
return tsdb.EOF, {{.Nil}}
}
// Both cache and tsm files have the same key, cache takes precedence.
if ckey == tkey {
c.nextCache()
c.nextTSM()
return ckey, cvalue
}
// Buffered cache key precedes that in TSM file.
if ckey != tsdb.EOF && (ckey > tkey || tkey == tsdb.EOF) {
c.nextCache()
return ckey, cvalue
}
// Buffered TSM key precedes that in cache.
c.nextTSM()
return tkey, tvalue
}
// nextCache returns the next value from the cache.
func (c *{{.name}}DescendingCursor) nextCache() {
if c.cache.pos < 0 {
return
}
c.cache.pos--
}
// nextTSM returns the next value from the TSM files.
func (c *{{.name}}DescendingCursor) nextTSM() {
c.tsm.pos--
if c.tsm.pos < 0 {
c.tsm.keyCursor.Next()
c.tsm.values, _ = c.tsm.keyCursor.Read{{.Name}}Block(&c.tsm.buf)
if len(c.tsm.values) == 0 {
return
}
c.tsm.pos = len(c.tsm.values) - 1
}
}
// {{.name}}LiteralCursor represents a cursor that always returns a single value.
// It doesn't have a time value, so it can only be used with nextAt().
type {{.name}}LiteralCursor struct {
value {{.Type}}
}
func (c *{{.name}}LiteralCursor) close() error { return nil }
func (c *{{.name}}LiteralCursor) peek() (t int64, v interface{}) { return tsdb.EOF, c.value }
func (c *{{.name}}LiteralCursor) next() (t int64, v interface{}) { return tsdb.EOF, c.value }
func (c *{{.name}}LiteralCursor) nextAt(seek int64) interface{} { return c.value }
// {{.name}}NilLiteralCursor represents a cursor that always returns a typed nil value.
// It doesn't have a time value, so it can only be used with nextAt().
type {{.name}}NilLiteralCursor struct{}
func (c *{{.name}}NilLiteralCursor) close() error { return nil }
func (c *{{.name}}NilLiteralCursor) peek() (t int64, v interface{}) { return tsdb.EOF, (*{{.Type}})(nil) }
func (c *{{.name}}NilLiteralCursor) next() (t int64, v interface{}) { return tsdb.EOF, (*{{.Type}})(nil) }
func (c *{{.name}}NilLiteralCursor) nextAt(seek int64) interface{} { return (*{{.Type}})(nil) }
{{end}}
var _ = fmt.Print

View File

@@ -0,0 +1,30 @@
[
{
"Name":"Float",
"name":"float",
"Type":"float64",
"ValueType":"FloatValue",
"Nil":"0"
},
{
"Name":"Integer",
"name":"integer",
"Type":"int64",
"ValueType":"IntegerValue",
"Nil":"0"
},
{
"Name":"String",
"name":"string",
"Type":"string",
"ValueType":"StringValue",
"Nil":"\"\""
},
{
"Name":"Boolean",
"name":"boolean",
"Type":"bool",
"ValueType":"BooleanValue",
"Nil":"false"
}
]

View File

@@ -0,0 +1,92 @@
package tsm1
import (
"fmt"
"github.com/influxdata/influxdb/influxql"
"github.com/uber-go/zap"
)
func newLimitIterator(input influxql.Iterator, opt influxql.IteratorOptions) influxql.Iterator {
switch input := input.(type) {
case influxql.FloatIterator:
return newFloatLimitIterator(input, opt)
case influxql.IntegerIterator:
return newIntegerLimitIterator(input, opt)
case influxql.StringIterator:
return newStringLimitIterator(input, opt)
case influxql.BooleanIterator:
return newBooleanLimitIterator(input, opt)
default:
panic(fmt.Sprintf("unsupported limit iterator type: %T", input))
}
}
type floatCastIntegerCursor struct {
cursor integerCursor
}
func (c *floatCastIntegerCursor) close() error { return c.cursor.close() }
func (c *floatCastIntegerCursor) next() (t int64, v interface{}) { return c.nextFloat() }
func (c *floatCastIntegerCursor) nextFloat() (int64, float64) {
t, v := c.cursor.nextInteger()
return t, float64(v)
}
type integerCastFloatCursor struct {
cursor floatCursor
}
func (c *integerCastFloatCursor) close() error { return c.cursor.close() }
func (c *integerCastFloatCursor) next() (t int64, v interface{}) { return c.nextInteger() }
func (c *integerCastFloatCursor) nextInteger() (int64, int64) {
t, v := c.cursor.nextFloat()
return t, int64(v)
}
type cursorsAt []cursorAt
func (c cursorsAt) close() {
for _, cur := range c {
cur.close()
}
}
// newMergeFinalizerIterator creates a new Merge iterator from the inputs. If the call to Merge succeeds,
// the resulting Iterator will be wrapped in a finalizer iterator.
// If Merge returns an error, the inputs will be closed.
func newMergeFinalizerIterator(inputs []influxql.Iterator, opt influxql.IteratorOptions, log zap.Logger) (influxql.Iterator, error) {
itr, err := influxql.Iterators(inputs).Merge(opt)
if err != nil {
influxql.Iterators(inputs).Close()
return nil, err
}
return newFinalizerIterator(itr, log), nil
}
// newFinalizerIterator creates a new iterator that installs a runtime finalizer
// to ensure close is eventually called if the iterator is garbage collected.
// This additional guard attempts to protect against clients of CreateIterator not
// correctly closing them and leaking cursors.
func newFinalizerIterator(itr influxql.Iterator, log zap.Logger) influxql.Iterator {
if itr == nil {
return nil
}
switch inner := itr.(type) {
case influxql.FloatIterator:
return newFloatFinalizerIterator(inner, log)
case influxql.IntegerIterator:
return newIntegerFinalizerIterator(inner, log)
case influxql.StringIterator:
return newStringFinalizerIterator(inner, log)
case influxql.BooleanIterator:
return newBooleanFinalizerIterator(inner, log)
default:
panic(fmt.Sprintf("unsupported finalizer iterator type: %T", itr))
}
}

View File

@@ -0,0 +1,32 @@
// +build solaris
package tsm1
import (
"os"
"syscall"
"golang.org/x/sys/unix"
)
func mmap(f *os.File, offset int64, length int) ([]byte, error) {
mmap, err := unix.Mmap(int(f.Fd()), 0, length, syscall.PROT_READ, syscall.MAP_SHARED)
if err != nil {
return nil, err
}
if err := unix.Madvise(mmap, syscall.MADV_RANDOM); err != nil {
return nil, err
}
return mmap, nil
}
func munmap(b []byte) (err error) {
return unix.Munmap(b)
}
// From: github.com/boltdb/bolt/bolt_unix.go
func madvise(b []byte, advice int) (err error) {
return unix.Madvise(b, advice)
}

View File

@@ -0,0 +1,21 @@
// +build !windows,!plan9,!solaris
package tsm1
import (
"os"
"syscall"
)
func mmap(f *os.File, offset int64, length int) ([]byte, error) {
mmap, err := syscall.Mmap(int(f.Fd()), 0, length, syscall.PROT_READ, syscall.MAP_SHARED)
if err != nil {
return nil, err
}
return mmap, nil
}
func munmap(b []byte) (err error) {
return syscall.Munmap(b)
}

View File

@@ -0,0 +1,117 @@
package tsm1
import (
"errors"
"os"
"reflect"
"sync"
"syscall"
"unsafe"
)
// mmap implementation for Windows
// Based on: https://github.com/edsrzf/mmap-go
// Based on: https://github.com/boltdb/bolt/bolt_windows.go
// Ref: https://groups.google.com/forum/#!topic/golang-nuts/g0nLwQI9www
// We keep this map so that we can get back the original handle from the memory address.
var handleLock sync.Mutex
var handleMap = map[uintptr]syscall.Handle{}
var fileMap = map[uintptr]*os.File{}
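// openSharedFile opens a second read-only handle to f. The share mode below
// includes FILE_SHARE_DELETE, which (presumably the intent here) lets the
// underlying file be renamed or deleted while the mapping still holds a
// handle to it.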
func openSharedFile(f *os.File) (file *os.File, err error) {
var access, createmode, sharemode uint32
var sa *syscall.SecurityAttributes
access = syscall.GENERIC_READ
sharemode = uint32(syscall.FILE_SHARE_READ | syscall.FILE_SHARE_WRITE | syscall.FILE_SHARE_DELETE)
createmode = syscall.OPEN_EXISTING
fileName := f.Name()
pathp, err := syscall.UTF16PtrFromString(fileName)
if err != nil {
return nil, err
}
h, e := syscall.CreateFile(pathp, access, sharemode, sa, createmode, syscall.FILE_ATTRIBUTE_NORMAL, 0)
if e != nil {
return nil, e
}
// NewFile does not add a finalizer; we need to close this manually.
return os.NewFile(uintptr(h), fileName), nil
}
func mmap(f *os.File, offset int64, length int) (out []byte, err error) {
// Open a file mapping handle.
sizehi := uint32(length >> 32)
sizelo := uint32(length) & 0xffffffff
sharedHandle, errno := openSharedFile(f)
if errno != nil {
return nil, os.NewSyscallError("CreateFile", errno)
}
h, errno := syscall.CreateFileMapping(syscall.Handle(sharedHandle.Fd()), nil, syscall.PAGE_READONLY, sizehi, sizelo, nil)
if h == 0 {
return nil, os.NewSyscallError("CreateFileMapping", errno)
}
// Create the memory map.
addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(length))
if addr == 0 {
return nil, os.NewSyscallError("MapViewOfFile", errno)
}
handleLock.Lock()
handleMap[addr] = h
fileMap[addr] = sharedHandle
handleLock.Unlock()
// Convert to a byte array.
hdr := (*reflect.SliceHeader)(unsafe.Pointer(&out))
hdr.Data = uintptr(unsafe.Pointer(addr))
hdr.Len = length
hdr.Cap = length
return
}
// munmap Windows implementation
// Based on: https://github.com/edsrzf/mmap-go
// Based on: https://github.com/boltdb/bolt/bolt_windows.go
func munmap(b []byte) (err error) {
handleLock.Lock()
defer handleLock.Unlock()
addr := (uintptr)(unsafe.Pointer(&b[0]))
if err := syscall.UnmapViewOfFile(addr); err != nil {
return os.NewSyscallError("UnmapViewOfFile", err)
}
handle, ok := handleMap[addr]
if !ok {
// should be impossible; we would've seen the error above
return errors.New("unknown base address")
}
delete(handleMap, addr)
e := syscall.CloseHandle(syscall.Handle(handle))
if e != nil {
return os.NewSyscallError("CloseHandle", e)
}
file, ok := fileMap[addr]
if !ok {
// should be impossible; we would've seen the error above
return errors.New("unknown base address")
}
delete(fileMap, addr)
e = file.Close()
if e != nil {
return errors.New("close file: " + e.Error())
}
return nil
}

View File

@@ -0,0 +1,26 @@
package tsm1
import "sync"
var bufPool sync.Pool
// getBuf returns a buffer with length size from the buffer pool.
func getBuf(size int) *[]byte {
x := bufPool.Get()
if x == nil {
b := make([]byte, size)
return &b
}
buf := x.(*[]byte)
if cap(*buf) < size {
b := make([]byte, size)
return &b
}
*buf = (*buf)[:size]
return buf
}
// putBuf returns a buffer to the pool.
func putBuf(buf *[]byte) {
bufPool.Put(buf)
}
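// Usage sketch (src is a hypothetical source slice):
//
//	buf := getBuf(4096)
//	defer putBuf(buf)
//	n := copy(*buf, src)
//	_ = n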

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,299 @@
package tsm1
import (
"fmt"
"sort"
"sync"
"sync/atomic"
"github.com/cespare/xxhash"
)
// partitions is the number of partitions we used in the ring's continuum. It
// basically defines the maximum number of partitions you can have in the ring.
// If a smaller number of partitions are chosen when creating a ring, then
// they're evenly spread across this many partitions in the ring.
const partitions = 4096
// ring is a structure that maps series keys to entries.
//
// ring is implemented as a crude hash ring, in so much that you can have
// variable numbers of members in the ring, and the appropriate member for a
// given series key can always consistently be found. Unlike a true hash ring
// though, this ring is not resizable: there can be at most `partitions`
// members in the ring, and the number of members must always be a power of 2.
//
// ring works as follows: Each member of the ring contains a single store,
// which contains a map of series keys to entries. A ring always has
// `partitions` continuum slots, and a member takes up one or more of these
// slots (depending on how many members are specified to be in the ring).
//
// To determine the partition that a series key should be added to, the series
// key is hashed and the hash is taken modulo `partitions` to index the ring's
// continuum.
//
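// For example (sketch): with n = 16 members, each member owns
// partitions/16 = 256 consecutive continuum slots, and a series key maps to
// continuum[xxhash.Sum64(key) % partitions].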
type ring struct {
// The unique set of partitions in the ring.
// len(partitions) <= len(continuum)
partitions []*partition
// A mapping of partition to location on the ring continuum. This is used
// to lookup a partition.
continuum []*partition
// Number of keys within the ring. This is used to provide a hint for
// allocating the return values in keys(). It will not be perfectly accurate
// since it doesn't consider adding duplicate keys, or trying to remove non-
// existent keys.
keysHint int64
}
// newring returns a new ring initialised with n partitions. n must always be a
// power of 2, and for performance reasons should be larger than the number of
// cores on the host. The supported set of values for n is:
//
// {1, 2, 4, 8, 16, 32, 64, 128, 256}.
//
func newring(n int) (*ring, error) {
if n <= 0 || n > partitions {
return nil, fmt.Errorf("invalid number of partitions: %d", n)
}
r := ring{
continuum: make([]*partition, partitions), // maximum number of partitions.
}
// The trick here is to map N partitions to all points on the continuum,
// such that the first eight bits of a given hash will map directly to one
// of the N partitions.
for i := 0; i < len(r.continuum); i++ {
if (i == 0 || i%(partitions/n) == 0) && len(r.partitions) < n {
r.partitions = append(r.partitions, &partition{
store: make(map[string]*entry),
entrySizeHints: make(map[uint64]int),
})
}
r.continuum[i] = r.partitions[len(r.partitions)-1]
}
return &r, nil
}
// reset resets the ring so it can be reused. Before removing references to entries
// within each partition it gathers sizing information to provide hints when
// reallocating entries in partition maps.
//
// reset is not safe for use by multiple goroutines.
func (r *ring) reset() {
for _, partition := range r.partitions {
partition.reset()
}
r.keysHint = 0
}
// getPartition retrieves the hash ring partition associated with the provided
// key.
func (r *ring) getPartition(key string) *partition {
return r.continuum[int(xxhash.Sum64([]byte(key))%partitions)]
}
// entry returns the entry for the given key.
// entry is safe for use by multiple goroutines.
func (r *ring) entry(key string) (*entry, bool) {
return r.getPartition(key).entry(key)
}
// write writes values to the entry in the ring's partition associated with key.
// If no entry exists for the key then one will be created.
// write is safe for use by multiple goroutines.
func (r *ring) write(key string, values Values) error {
return r.getPartition(key).write(key, values)
}
// add adds an entry to the ring.
func (r *ring) add(key string, entry *entry) {
r.getPartition(key).add(key, entry)
atomic.AddInt64(&r.keysHint, 1)
}
// remove deletes the entry for the given key.
// remove is safe for use by multiple goroutines.
func (r *ring) remove(key string) {
r.getPartition(key).remove(key)
if r.keysHint > 0 {
atomic.AddInt64(&r.keysHint, -1)
}
}
// keys returns all the keys from all partitions in the hash ring. The returned
// keys will be in order if sorted is true.
func (r *ring) keys(sorted bool) []string {
keys := make([]string, 0, atomic.LoadInt64(&r.keysHint))
for _, p := range r.partitions {
keys = append(keys, p.keys()...)
}
if sorted {
sort.Strings(keys)
}
return keys
}
// apply applies the provided function to every entry in the ring under a read
// lock using a separate goroutine for each partition. The provided function
// will be called with each key and the corresponding entry. The first error
// encountered will be returned, if any. apply is safe for use by multiple
// goroutines.
func (r *ring) apply(f func(string, *entry) error) error {
var (
wg sync.WaitGroup
res = make(chan error, len(r.partitions))
)
for _, p := range r.partitions {
wg.Add(1)
go func(p *partition) {
defer wg.Done()
p.mu.RLock()
for k, e := range p.store {
if err := f(k, e); err != nil {
res <- err
p.mu.RUnlock()
return
}
}
p.mu.RUnlock()
}(p)
}
go func() {
wg.Wait()
close(res)
}()
// Collect results.
for err := range res {
if err != nil {
return err
}
}
return nil
}
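// Usage sketch:
//
//	err := r.apply(func(key string, e *entry) error {
//		// inspect e under the partition's read lock
//		return nil
//	})
//	_ = err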
// applySerial is similar to apply, but invokes f on each partition in the same
// goroutine.
// apply is safe for use by multiple goroutines.
func (r *ring) applySerial(f func(string, *entry) error) error {
for _, p := range r.partitions {
p.mu.RLock()
for k, e := range p.store {
if err := f(k, e); err != nil {
p.mu.RUnlock()
return err
}
}
p.mu.RUnlock()
}
return nil
}
// partition provides safe access to a map of series keys to entries.
type partition struct {
mu sync.RWMutex
store map[string]*entry
// entrySizeHints stores hints for appropriate sizes to pre-allocate the
// []Values in an entry. entrySizeHints will only contain hints for entries
// that were present prior to the most recent snapshot, preventing unbounded
// growth over time.
entrySizeHints map[uint64]int
}
// entry returns the partition's entry for the provided key.
// It's safe for use by multiple goroutines.
func (p *partition) entry(key string) (*entry, bool) {
p.mu.RLock()
e, ok := p.store[key]
p.mu.RUnlock()
return e, ok
}
// write writes the values to the entry in the partition, creating the entry
// if it does not exist.
// write is safe for use by multiple goroutines.
func (p *partition) write(key string, values Values) error {
p.mu.RLock()
e, ok := p.store[key]
p.mu.RUnlock()
if ok {
// Hot path.
return e.add(values)
}
p.mu.Lock()
defer p.mu.Unlock()
// Check again.
if e, ok = p.store[key]; ok {
return e.add(values)
}
// Create a new entry using a preallocated size if we have a hint available.
hint := p.entrySizeHints[xxhash.Sum64([]byte(key))]
e, err := newEntryValues(values, hint)
if err != nil {
return err
}
p.store[key] = e
return nil
}
// add adds a new entry for key to the partition.
func (p *partition) add(key string, entry *entry) {
p.mu.Lock()
p.store[key] = entry
p.mu.Unlock()
}
// remove deletes the entry associated with the provided key.
// remove is safe for use by multiple goroutines.
func (p *partition) remove(key string) {
p.mu.Lock()
delete(p.store, key)
p.mu.Unlock()
}
// keys returns an unsorted slice of the keys in the partition.
func (p *partition) keys() []string {
p.mu.RLock()
keys := make([]string, 0, len(p.store))
for k := range p.store {
keys = append(keys, k)
}
p.mu.RUnlock()
return keys
}
// reset resets the partition by reinitialising the store. reset returns hints
// about sizes that the entries within the store could be reallocated with.
func (p *partition) reset() {
p.mu.Lock()
defer p.mu.Unlock()
// Collect the allocated sizes of values for each entry in the store.
p.entrySizeHints = make(map[uint64]int)
for k, entry := range p.store {
// If the capacity is large then there are many values in the entry.
// Store a hint to pre-allocate the next time we see the same entry.
entry.mu.RLock()
if cap(entry.values) > 128 { // 4 x the default entry capacity size.
p.entrySizeHints[xxhash.Sum64([]byte(k))] = cap(entry.values)
}
entry.mu.RUnlock()
}
// Reset the store.
p.store = make(map[string]*entry, len(p.store))
}

View File

@@ -0,0 +1,122 @@
package tsm1
import (
"fmt"
"runtime"
"sync"
"testing"
)
func TestRing_newRing(t *testing.T) {
examples := []struct {
n int
returnErr bool
}{
{n: 1}, {n: 2}, {n: 4}, {n: 8}, {n: 16}, {n: 32}, {n: 64}, {n: 128}, {n: 256},
{n: 0, returnErr: true}, {n: 3, returnErr: true}, {n: 512, returnErr: true},
}
for i, example := range examples {
r, err := newring(example.n)
if err != nil {
if example.returnErr {
continue // expecting an error.
}
t.Fatal(err)
}
if got, exp := len(r.partitions), example.n; got != exp {
t.Fatalf("[Example %d] got %v, expected %v", i, got, exp)
}
// Check partitions distributed correctly
partitions := make([]*partition, 0)
for i, partition := range r.continuum {
if i == 0 || partition != partitions[len(partitions)-1] {
partitions = append(partitions, partition)
}
}
if got, exp := len(partitions), example.n; got != exp {
t.Fatalf("[Example %d] got %v, expected %v", i, got, exp)
}
}
}
var strSliceRes []string
func benchmarkRingkeys(b *testing.B, r *ring, keys int) {
// Add some keys
for i := 0; i < keys; i++ {
r.add(fmt.Sprintf("cpu,host=server-%d value=1", i), nil)
}
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
strSliceRes = r.keys(false)
}
}
func BenchmarkRing_keys_100(b *testing.B) { benchmarkRingkeys(b, MustNewRing(256), 100) }
func BenchmarkRing_keys_1000(b *testing.B) { benchmarkRingkeys(b, MustNewRing(256), 1000) }
func BenchmarkRing_keys_10000(b *testing.B) { benchmarkRingkeys(b, MustNewRing(256), 10000) }
func BenchmarkRing_keys_100000(b *testing.B) { benchmarkRingkeys(b, MustNewRing(256), 100000) }
func benchmarkRingWrite(b *testing.B, r *ring, n int) {
for i := 0; i < b.N; i++ {
var wg sync.WaitGroup
for i := 0; i < runtime.GOMAXPROCS(0); i++ {
errC := make(chan error)
wg.Add(1)
go func() {
defer wg.Done()
for j := 0; j < n; j++ {
if err := r.write(fmt.Sprintf("cpu,host=server-%d value=1", j), Values{}); err != nil {
errC <- err
}
}
}()
go func() {
wg.Wait()
close(errC)
}()
for err := range errC {
if err != nil {
b.Error(err)
}
}
}
}
}
func BenchmarkRing_write_1_100(b *testing.B) { benchmarkRingWrite(b, MustNewRing(1), 100) }
func BenchmarkRing_write_1_1000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(1), 1000) }
func BenchmarkRing_write_1_10000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(1), 10000) }
func BenchmarkRing_write_1_100000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(1), 100000) }
func BenchmarkRing_write_4_100(b *testing.B) { benchmarkRingWrite(b, MustNewRing(4), 100) }
func BenchmarkRing_write_4_1000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(4), 1000) }
func BenchmarkRing_write_4_10000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(4), 10000) }
func BenchmarkRing_write_4_100000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(4), 100000) }
func BenchmarkRing_write_32_100(b *testing.B) { benchmarkRingWrite(b, MustNewRing(32), 100) }
func BenchmarkRing_write_32_1000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(32), 1000) }
func BenchmarkRing_write_32_10000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(32), 10000) }
func BenchmarkRing_write_32_100000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(32), 100000) }
func BenchmarkRing_write_128_100(b *testing.B) { benchmarkRingWrite(b, MustNewRing(128), 100) }
func BenchmarkRing_write_128_1000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(128), 1000) }
func BenchmarkRing_write_128_10000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(128), 10000) }
func BenchmarkRing_write_128_100000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(128), 100000) }
func BenchmarkRing_write_256_100(b *testing.B) { benchmarkRingWrite(b, MustNewRing(256), 100) }
func BenchmarkRing_write_256_1000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(256), 1000) }
func BenchmarkRing_write_256_10000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(256), 10000) }
func BenchmarkRing_write_256_100000(b *testing.B) { benchmarkRingWrite(b, MustNewRing(256), 100000) }
func MustNewRing(n int) *ring {
r, err := newring(n)
if err != nil {
panic(err)
}
return r
}

View File

@@ -0,0 +1,133 @@
package tsm1
// String encoding uses snappy compression to compress a block of strings. Each
// string is appended to a byte slice, prefixed with its variable-byte-encoded
// length. The accumulated bytes are then compressed as a whole using the snappy
// compressor, and a 1 byte header is used to indicate the type of encoding.
import (
"encoding/binary"
"fmt"
"github.com/golang/snappy"
)
const (
// stringUncompressed is an uncompressed format encoding strings as raw bytes.
// Not yet implemented.
stringUncompressed = 0
// stringCompressedSnappy is a compressed encoding using Snappy compression
stringCompressedSnappy = 1
)
// StringEncoder encodes multiple strings into a byte slice.
type StringEncoder struct {
// The encoded bytes
bytes []byte
}
// NewStringEncoder returns a new StringEncoder with an initial buffer ready to hold sz bytes.
func NewStringEncoder(sz int) StringEncoder {
return StringEncoder{
bytes: make([]byte, 0, sz),
}
}
// Flush is a no-op.
func (e *StringEncoder) Flush() {}
// Reset sets the encoder back to its initial state.
func (e *StringEncoder) Reset() {
e.bytes = e.bytes[:0]
}
// Write encodes s to the underlying buffer.
func (e *StringEncoder) Write(s string) {
b := make([]byte, 10)
// Append the length of the string using variable byte encoding
i := binary.PutUvarint(b, uint64(len(s)))
e.bytes = append(e.bytes, b[:i]...)
// Append the string bytes
e.bytes = append(e.bytes, s...)
}
// Bytes returns a copy of the underlying buffer.
func (e *StringEncoder) Bytes() ([]byte, error) {
// Compress the currently appended bytes using snappy and prefix with
// a 1 byte header for future extension
data := snappy.Encode(nil, e.bytes)
return append([]byte{stringCompressedSnappy << 4}, data...), nil
}
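// For example (sketch): Write("hi") appends 0x02 'h' 'i' to the buffer, and
// Bytes then snappy-compresses the accumulated buffer and prefixes the single
// header byte stringCompressedSnappy<<4 (0x10).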
// StringDecoder decodes a byte slice into strings.
type StringDecoder struct {
b []byte
l int
i int
err error
}
// SetBytes initializes the decoder with bytes to read from.
// This must be called before calling any other method.
func (e *StringDecoder) SetBytes(b []byte) error {
// First byte stores the encoding type, only have snappy format
// currently so ignore for now.
var data []byte
if len(b) > 0 {
var err error
data, err = snappy.Decode(nil, b[1:])
if err != nil {
return fmt.Errorf("failed to decode string block: %v", err.Error())
}
}
e.b = data
e.l = 0
e.i = 0
e.err = nil
return nil
}
// Next returns true if there are any values remaining to be decoded.
func (e *StringDecoder) Next() bool {
if e.err != nil {
return false
}
e.i += e.l
return e.i < len(e.b)
}
// Read returns the next value from the decoder.
func (e *StringDecoder) Read() string {
// Read the length of the string
length, n := binary.Uvarint(e.b[e.i:])
if n <= 0 {
e.err = fmt.Errorf("StringDecoder: invalid encoded string length")
return ""
}
// The length of this string plus the length of the variable byte encoded length
e.l = int(length) + n
lower := e.i + n
upper := lower + int(length)
if upper < lower {
e.err = fmt.Errorf("StringDecoder: length overflow")
return ""
}
if upper > len(e.b) {
e.err = fmt.Errorf("StringDecoder: not enough data to represent encoded string")
return ""
}
return string(e.b[lower:upper])
}
// Error returns the last error encountered by the decoder.
func (e *StringDecoder) Error() error {
return e.err
}
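// A typical decode loop over an encoded block looks like this (sketch):
//
//	var dec StringDecoder
//	if err := dec.SetBytes(block); err != nil {
//		// snappy decompression failed
//	}
//	for dec.Next() {
//		s := dec.Read()
//		_ = s
//	}
//	if err := dec.Error(); err != nil {
//		// a string length was corrupt
//	}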

View File

@@ -0,0 +1,177 @@
package tsm1
import (
"fmt"
"reflect"
"testing"
"testing/quick"
)
func Test_StringEncoder_NoValues(t *testing.T) {
enc := NewStringEncoder(1024)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var dec StringDecoder
if err := dec.SetBytes(b); err != nil {
t.Fatalf("unexpected error creating string decoder: %v", err)
}
if dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
}
func Test_StringEncoder_Single(t *testing.T) {
enc := NewStringEncoder(1024)
v1 := "v1"
enc.Write(v1)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var dec StringDecoder
if err := dec.SetBytes(b); err != nil {
t.Fatalf("unexpected error creating string decoder: %v", err)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v1 != dec.Read() {
t.Fatalf("unexpected value: got %v, exp %v", dec.Read(), v1)
}
}
func Test_StringEncoder_Multi_Compressed(t *testing.T) {
enc := NewStringEncoder(1024)
values := make([]string, 10)
for i := range values {
values[i] = fmt.Sprintf("value %d", i)
enc.Write(values[i])
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if b[0]>>4 != stringCompressedSnappy {
t.Fatalf("unexpected encoding: got %v, exp %v", b[0], stringCompressedSnappy)
}
if exp := 51; len(b) != exp {
t.Fatalf("unexpected length: got %v, exp %v", len(b), exp)
}
var dec StringDecoder
if err := dec.SetBytes(b); err != nil {
t.Fatalf("unexpected erorr creating string decoder: %v", err)
}
for i, v := range values {
if !dec.Next() {
t.Fatalf("unexpected next value: got false, exp true")
}
if v != dec.Read() {
t.Fatalf("unexpected value at pos %d: got %v, exp %v", i, dec.Read(), v)
}
}
if dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
}
func Test_StringEncoder_Quick(t *testing.T) {
quick.Check(func(values []string) bool {
expected := values
if values == nil {
expected = []string{}
}
// Write values to encoder.
enc := NewStringEncoder(1024)
for _, v := range values {
enc.Write(v)
}
// Retrieve encoded bytes from encoder.
buf, err := enc.Bytes()
if err != nil {
t.Fatal(err)
}
// Read values out of decoder.
got := make([]string, 0, len(values))
var dec StringDecoder
if err := dec.SetBytes(buf); err != nil {
t.Fatal(err)
}
for dec.Next() {
if err := dec.Error(); err != nil {
t.Fatal(err)
}
got = append(got, dec.Read())
}
// Verify that input and output values match.
if !reflect.DeepEqual(expected, got) {
t.Fatalf("mismatch:\n\nexp=%#v\n\ngot=%#v\n\n", expected, got)
}
return true
}, nil)
}
func Test_StringDecoder_Empty(t *testing.T) {
var dec StringDecoder
if err := dec.SetBytes([]byte{}); err != nil {
t.Fatal(err)
}
if dec.Next() {
t.Fatalf("exp Next() == false, got true")
}
}
func Test_StringDecoder_CorruptRead(t *testing.T) {
cases := []string{
"\x10\x03\b\x03Hi", // Higher length than actual data
"\x10\x1dp\x9c\x90\x90\x90\x90\x90\x90\x90\x90\x90length overflow----",
}
for _, c := range cases {
var dec StringDecoder
if err := dec.SetBytes([]byte(c)); err != nil {
t.Fatal(err)
}
if !dec.Next() {
t.Fatalf("exp Next() to return true, got false")
}
_ = dec.Read()
if dec.Error() == nil {
t.Fatalf("exp an err, got nil: %q", c)
}
}
}
func Test_StringDecoder_CorruptSetBytes(t *testing.T) {
cases := []string{
"0t\x00\x01\x000\x00\x01\x000\x00\x01\x000\x00\x01\x000\x00\x01" +
"\x000\x00\x01\x000\x00\x01\x000\x00\x00\x00\xff:\x01\x00\x01\x00\x01" +
"\x00\x01\x00\x01\x00\x01\x00\x010\x010\x000\x010\x010\x010\x01" +
"0\x010\x010\x010\x010\x010\x010\x010\x010\x010\x010", // Upper slice bounds overflows negative
}
for _, c := range cases {
var dec StringDecoder
if err := dec.SetBytes([]byte(c)); err == nil {
t.Fatalf("exp an err, got nil: %q", c)
}
}
}


@@ -0,0 +1,414 @@
package tsm1
// Timestamp encoding is adaptive and based on structure of the timestamps that are encoded. It
// uses a combination of delta encoding, scaling and compression using simple8b, run length encoding
// as well as falling back to no compression if needed.
//
// Timestamp values to be encoded should be sorted before encoding. When encoded, the values are
// first delta-encoded. The first value is the starting timestamp, subsequent values are the difference
// from the prior value.
//
// Timestamp resolution can be as fine as a nanosecond. Many timestamps are monotonically increasing
// and fall on even boundaries of time such as every 10s. When the timestamps have this structure,
// they are scaled by the largest common divisor that is also a factor of 10. This has the effect
// of converting very large integer deltas into very small ones that can be reversed by multiplying them
// by the scaling factor.
//
// Using these adjusted values, if all the deltas are the same, the time range is stored using run
// length encoding. If run length encoding is not possible and all values are less than 1 << 60 - 1
// (~36.5 yrs in nanosecond resolution), then the timestamps are encoded using simple8b encoding. If
// any value exceeds the maximum values, the deltas are stored uncompressed using 8b each.
//
// Each compressed byte slice has a 1 byte header indicating the compression type. The 4 high bits
// indicate the encoding type. The 4 low bits are used by the encoding type.
//
// For run-length encoding, the 4 low bits store the log10 of the scaling factor. The next 8 bytes are
// the starting timestamp, next 1-10 bytes is the delta value using variable-length encoding, finally the
// next 1-10 bytes is the count of values.
//
// For simple8b encoding, the 4 low bits store the log10 of the scaling factor. The next 8 bytes is the
// first delta value stored uncompressed, the remaining bytes are 64-bit words containing compressed delta
// values.
//
// For uncompressed encoding, the delta values are stored using 8 bytes each.
import (
"encoding/binary"
"fmt"
"math"
"github.com/jwilder/encoding/simple8b"
)
const (
// timeUncompressed is an uncompressed format using 8 bytes per timestamp
timeUncompressed = 0
// timeCompressedPackedSimple is a bit-packed format using simple8b encoding
timeCompressedPackedSimple = 1
// timeCompressedRLE is a run-length encoding format
timeCompressedRLE = 2
)
// TimeEncoder encodes time.Time to byte slices.
type TimeEncoder interface {
Write(t int64)
Bytes() ([]byte, error)
Reset()
}
type encoder struct {
ts []uint64
bytes []byte
enc *simple8b.Encoder
}
// NewTimeEncoder returns a TimeEncoder with an initial buffer ready to hold sz bytes.
func NewTimeEncoder(sz int) TimeEncoder {
return &encoder{
ts: make([]uint64, 0, sz),
enc: simple8b.NewEncoder(),
}
}
// Reset sets the encoder back to its initial state.
func (e *encoder) Reset() {
e.ts = e.ts[:0]
e.bytes = e.bytes[:0]
e.enc.Reset()
}
// Write adds a timestamp to the compressed stream.
func (e *encoder) Write(t int64) {
e.ts = append(e.ts, uint64(t))
}
func (e *encoder) reduce() (max, divisor uint64, rle bool, deltas []uint64) {
// Compute the deltas in place to avoid allocating another slice
deltas = e.ts
// Starting values for a max and divisor
max, divisor = 0, 1e12
// Indicates whether the deltas can be run-length encoded
rle = true
// Iterate in reverse so we can apply deltas in place
for i := len(deltas) - 1; i > 0; i-- {
// First differential encode the values
deltas[i] = deltas[i] - deltas[i-1]
// We also need to keep track of the max value and largest common divisor
v := deltas[i]
if v > max {
max = v
}
// Shrink the divisor until it evenly divides this delta (it only ever decreases).
for divisor > 1 && v%divisor != 0 {
divisor /= 10
}
// The last delta trivially starts a run; each earlier delta keeps rle true only if it equals the delta after it. The deltas can be run-length encoded only if they are all equal.
rle = i == len(deltas)-1 || rle && (deltas[i+1] == deltas[i])
}
return
}
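// Worked example (editorial): for timestamps {0s, 10s, 20s, 30s} in
// nanoseconds, reduce rewrites e.ts in place to {0, 1e10, 1e10, 1e10}
// (the first element keeps the starting timestamp), so max == 1e10, the
// divisor settles at 1e10, and rle is true because every delta after the
// first is equal.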
// Bytes returns the encoded bytes of all written times.
func (e *encoder) Bytes() ([]byte, error) {
if len(e.ts) == 0 {
return e.bytes[:0], nil
}
// Maximum and largest common divisor. rle is true if dts (the delta timestamps),
// are all the same.
max, div, rle, dts := e.reduce()
// The deltas are all the same, so we can run-length encode them
if rle && len(e.ts) > 1 {
return e.encodeRLE(e.ts[0], e.ts[1], div, len(e.ts))
}
// We can't compress this time range: a delta exceeds the simple8b maximum (1<<60 - 1)
if max > simple8b.MaxValue {
return e.encodeRaw()
}
return e.encodePacked(div, dts)
}
func (e *encoder) encodePacked(div uint64, dts []uint64) ([]byte, error) {
// Only apply the divisor if it's greater than 1 since division is expensive.
if div > 1 {
for _, v := range dts[1:] {
if err := e.enc.Write(v / div); err != nil {
return nil, err
}
}
} else {
for _, v := range dts[1:] {
if err := e.enc.Write(v); err != nil {
return nil, err
}
}
}
// The compressed deltas
deltas, err := e.enc.Bytes()
if err != nil {
return nil, err
}
sz := 8 + 1 + len(deltas)
if cap(e.bytes) < sz {
e.bytes = make([]byte, sz)
}
b := e.bytes[:sz]
// 4 high bits used for the encoding type
b[0] = byte(timeCompressedPackedSimple) << 4
// 4 low bits are the log10 divisor
b[0] |= byte(math.Log10(float64(div)))
// The first delta value
binary.BigEndian.PutUint64(b[1:9], uint64(dts[0]))
copy(b[9:], deltas)
return b[:9+len(deltas)], nil
}
func (e *encoder) encodeRaw() ([]byte, error) {
sz := 1 + len(e.ts)*8
if cap(e.bytes) < sz {
e.bytes = make([]byte, sz)
}
b := e.bytes[:sz]
b[0] = byte(timeUncompressed) << 4
for i, v := range e.ts {
binary.BigEndian.PutUint64(b[1+i*8:1+i*8+8], uint64(v))
}
return b, nil
}
func (e *encoder) encodeRLE(first, delta, div uint64, n int) ([]byte, error) {
// Worst case: 1 type byte, an 8-byte timestamp, and two varints of up to 10 bytes each; 31 bytes is enough
sz := 31
if cap(e.bytes) < sz {
e.bytes = make([]byte, sz)
}
b := e.bytes[:sz]
// 4 high bits used for the encoding type
b[0] = byte(timeCompressedRLE) << 4
// 4 low bits are the log10 divisor
b[0] |= byte(math.Log10(float64(div)))
i := 1
// The first timestamp
binary.BigEndian.PutUint64(b[i:], uint64(first))
i += 8
// The first delta
i += binary.PutUvarint(b[i:], uint64(delta/div))
// The number of times the delta is repeated
i += binary.PutUvarint(b[i:], uint64(n))
return b[:i], nil
}
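// Worked example (editorial): four timestamps spaced exactly 10s apart
// reduce to a delta of 1e10 ns with a divisor of 1e10, so encodeRLE
// emits an 11-byte block:
//   [0x2A][first timestamp, 8 bytes big-endian][uvarint 1][uvarint 4]
// where 0x2A == timeCompressedRLE<<4 | 10, the 10 being log10 of the
// divisor.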
// TimeDecoder decodes a byte slice into timestamps.
type TimeDecoder struct {
v int64
i, n int
ts []uint64
dec simple8b.Decoder
err error
// The delta value for a run-length encoded byte slice
rleDelta int64
encoding byte
}
// Init initializes the decoder with bytes to read from.
func (d *TimeDecoder) Init(b []byte) {
d.v = 0
d.i = 0
d.ts = d.ts[:0]
d.err = nil
if len(b) > 0 {
// Encoding type is stored in the 4 high bits of the first byte
d.encoding = b[0] >> 4
}
d.decode(b)
}
// Next returns true if there are any timestamps remaining to be decoded.
func (d *TimeDecoder) Next() bool {
if d.err != nil {
return false
}
if d.encoding == timeCompressedRLE {
if d.i >= d.n {
return false
}
d.i++
d.v += d.rleDelta
return d.i < d.n
}
if d.i >= len(d.ts) {
return false
}
d.v = int64(d.ts[d.i])
d.i++
return true
}
// Read returns the next timestamp from the decoder.
func (d *TimeDecoder) Read() int64 {
return d.v
}
// Error returns the last error encountered by the decoder.
func (d *TimeDecoder) Error() error {
return d.err
}
func (d *TimeDecoder) decode(b []byte) {
if len(b) == 0 {
return
}
switch d.encoding {
case timeUncompressed:
d.decodeRaw(b[1:])
case timeCompressedRLE:
d.decodeRLE(b)
case timeCompressedPackedSimple:
d.decodePacked(b)
default:
d.err = fmt.Errorf("unknown encoding: %v", d.encoding)
}
}
func (d *TimeDecoder) decodePacked(b []byte) {
if len(b) < 9 {
d.err = fmt.Errorf("TimeDecoder: not enough data to decode packed timestamps")
return
}
div := uint64(math.Pow10(int(b[0] & 0xF)))
first := uint64(binary.BigEndian.Uint64(b[1:9]))
d.dec.SetBytes(b[9:])
d.i = 0
deltas := d.ts[:0]
deltas = append(deltas, first)
for d.dec.Next() {
deltas = append(deltas, d.dec.Read())
}
// Compute the prefix sum and scale the deltas back up
last := deltas[0]
if div > 1 {
for i := 1; i < len(deltas); i++ {
dgap := deltas[i] * div
deltas[i] = last + dgap
last = deltas[i]
}
} else {
for i := 1; i < len(deltas); i++ {
deltas[i] += last
last = deltas[i]
}
}
d.i = 0
d.ts = deltas
}
func (d *TimeDecoder) decodeRLE(b []byte) {
if len(b) < 9 {
d.err = fmt.Errorf("TimeDecoder: not enough data for initial RLE timestamp")
return
}
var i, n int
// The lower 4 bits hold the base-10 exponent used to scale the values back up
mod := int64(math.Pow10(int(b[i] & 0xF)))
i++
// Next 8 bytes is the starting timestamp
first := binary.BigEndian.Uint64(b[i : i+8])
i += 8
// Next 1-10 bytes is the run's delta value, scaled down by the power-of-10 divisor
value, n := binary.Uvarint(b[i:])
if n <= 0 {
d.err = fmt.Errorf("TimeDecoder: invalid run length in decodeRLE")
return
}
// Scale the value back up
value *= uint64(mod)
i += n
// Last 1-10 bytes is how many times the value repeats
count, n := binary.Uvarint(b[i:])
if n <= 0 {
d.err = fmt.Errorf("TimeDecoder: invalid repeat value in decodeRLE")
return
}
d.v = int64(first - value)
d.rleDelta = int64(value)
d.i = -1
d.n = int(count)
}
func (d *TimeDecoder) decodeRaw(b []byte) {
d.i = 0
d.ts = make([]uint64, len(b)/8)
for i := range d.ts {
d.ts[i] = binary.BigEndian.Uint64(b[i*8 : i*8+8])
delta := d.ts[i]
// Compute the prefix sum and scale the deltas back up
if i > 0 {
d.ts[i] = d.ts[i-1] + delta
}
}
}
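// Note (editorial): an uncompressed block is 1 header byte plus 8 bytes
// per timestamp, so e.g. three raw timestamps occupy 1 + 3*8 == 25 bytes.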
func CountTimestamps(b []byte) int {
if len(b) == 0 {
return 0
}
// Encoding type is stored in the 4 high bits of the first byte
encoding := b[0] >> 4
switch encoding {
case timeUncompressed:
// Uncompressed timestamps are just 8 bytes each
return len(b[1:]) / 8
case timeCompressedRLE:
// First 9 bytes are the starting timestamp and scaling factor, skip over them
i := 9
// Next 1-10 bytes is the scaled-down delta value; skip over it
_, n := binary.Uvarint(b[9:])
i += n
// Last 1-10 bytes is how many times the value repeats
count, _ := binary.Uvarint(b[i:])
return int(count)
case timeCompressedPackedSimple:
// First 9 bytes are the starting timestamp and scaling factor, skip over them
count, _ := simple8b.CountBytes(b[9:])
return count + 1 // +1 for the first, uncompressed timestamp stored in b[1:9]
default:
return 0
}
}
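// Note (editorial): for RLE blocks the count is read straight from the
// trailing uvarint, so e.g. a 12-byte RLE block holding 500 evenly spaced
// timestamps reports 500 without decoding a single delta.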


@@ -0,0 +1,604 @@
package tsm1
import (
"reflect"
"testing"
"testing/quick"
"time"
)
func Test_TimeEncoder(t *testing.T) {
enc := NewTimeEncoder(1)
x := []int64{}
now := time.Unix(0, 0)
x = append(x, now.UnixNano())
enc.Write(now.UnixNano())
for i := 1; i < 4; i++ {
x = append(x, now.Add(time.Duration(i)*time.Second).UnixNano())
enc.Write(x[i])
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; got != timeCompressedRLE {
t.Fatalf("Wrong encoding used: expected rle, got %v", got)
}
var dec TimeDecoder
dec.Init(b)
for i, v := range x {
if !dec.Next() {
t.Fatalf("Next == false, expected true")
}
if v != dec.Read() {
t.Fatalf("Item %d mismatch, got %v, exp %v", i, dec.Read(), v)
}
}
}
func Test_TimeEncoder_NoValues(t *testing.T) {
enc := NewTimeEncoder(0)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var dec TimeDecoder
dec.Init(b)
if dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
}
func Test_TimeEncoder_One(t *testing.T) {
enc := NewTimeEncoder(1)
var tm int64
enc.Write(tm)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; got != timeCompressedPackedSimple {
t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got)
}
var dec TimeDecoder
dec.Init(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if tm != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), tm)
}
}
func Test_TimeEncoder_Two(t *testing.T) {
enc := NewTimeEncoder(2)
t1 := int64(0)
t2 := int64(1)
enc.Write(t1)
enc.Write(t2)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; got != timeCompressedRLE {
t.Fatalf("Wrong encoding used: expected rle, got %v", got)
}
var dec TimeDecoder
dec.Init(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if t1 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t1)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if t2 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t2)
}
}
func Test_TimeEncoder_Three(t *testing.T) {
enc := NewTimeEncoder(3)
t1 := int64(0)
t2 := int64(1)
t3 := int64(3)
enc.Write(t1)
enc.Write(t2)
enc.Write(t3)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; got != timeCompressedPackedSimple {
t.Fatalf("Wrong encoding used: expected rle, got %v", got)
}
var dec TimeDecoder
dec.Init(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if t1 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t1)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if t2 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t2)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if t3 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t3)
}
}
func Test_TimeEncoder_Large_Range(t *testing.T) {
enc := NewTimeEncoder(2)
t1 := int64(1442369134000000000)
t2 := int64(1442369135000000000)
enc.Write(t1)
enc.Write(t2)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; got != timeCompressedRLE {
t.Fatalf("Wrong encoding used: expected rle, got %v", got)
}
var dec TimeDecoder
dec.Init(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if t1 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t1)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if t2 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t2)
}
}
func Test_TimeEncoder_Uncompressed(t *testing.T) {
enc := NewTimeEncoder(3)
t1 := time.Unix(0, 0).UnixNano()
t2 := time.Unix(1, 0).UnixNano()
// about 36.5yrs in NS resolution is max range for compressed format
// This should cause the encoding to fallback to raw points
t3 := time.Unix(2, (2 << 59)).UnixNano()
enc.Write(t1)
enc.Write(t2)
enc.Write(t3)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("expected error: %v", err)
}
if exp := 25; len(b) != exp {
t.Fatalf("length mismatch: got %v, exp %v", len(b), exp)
}
if got := b[0] >> 4; got != timeUncompressed {
t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got)
}
var dec TimeDecoder
dec.Init(b)
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if t1 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t1)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if t2 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t2)
}
if !dec.Next() {
t.Fatalf("unexpected next value: got true, exp false")
}
if t3 != dec.Read() {
t.Fatalf("read value mismatch: got %v, exp %v", dec.Read(), t3)
}
}
func Test_TimeEncoder_RLE(t *testing.T) {
enc := NewTimeEncoder(512)
var ts []int64
for i := 0; i < 500; i++ {
ts = append(ts, int64(i))
}
for _, v := range ts {
enc.Write(v)
}
b, err := enc.Bytes()
if exp := 12; len(b) != exp {
t.Fatalf("length mismatch: got %v, exp %v", len(b), exp)
}
if got := b[0] >> 4; got != timeCompressedRLE {
t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got)
}
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var dec TimeDecoder
dec.Init(b)
for i, v := range ts {
if !dec.Next() {
t.Fatalf("Next == false, expected true")
}
if v != dec.Read() {
t.Fatalf("Item %d mismatch, got %v, exp %v", i, dec.Read(), v)
}
}
if dec.Next() {
t.Fatalf("unexpected extra values")
}
}
func Test_TimeEncoder_Reverse(t *testing.T) {
enc := NewTimeEncoder(3)
ts := []int64{
int64(3),
int64(2),
int64(0),
}
for _, v := range ts {
enc.Write(v)
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; got != timeUncompressed {
t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got)
}
var dec TimeDecoder
dec.Init(b)
i := 0
for dec.Next() {
if ts[i] != dec.Read() {
t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), ts[i])
}
i++
}
}
func Test_TimeEncoder_220SecondDelta(t *testing.T) {
enc := NewTimeEncoder(256)
var ts []int64
now := time.Now()
for i := 0; i < 220; i++ {
ts = append(ts, now.Add(time.Duration(i*60)*time.Second).UnixNano())
}
for _, v := range ts {
enc.Write(v)
}
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
// Using RLE, should get 12 bytes
if exp := 12; len(b) != exp {
t.Fatalf("unexpected length: got %v, exp %v", len(b), exp)
}
if got := b[0] >> 4; got != timeCompressedRLE {
t.Fatalf("Wrong encoding used: expected uncompressed, got %v", got)
}
var dec TimeDecoder
dec.Init(b)
i := 0
for dec.Next() {
if ts[i] != dec.Read() {
t.Fatalf("read value %d mismatch: got %v, exp %v", i, dec.Read(), ts[i])
}
i++
}
if i != len(ts) {
t.Fatalf("Read too few values: exp %d, got %d", len(ts), i)
}
if dec.Next() {
t.Fatalf("expecte Next() = false, got true")
}
}
func Test_TimeEncoder_Quick(t *testing.T) {
quick.Check(func(values []int64) bool {
// Write values to encoder.
enc := NewTimeEncoder(1024)
exp := make([]int64, len(values))
for i, v := range values {
exp[i] = int64(v)
enc.Write(exp[i])
}
// Retrieve encoded bytes from encoder.
buf, err := enc.Bytes()
if err != nil {
t.Fatal(err)
}
// Read values out of decoder.
got := make([]int64, 0, len(values))
var dec TimeDecoder
dec.Init(buf)
for dec.Next() {
if err := dec.Error(); err != nil {
t.Fatal(err)
}
got = append(got, dec.Read())
}
// Verify that input and output values match.
if !reflect.DeepEqual(exp, got) {
t.Fatalf("mismatch:\n\nexp=%+v\n\ngot=%+v\n\n", exp, got)
}
return true
}, nil)
}
func Test_TimeEncoder_RLESeconds(t *testing.T) {
enc := NewTimeEncoder(6)
ts := make([]int64, 6)
ts[0] = int64(1444448158000000000)
ts[1] = int64(1444448168000000000)
ts[2] = int64(1444448178000000000)
ts[3] = int64(1444448188000000000)
ts[4] = int64(1444448198000000000)
ts[5] = int64(1444448208000000000)
for _, v := range ts {
enc.Write(v)
}
b, err := enc.Bytes()
if got := b[0] >> 4; got != timeCompressedRLE {
t.Fatalf("Wrong encoding used: expected rle, got %v", got)
}
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
var dec TimeDecoder
dec.Init(b)
for i, v := range ts {
if !dec.Next() {
t.Fatalf("Next == false, expected true")
}
if v != dec.Read() {
t.Fatalf("Item %d mismatch, got %v, exp %v", i, dec.Read(), v)
}
}
if dec.Next() {
t.Fatalf("unexpected extra values")
}
}
func TestTimeEncoder_Count_Uncompressed(t *testing.T) {
enc := NewTimeEncoder(2)
t1 := time.Unix(0, 0).UnixNano()
t2 := time.Unix(1, 0).UnixNano()
// about 36.5yrs in NS resolution is max range for compressed format
// This should cause the encoding to fallback to raw points
t3 := time.Unix(2, (2 << 59)).UnixNano()
enc.Write(t1)
enc.Write(t2)
enc.Write(t3)
b, err := enc.Bytes()
if got := b[0] >> 4; got != timeUncompressed {
t.Fatalf("Wrong encoding used: expected rle, got %v", got)
}
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got, exp := CountTimestamps(b), 3; got != exp {
t.Fatalf("count mismatch: got %v, exp %v", got, exp)
}
}
func TestTimeEncoder_Count_RLE(t *testing.T) {
enc := NewTimeEncoder(5)
ts := make([]int64, 6)
ts[0] = int64(1444448158000000000)
ts[1] = int64(1444448168000000000)
ts[2] = int64(1444448178000000000)
ts[3] = int64(1444448188000000000)
ts[4] = int64(1444448198000000000)
ts[5] = int64(1444448208000000000)
for _, v := range ts {
enc.Write(v)
}
b, err := enc.Bytes()
if got := b[0] >> 4; got != timeCompressedRLE {
t.Fatalf("Wrong encoding used: expected rle, got %v", got)
}
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got, exp := CountTimestamps(b), len(ts); got != exp {
t.Fatalf("count mismatch: got %v, exp %v", got, exp)
}
}
func TestTimeEncoder_Count_Simple8(t *testing.T) {
enc := NewTimeEncoder(3)
t1 := int64(0)
t2 := int64(1)
t3 := int64(3)
enc.Write(t1)
enc.Write(t2)
enc.Write(t3)
b, err := enc.Bytes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got := b[0] >> 4; got != timeCompressedPackedSimple {
t.Fatalf("Wrong encoding used: expected rle, got %v", got)
}
if got, exp := CountTimestamps(b), 3; got != exp {
t.Fatalf("count mismatch: got %v, exp %v", got, exp)
}
}
func TestTimeDecoder_Corrupt(t *testing.T) {
cases := []string{
"", // Empty
"\x10\x14", // Packed: not enough data
"\x20\x00", // RLE: not enough data for starting timestamp
"\x2012345678\x90", // RLE: initial timestamp but invalid uvarint encoding
"\x2012345678\x7f", // RLE: timestamp, RLE but invalid repeat
"\x00123", // Raw: data length not multiple of 8
}
for _, c := range cases {
var dec TimeDecoder
dec.Init([]byte(c))
if dec.Next() {
t.Fatalf("exp next == false, got true")
}
}
}
func BenchmarkTimeEncoder(b *testing.B) {
enc := NewTimeEncoder(1024)
x := make([]int64, 1024)
for i := 0; i < len(x); i++ {
x[i] = time.Now().UnixNano()
enc.Write(x[i])
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
enc.Bytes()
enc.Reset()
for i := 0; i < len(x); i++ {
enc.Write(x[i])
}
}
}
func BenchmarkTimeDecoder_Packed(b *testing.B) {
x := make([]int64, 1024)
enc := NewTimeEncoder(1024)
for i := 0; i < len(x); i++ {
x[i] = time.Now().UnixNano()
enc.Write(x[i])
}
bytes, _ := enc.Bytes()
b.ResetTimer()
var dec TimeDecoder
for i := 0; i < b.N; i++ {
dec.Init(bytes)
for dec.Next() {
}
}
}
func BenchmarkTimeDecoder_RLE(b *testing.B) {
x := make([]int64, 1024)
enc := NewTimeEncoder(1024)
for i := 0; i < len(x); i++ {
x[i] = int64(i * 10)
enc.Write(x[i])
}
bytes, _ := enc.Bytes()
b.ResetTimer()
b.StopTimer()
var dec TimeDecoder
b.StartTimer()
for i := 0; i < b.N; i++ {
dec.Init(bytes)
for dec.Next() {
}
}
}


@@ -0,0 +1,342 @@
package tsm1
import (
"bufio"
"encoding/binary"
"io"
"io/ioutil"
"math"
"os"
"path/filepath"
"strings"
"sync"
)
const (
v2header = 0x1502
v2headerSize = 4
)
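// Editorial note: the header is written big-endian (see writeTombstone),
// so a v2 tombstone file begins with the bytes 0x00 0x00 0x15 0x02.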
// Tombstoner records tombstones when entries are deleted.
type Tombstoner struct {
mu sync.RWMutex
// Path is the location of the file to record tombstone. This should be the
// full path to a TSM file.
Path string
// cache of the stats for this tombstone
fileStats []FileStat
// indicates that the stats may be out of sync with what is on disk and they
// should be refreshed.
statsLoaded bool
}
// Tombstone represents an individual deletion.
type Tombstone struct {
// Key is the tombstoned series key.
Key string
// Min and Max are the min and max unix nanosecond time ranges of Key that are deleted. If
// the full range is deleted, both values are -1.
Min, Max int64
}
// Add adds all keys, across all timestamps, to the tombstone.
func (t *Tombstoner) Add(keys []string) error {
return t.AddRange(keys, math.MinInt64, math.MaxInt64)
}
// AddRange adds all keys to the tombstone specifying only the data between min and max to be removed.
func (t *Tombstoner) AddRange(keys []string, min, max int64) error {
if len(keys) == 0 {
return nil
}
t.mu.Lock()
defer t.mu.Unlock()
// If this TSMFile has not been written (mainly in tests), don't write a
// tombstone because the keys will not be written when it's actually saved.
if t.Path == "" {
return nil
}
t.statsLoaded = false
tombstones, err := t.readTombstone()
if err != nil {
return err
}
if cap(tombstones) < len(tombstones)+len(keys) {
ts := make([]Tombstone, len(tombstones), len(tombstones)+len(keys))
copy(ts, tombstones)
tombstones = ts
}
for _, k := range keys {
tombstones = append(tombstones, Tombstone{
Key: k,
Min: min,
Max: max,
})
}
return t.writeTombstone(tombstones)
}
// ReadAll returns all the tombstones in the Tombstoner's directory.
func (t *Tombstoner) ReadAll() ([]Tombstone, error) {
return t.readTombstone()
}
// Delete removes all the tombstone files from disk.
func (t *Tombstoner) Delete() error {
t.mu.Lock()
defer t.mu.Unlock()
if err := os.RemoveAll(t.tombstonePath()); err != nil {
return err
}
t.statsLoaded = false
return nil
}
// HasTombstones returns true if there are any tombstone entries recorded.
func (t *Tombstoner) HasTombstones() bool {
files := t.TombstoneFiles()
return len(files) > 0 && files[0].Size > 0
}
// TombstoneFiles returns any tombstone files associated with Tombstoner's TSM file.
func (t *Tombstoner) TombstoneFiles() []FileStat {
t.mu.RLock()
if t.statsLoaded {
stats := t.fileStats
t.mu.RUnlock()
return stats
}
t.mu.RUnlock()
stat, err := os.Stat(t.tombstonePath())
if err != nil {
t.mu.Lock()
// The file doesn't exist so record that we tried to load it so
// we don't continue to keep trying. This is the common case.
t.statsLoaded = os.IsNotExist(err)
t.fileStats = t.fileStats[:0]
t.mu.Unlock()
return nil
}
t.mu.Lock()
t.fileStats = append(t.fileStats[:0], FileStat{
Path: t.tombstonePath(),
LastModified: stat.ModTime().UnixNano(),
Size: uint32(stat.Size()),
})
t.statsLoaded = true
stats := t.fileStats
t.mu.Unlock()
return stats
}
// Walk calls fn for every Tombstone under the Tombstoner.
func (t *Tombstoner) Walk(fn func(t Tombstone) error) error {
f, err := os.Open(t.tombstonePath())
if os.IsNotExist(err) {
return nil
} else if err != nil {
return err
}
defer f.Close()
var b [4]byte
if _, err := f.Read(b[:]); err != nil {
// Might be a zero length file which should not exist, but
// an old bug allowed them to occur. Treat it as an empty
// v1 tombstone file so we don't abort loading the TSM file.
return t.readTombstoneV1(f, fn)
}
if _, err := f.Seek(0, io.SeekStart); err != nil {
return err
}
if binary.BigEndian.Uint32(b[:]) == v2header {
return t.readTombstoneV2(f, fn)
}
return t.readTombstoneV1(f, fn)
}
func (t *Tombstoner) writeTombstone(tombstones []Tombstone) error {
tmp, err := ioutil.TempFile(filepath.Dir(t.Path), "tombstone")
if err != nil {
return err
}
defer tmp.Close()
var b [8]byte
bw := bufio.NewWriterSize(tmp, 1024*1024)
binary.BigEndian.PutUint32(b[:4], v2header)
if _, err := bw.Write(b[:4]); err != nil {
return err
}
for _, t := range tombstones {
binary.BigEndian.PutUint32(b[:4], uint32(len(t.Key)))
if _, err := bw.Write(b[:4]); err != nil {
return err
}
if _, err := bw.WriteString(t.Key); err != nil {
return err
}
binary.BigEndian.PutUint64(b[:], uint64(t.Min))
if _, err := bw.Write(b[:]); err != nil {
return err
}
binary.BigEndian.PutUint64(b[:], uint64(t.Max))
if _, err := bw.Write(b[:]); err != nil {
return err
}
}
if err := bw.Flush(); err != nil {
return err
}
// fsync the file to flush the write
if err := tmp.Sync(); err != nil {
return err
}
tmpFilename := tmp.Name()
tmp.Close()
if err := renameFile(tmpFilename, t.tombstonePath()); err != nil {
return err
}
return syncDir(filepath.Dir(t.tombstonePath()))
}
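// Editorial summary of the v2 on-disk layout written above:
//
//   [4-byte big-endian header 0x1502]
//   then, repeated once per tombstone:
//   [4-byte key length][key bytes][8-byte min time][8-byte max time]
//
// All integers are big-endian; min and max are unix nanoseconds.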
func (t *Tombstoner) readTombstone() ([]Tombstone, error) {
var tombstones []Tombstone
if err := t.Walk(func(t Tombstone) error {
tombstones = append(tombstones, t)
return nil
}); err != nil {
return nil, err
}
return tombstones, nil
}
// readTombstoneV1 reads the first version of tombstone files that were not
// capable of storing a min and max time for a key. This is used for backwards
// compatibility with versions prior to 0.13. This format is a simple newline
// separated text file.
func (t *Tombstoner) readTombstoneV1(f *os.File, fn func(t Tombstone) error) error {
r := bufio.NewScanner(f)
for r.Scan() {
line := r.Text()
if line == "" {
continue
}
if err := fn(Tombstone{
Key: line,
Min: math.MinInt64,
Max: math.MaxInt64,
}); err != nil {
return err
}
}
return r.Err()
}
// readTombstoneV2 reads the second version of tombstone files that are capable
// of storing keys and the range of time for the key that points were deleted. This
// format is binary.
func (t *Tombstoner) readTombstoneV2(f *os.File, fn func(t Tombstone) error) error {
// Skip header, already checked earlier
if _, err := f.Seek(v2headerSize, io.SeekStart); err != nil {
return err
}
n := int64(4)
fi, err := f.Stat()
if err != nil {
return err
}
size := fi.Size()
var (
min, max int64
key string
)
b := make([]byte, 4096)
for {
if n >= size {
return nil
}
if _, err = f.Read(b[:4]); err != nil {
return err
}
n += 4
keyLen := int(binary.BigEndian.Uint32(b[:4]))
if keyLen > len(b) {
b = make([]byte, keyLen)
}
if _, err := f.Read(b[:keyLen]); err != nil {
return err
}
key = string(b[:keyLen])
n += int64(keyLen)
if _, err := f.Read(b[:8]); err != nil {
return err
}
n += 8
min = int64(binary.BigEndian.Uint64(b[:8]))
if _, err := f.Read(b[:8]); err != nil {
return err
}
n += 8
max = int64(binary.BigEndian.Uint64(b[:8]))
if err := fn(Tombstone{
Key: key,
Min: min,
Max: max,
}); err != nil {
return err
}
}
}
func (t *Tombstoner) tombstonePath() string {
if strings.HasSuffix(t.Path, "tombstone") {
return t.Path
}
// Filename is 0000001.tsm1
filename := filepath.Base(t.Path)
// Strip off the tsm1
ext := filepath.Ext(filename)
if ext != "" {
filename = strings.TrimSuffix(filename, ext)
}
// Append the "tombstone" suffix to create a 0000001.tombstone file
return filepath.Join(filepath.Dir(t.Path), filename+".tombstone")
}


@@ -0,0 +1,236 @@
package tsm1_test
import (
"io/ioutil"
"os"
"testing"
"github.com/influxdata/influxdb/tsdb/engine/tsm1"
)
func TestTombstoner_Add(t *testing.T) {
dir := MustTempDir()
defer func() { os.RemoveAll(dir) }()
f := MustTempFile(dir)
ts := &tsm1.Tombstoner{Path: f.Name()}
entries, err := ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
if got, exp := len(entries), 0; got != exp {
t.Fatalf("length mismatch: got %v, exp %v", got, exp)
}
stats := ts.TombstoneFiles()
if got, exp := len(stats), 0; got != exp {
t.Fatalf("stat length mismatch: got %v, exp %v", got, exp)
}
ts.Add([]string{"foo"})
entries, err = ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
stats = ts.TombstoneFiles()
if got, exp := len(stats), 1; got != exp {
t.Fatalf("stat length mismatch: got %v, exp %v", got, exp)
}
if stats[0].Size == 0 {
t.Fatalf("got size %v, exp > 0", stats[0].Size)
}
if stats[0].LastModified == 0 {
t.Fatalf("got lastModified %v, exp > 0", stats[0].LastModified)
}
if stats[0].Path == "" {
t.Fatalf("got path %v, exp != ''", stats[0].Path)
}
if got, exp := len(entries), 1; got != exp {
t.Fatalf("length mismatch: got %v, exp %v", got, exp)
}
if got, exp := entries[0].Key, "foo"; got != exp {
t.Fatalf("value mismatch: got %v, exp %v", got, exp)
}
// Use a new Tombstoner to verify values are persisted
ts = &tsm1.Tombstoner{Path: f.Name()}
entries, err = ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
if got, exp := len(entries), 1; got != exp {
t.Fatalf("length mismatch: got %v, exp %v", got, exp)
}
if got, exp := entries[0].Key, "foo"; got != exp {
t.Fatalf("value mismatch: got %v, exp %v", got, exp)
}
}
func TestTombstoner_Add_Empty(t *testing.T) {
dir := MustTempDir()
defer func() { os.RemoveAll(dir) }()
f := MustTempFile(dir)
ts := &tsm1.Tombstoner{Path: f.Name()}
entries, err := ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
if got, exp := len(entries), 0; got != exp {
t.Fatalf("length mismatch: got %v, exp %v", got, exp)
}
ts.Add([]string{})
// Use a new Tombstoner to verify values are persisted
ts = &tsm1.Tombstoner{Path: f.Name()}
entries, err = ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
if got, exp := len(entries), 0; got != exp {
t.Fatalf("length mismatch: got %v, exp %v", got, exp)
}
stats := ts.TombstoneFiles()
if got, exp := len(stats), 0; got != exp {
t.Fatalf("stat length mismatch: got %v, exp %v", got, exp)
}
}
func TestTombstoner_Delete(t *testing.T) {
dir := MustTempDir()
defer func() { os.RemoveAll(dir) }()
f := MustTempFile(dir)
ts := &tsm1.Tombstoner{Path: f.Name()}
ts.Add([]string{"foo"})
// Use a new Tombstoner to verify values are persisted
ts = &tsm1.Tombstoner{Path: f.Name()}
entries, err := ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
if got, exp := len(entries), 1; got != exp {
t.Fatalf("length mismatch: got %v, exp %v", got, exp)
}
if got, exp := entries[0].Key, "foo"; got != exp {
t.Fatalf("value mismatch: got %v, exp %v", got, exp)
}
if err := ts.Delete(); err != nil {
fatal(t, "delete tombstone", err)
}
stats := ts.TombstoneFiles()
if got, exp := len(stats), 0; got != exp {
t.Fatalf("stat length mismatch: got %v, exp %v", got, exp)
}
ts = &tsm1.Tombstoner{Path: f.Name()}
entries, err = ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
if got, exp := len(entries), 0; got != exp {
t.Fatalf("length mismatch: got %v, exp %v", got, exp)
}
}
func TestTombstoner_ReadV1(t *testing.T) {
dir := MustTempDir()
defer func() { os.RemoveAll(dir) }()
f := MustTempFile(dir)
if err := ioutil.WriteFile(f.Name(), []byte("foo\n"), 0x0600); err != nil {
t.Fatalf("write v1 file: %v", err)
}
f.Close()
if err := os.Rename(f.Name(), f.Name()+".tombstone"); err != nil {
t.Fatalf("rename tombstone failed: %v", err)
}
ts := &tsm1.Tombstoner{Path: f.Name()}
_, err := ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
entries, err := ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
if got, exp := len(entries), 1; got != exp {
t.Fatalf("length mismatch: got %v, exp %v", got, exp)
}
if got, exp := entries[0].Key, "foo"; got != exp {
t.Fatalf("value mismatch: got %v, exp %v", got, exp)
}
// Use a new Tombstoner to verify values are persisted
ts = &tsm1.Tombstoner{Path: f.Name()}
entries, err = ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
if got, exp := len(entries), 1; got != exp {
t.Fatalf("length mismatch: got %v, exp %v", got, exp)
}
if got, exp := entries[0].Key, "foo"; got != exp {
t.Fatalf("value mismatch: got %v, exp %v", got, exp)
}
}
func TestTombstoner_ReadEmptyV1(t *testing.T) {
dir := MustTempDir()
defer func() { os.RemoveAll(dir) }()
f := MustTempFile(dir)
f.Close()
if err := os.Rename(f.Name(), f.Name()+".tombstone"); err != nil {
t.Fatalf("rename tombstone failed: %v", err)
}
ts := &tsm1.Tombstoner{Path: f.Name()}
_, err := ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
entries, err := ts.ReadAll()
if err != nil {
fatal(t, "ReadAll", err)
}
if got, exp := len(entries), 0; got != exp {
t.Fatalf("length mismatch: got %v, exp %v", got, exp)
}
}

File diff suppressed because it is too large


@@ -0,0 +1,768 @@
package tsm1_test
import (
"fmt"
"io"
"os"
"testing"
"github.com/influxdata/influxdb/tsdb/engine/tsm1"
"github.com/golang/snappy"
)
func TestWALWriter_WriteMulti_Single(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w := tsm1.NewWALSegmentWriter(f)
p1 := tsm1.NewValue(1, 1.1)
p2 := tsm1.NewValue(1, int64(1))
p3 := tsm1.NewValue(1, true)
p4 := tsm1.NewValue(1, "string")
values := map[string][]tsm1.Value{
"cpu,host=A#!~#float": []tsm1.Value{p1},
"cpu,host=A#!~#int": []tsm1.Value{p2},
"cpu,host=A#!~#bool": []tsm1.Value{p3},
"cpu,host=A#!~#string": []tsm1.Value{p4},
}
entry := &tsm1.WriteWALEntry{
Values: values,
}
if err := w.Write(mustMarshalEntry(entry)); err != nil {
fatal(t, "write points", err)
}
if err := w.Flush(); err != nil {
fatal(t, "flush", err)
}
if _, err := f.Seek(0, io.SeekStart); err != nil {
fatal(t, "seek", err)
}
r := tsm1.NewWALSegmentReader(f)
if !r.Next() {
t.Fatalf("expected next, got false")
}
we, err := r.Read()
if err != nil {
fatal(t, "read entry", err)
}
e, ok := we.(*tsm1.WriteWALEntry)
if !ok {
t.Fatalf("expected WriteWALEntry: got %#v", e)
}
for k, v := range e.Values {
for i, vv := range v {
if got, exp := vv.String(), values[k][i].String(); got != exp {
t.Fatalf("points mismatch: got %v, exp %v", got, exp)
}
}
}
if n := r.Count(); n != MustReadFileSize(f) {
t.Fatalf("wrong count of bytes read, got %d, exp %d", n, MustReadFileSize(f))
}
}
func TestWALWriter_WriteMulti_LargeBatch(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w := tsm1.NewWALSegmentWriter(f)
var points []tsm1.Value
for i := 0; i < 100000; i++ {
points = append(points, tsm1.NewValue(int64(i), int64(1)))
}
values := map[string][]tsm1.Value{
"cpu,host=A,server=01,foo=bar,tag=really-long#!~#float": points,
"mem,host=A,server=01,foo=bar,tag=really-long#!~#float": points,
}
entry := &tsm1.WriteWALEntry{
Values: values,
}
if err := w.Write(mustMarshalEntry(entry)); err != nil {
fatal(t, "write points", err)
}
if err := w.Flush(); err != nil {
fatal(t, "flush", err)
}
if _, err := f.Seek(0, io.SeekStart); err != nil {
fatal(t, "seek", err)
}
r := tsm1.NewWALSegmentReader(f)
if !r.Next() {
t.Fatalf("expected next, got false")
}
we, err := r.Read()
if err != nil {
fatal(t, "read entry", err)
}
e, ok := we.(*tsm1.WriteWALEntry)
if !ok {
t.Fatalf("expected WriteWALEntry: got %#v", e)
}
for k, v := range e.Values {
for i, vv := range v {
if got, exp := vv.String(), values[k][i].String(); got != exp {
t.Fatalf("points mismatch: got %v, exp %v", got, exp)
}
}
}
if n := r.Count(); n != MustReadFileSize(f) {
t.Fatalf("wrong count of bytes read, got %d, exp %d", n, MustReadFileSize(f))
}
}
func TestWALWriter_WriteMulti_Multiple(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w := tsm1.NewWALSegmentWriter(f)
p1 := tsm1.NewValue(1, int64(1))
p2 := tsm1.NewValue(1, int64(2))
exp := []struct {
key string
values []tsm1.Value
}{
{"cpu,host=A#!~#value", []tsm1.Value{p1}},
{"cpu,host=B#!~#value", []tsm1.Value{p2}},
}
for _, v := range exp {
entry := &tsm1.WriteWALEntry{
Values: map[string][]tsm1.Value{v.key: v.values},
}
if err := w.Write(mustMarshalEntry(entry)); err != nil {
fatal(t, "write points", err)
}
if err := w.Flush(); err != nil {
fatal(t, "flush", err)
}
}
// Seek back to the beginning of the file for reading
if _, err := f.Seek(0, io.SeekStart); err != nil {
fatal(t, "seek", err)
}
r := tsm1.NewWALSegmentReader(f)
for _, ep := range exp {
if !r.Next() {
t.Fatalf("expected next, got false")
}
we, err := r.Read()
if err != nil {
fatal(t, "read entry", err)
}
e, ok := we.(*tsm1.WriteWALEntry)
if !ok {
t.Fatalf("expected WriteWALEntry: got %#v", e)
}
for k, v := range e.Values {
if got, exp := k, ep.key; got != exp {
t.Fatalf("key mismatch. got %v, exp %v", got, exp)
}
if got, exp := len(v), len(ep.values); got != exp {
t.Fatalf("values length mismatch: got %v, exp %v", got, exp)
}
for i, vv := range v {
if got, exp := vv.String(), ep.values[i].String(); got != exp {
t.Fatalf("points mismatch: got %v, exp %v", got, exp)
}
}
}
}
if n := r.Count(); n != MustReadFileSize(f) {
t.Fatalf("wrong count of bytes read, got %d, exp %d", n, MustReadFileSize(f))
}
}
func TestWALWriter_WriteDelete_Single(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w := tsm1.NewWALSegmentWriter(f)
entry := &tsm1.DeleteWALEntry{
Keys: []string{"cpu"},
}
if err := w.Write(mustMarshalEntry(entry)); err != nil {
fatal(t, "write points", err)
}
if err := w.Flush(); err != nil {
fatal(t, "flush", err)
}
if _, err := f.Seek(0, io.SeekStart); err != nil {
fatal(t, "seek", err)
}
r := tsm1.NewWALSegmentReader(f)
if !r.Next() {
t.Fatalf("expected next, got false")
}
we, err := r.Read()
if err != nil {
fatal(t, "read entry", err)
}
e, ok := we.(*tsm1.DeleteWALEntry)
if !ok {
t.Fatalf("expected WriteWALEntry: got %#v", e)
}
if got, exp := len(e.Keys), len(entry.Keys); got != exp {
t.Fatalf("key length mismatch: got %v, exp %v", got, exp)
}
if got, exp := e.Keys[0], entry.Keys[0]; got != exp {
t.Fatalf("key mismatch: got %v, exp %v", got, exp)
}
}
func TestWALWriter_WriteMultiDelete_Multiple(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w := tsm1.NewWALSegmentWriter(f)
p1 := tsm1.NewValue(1, true)
values := map[string][]tsm1.Value{
"cpu,host=A#!~#value": []tsm1.Value{p1},
}
writeEntry := &tsm1.WriteWALEntry{
Values: values,
}
if err := w.Write(mustMarshalEntry(writeEntry)); err != nil {
fatal(t, "write points", err)
}
if err := w.Flush(); err != nil {
fatal(t, "flush", err)
}
// Write the delete entry
deleteEntry := &tsm1.DeleteWALEntry{
Keys: []string{"cpu,host=A#!~value"},
}
if err := w.Write(mustMarshalEntry(deleteEntry)); err != nil {
fatal(t, "write points", err)
}
if err := w.Flush(); err != nil {
fatal(t, "flush", err)
}
// Seek back to the beginning of the file for reading
if _, err := f.Seek(0, io.SeekStart); err != nil {
fatal(t, "seek", err)
}
r := tsm1.NewWALSegmentReader(f)
// Read the write points first
if !r.Next() {
t.Fatalf("expected next, got false")
}
we, err := r.Read()
if err != nil {
fatal(t, "read entry", err)
}
e, ok := we.(*tsm1.WriteWALEntry)
if !ok {
t.Fatalf("expected WriteWALEntry: got %#v", e)
}
for k, v := range e.Values {
if got, exp := len(v), len(values[k]); got != exp {
t.Fatalf("values length mismatch: got %v, exp %v", got, exp)
}
for i, vv := range v {
if got, exp := vv.String(), values[k][i].String(); got != exp {
t.Fatalf("points mismatch: got %v, exp %v", got, exp)
}
}
}
// Read the delete second
if !r.Next() {
t.Fatalf("expected next, got false")
}
we, err = r.Read()
if err != nil {
fatal(t, "read entry", err)
}
de, ok := we.(*tsm1.DeleteWALEntry)
if !ok {
t.Fatalf("expected DeleteWALEntry: got %#v", e)
}
if got, exp := len(de.Keys), len(deleteEntry.Keys); got != exp {
t.Fatalf("key length mismatch: got %v, exp %v", got, exp)
}
if got, exp := de.Keys[0], deleteEntry.Keys[0]; got != exp {
t.Fatalf("key mismatch: got %v, exp %v", got, exp)
}
}
func TestWALWriter_WriteMultiDeleteRange_Multiple(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w := tsm1.NewWALSegmentWriter(f)
p1 := tsm1.NewValue(1, 1.0)
p2 := tsm1.NewValue(2, 2.0)
p3 := tsm1.NewValue(3, 3.0)
values := map[string][]tsm1.Value{
"cpu,host=A#!~#value": []tsm1.Value{p1, p2, p3},
}
writeEntry := &tsm1.WriteWALEntry{
Values: values,
}
if err := w.Write(mustMarshalEntry(writeEntry)); err != nil {
fatal(t, "write points", err)
}
if err := w.Flush(); err != nil {
fatal(t, "flush", err)
}
// Write the delete entry
deleteEntry := &tsm1.DeleteRangeWALEntry{
Keys: []string{"cpu,host=A#!~value"},
Min: 2,
Max: 3,
}
if err := w.Write(mustMarshalEntry(deleteEntry)); err != nil {
fatal(t, "write points", err)
}
if err := w.Flush(); err != nil {
fatal(t, "flush", err)
}
// Seek back to the beginning of the file for reading
if _, err := f.Seek(0, io.SeekStart); err != nil {
fatal(t, "seek", err)
}
r := tsm1.NewWALSegmentReader(f)
// Read the write points first
if !r.Next() {
t.Fatalf("expected next, got false")
}
we, err := r.Read()
if err != nil {
fatal(t, "read entry", err)
}
e, ok := we.(*tsm1.WriteWALEntry)
if !ok {
t.Fatalf("expected WriteWALEntry: got %#v", e)
}
for k, v := range e.Values {
if got, exp := len(v), len(values[k]); got != exp {
t.Fatalf("values length mismatch: got %v, exp %v", got, exp)
}
for i, vv := range v {
if got, exp := vv.String(), values[k][i].String(); got != exp {
t.Fatalf("points mismatch: got %v, exp %v", got, exp)
}
}
}
// Read the delete second
if !r.Next() {
t.Fatalf("expected next, got false")
}
we, err = r.Read()
if err != nil {
fatal(t, "read entry", err)
}
de, ok := we.(*tsm1.DeleteRangeWALEntry)
if !ok {
t.Fatalf("expected DeleteWALEntry: got %#v", e)
}
if got, exp := len(de.Keys), len(deleteEntry.Keys); got != exp {
t.Fatalf("key length mismatch: got %v, exp %v", got, exp)
}
if got, exp := de.Keys[0], deleteEntry.Keys[0]; got != exp {
t.Fatalf("key mismatch: got %v, exp %v", got, exp)
}
if got, exp := de.Min, int64(2); got != exp {
t.Fatalf("min time mismatch: got %v, exp %v", got, exp)
}
if got, exp := de.Max, int64(3); got != exp {
t.Fatalf("min time mismatch: got %v, exp %v", got, exp)
}
}
func TestWAL_ClosedSegments(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
w := tsm1.NewWAL(dir)
if err := w.Open(); err != nil {
t.Fatalf("error opening WAL: %v", err)
}
files, err := w.ClosedSegments()
if err != nil {
t.Fatalf("error getting closed segments: %v", err)
}
if got, exp := len(files), 0; got != exp {
t.Fatalf("close segment length mismatch: got %v, exp %v", got, exp)
}
if _, err := w.WriteMulti(map[string][]tsm1.Value{
"cpu,host=A#!~#value": []tsm1.Value{
tsm1.NewValue(1, 1.1),
},
}); err != nil {
t.Fatalf("error writing points: %v", err)
}
if err := w.Close(); err != nil {
t.Fatalf("error closing wal: %v", err)
}
// Re-open the WAL
w = tsm1.NewWAL(dir)
defer w.Close()
if err := w.Open(); err != nil {
t.Fatalf("error opening WAL: %v", err)
}
files, err = w.ClosedSegments()
if err != nil {
t.Fatalf("error getting closed segments: %v", err)
}
if got, exp := len(files), 1; got != exp {
t.Fatalf("close segment length mismatch: got %v, exp %v", got, exp)
}
}
func TestWAL_Delete(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
w := tsm1.NewWAL(dir)
if err := w.Open(); err != nil {
t.Fatalf("error opening WAL: %v", err)
}
files, err := w.ClosedSegments()
if err != nil {
t.Fatalf("error getting closed segments: %v", err)
}
if got, exp := len(files), 0; got != exp {
t.Fatalf("close segment length mismatch: got %v, exp %v", got, exp)
}
if _, err := w.Delete([]string{"cpu"}); err != nil {
t.Fatalf("error writing points: %v", err)
}
if err := w.Close(); err != nil {
t.Fatalf("error closing wal: %v", err)
}
// Re-open the WAL
w = tsm1.NewWAL(dir)
defer w.Close()
if err := w.Open(); err != nil {
t.Fatalf("error opening WAL: %v", err)
}
files, err = w.ClosedSegments()
if err != nil {
t.Fatalf("error getting closed segments: %v", err)
}
if got, exp := len(files), 1; got != exp {
t.Fatalf("close segment length mismatch: got %v, exp %v", got, exp)
}
}
func TestWALWriter_Corrupt(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w := tsm1.NewWALSegmentWriter(f)
corruption := []byte{1, 4, 0, 0, 0}
p1 := tsm1.NewValue(1, 1.1)
values := map[string][]tsm1.Value{
"cpu,host=A#!~#float": []tsm1.Value{p1},
}
entry := &tsm1.WriteWALEntry{
Values: values,
}
if err := w.Write(mustMarshalEntry(entry)); err != nil {
fatal(t, "write points", err)
}
if err := w.Flush(); err != nil {
fatal(t, "flush", err)
}
// Write some random bytes to the file to simulate corruption.
if _, err := f.Write(corruption); err != nil {
fatal(t, "corrupt WAL segment", err)
}
// Create the WAL segment reader.
if _, err := f.Seek(0, io.SeekStart); err != nil {
fatal(t, "seek", err)
}
r := tsm1.NewWALSegmentReader(f)
// Try to decode two entries.
if !r.Next() {
t.Fatalf("expected next, got false")
}
if _, err := r.Read(); err != nil {
fatal(t, "read entry", err)
}
if !r.Next() {
t.Fatalf("expected next, got false")
}
if _, err := r.Read(); err == nil {
fatal(t, "read entry did not return err", nil)
}
// Count should only return size of valid data.
expCount := MustReadFileSize(f) - int64(len(corruption))
if n := r.Count(); n != expCount {
t.Fatalf("wrong count of bytes read, got %d, exp %d", n, expCount)
}
}
func TestWriteWALSegment_UnmarshalBinary_WriteWALCorrupt(t *testing.T) {
p1 := tsm1.NewValue(1, 1.1)
p2 := tsm1.NewValue(1, int64(1))
p3 := tsm1.NewValue(1, true)
p4 := tsm1.NewValue(1, "string")
values := map[string][]tsm1.Value{
"cpu,host=A#!~#float": []tsm1.Value{p1, p1},
"cpu,host=A#!~#int": []tsm1.Value{p2, p2},
"cpu,host=A#!~#bool": []tsm1.Value{p3, p3},
"cpu,host=A#!~#string": []tsm1.Value{p4, p4},
}
w := &tsm1.WriteWALEntry{
Values: values,
}
b, err := w.MarshalBinary()
if err != nil {
t.Fatalf("unexpected error, got %v", err)
}
// Test every possible truncation of a write WAL entry
for i := 0; i < len(b); i++ {
// re-allocate to ensure capacity would be exceeded if sliced
truncated := make([]byte, i)
copy(truncated, b[:i])
err := w.UnmarshalBinary(truncated)
if err != nil && err != tsm1.ErrWALCorrupt {
t.Fatalf("unexpected error: %v", err)
}
}
}
func TestWriteWALSegment_UnmarshalBinary_DeleteWALCorrupt(t *testing.T) {
w := &tsm1.DeleteWALEntry{
Keys: []string{"foo", "bar"},
}
b, err := w.MarshalBinary()
if err != nil {
t.Fatalf("unexpected error, got %v", err)
}
// Test every possible truncation of a delete WAL entry
for i := 0; i < len(b); i++ {
// re-allocate to ensure capacity would be exceeded if sliced
truncated := make([]byte, i)
copy(truncated, b[:i])
err := w.UnmarshalBinary(truncated)
if err != nil && err != tsm1.ErrWALCorrupt {
t.Fatalf("unexpected error: %v", err)
}
}
}
func TestWriteWALSegment_UnmarshalBinary_DeleteRangeWALCorrupt(t *testing.T) {
w := &tsm1.DeleteRangeWALEntry{
Keys: []string{"foo", "bar"},
Min: 1,
Max: 2,
}
b, err := w.MarshalBinary()
if err != nil {
t.Fatalf("unexpected error, got %v", err)
}
// Test every possible truncation of a delete range WAL entry
for i := 0; i < len(b); i++ {
// re-allocate to ensure capacity would be exceeded if sliced
truncated := make([]byte, i)
copy(truncated, b[:i])
err := w.UnmarshalBinary(truncated)
if err != nil && err != tsm1.ErrWALCorrupt {
t.Fatalf("unexpected error: %v", err)
}
}
}
func BenchmarkWALSegmentWriter(b *testing.B) {
points := map[string][]tsm1.Value{}
for i := 0; i < 5000; i++ {
k := "cpu,host=A#!~#value"
points[k] = append(points[k], tsm1.NewValue(int64(i), 1.1))
}
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w := tsm1.NewWALSegmentWriter(f)
write := &tsm1.WriteWALEntry{
Values: points,
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
if err := w.Write(mustMarshalEntry(write)); err != nil {
b.Fatalf("unexpected error writing entry: %v", err)
}
}
}
func BenchmarkWALSegmentReader(b *testing.B) {
points := map[string][]tsm1.Value{}
for i := 0; i < 5000; i++ {
k := "cpu,host=A#!~#value"
points[k] = append(points[k], tsm1.NewValue(int64(i), 1.1))
}
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w := tsm1.NewWALSegmentWriter(f)
write := &tsm1.WriteWALEntry{
Values: points,
}
for i := 0; i < 100; i++ {
if err := w.Write(mustMarshalEntry(write)); err != nil {
b.Fatalf("unexpected error writing entry: %v", err)
}
}
r := tsm1.NewWALSegmentReader(f)
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer()
f.Seek(0, io.SeekStart)
b.StartTimer()
for r.Next() {
_, err := r.Read()
if err != nil {
b.Fatalf("unexpected error reading entry: %v", err)
}
}
}
}
// MustReadFileSize returns the size of the file, or panics.
func MustReadFileSize(f *os.File) int64 {
stat, err := os.Stat(f.Name())
if err != nil {
panic(fmt.Sprintf("failed to get size of file at %s: %s", f.Name(), err.Error()))
}
return stat.Size()
}
func mustMarshalEntry(entry tsm1.WALEntry) (tsm1.WalEntryType, []byte) {
bytes := make([]byte, 1024<<2)
b, err := entry.Encode(bytes)
if err != nil {
panic(fmt.Sprintf("error encoding: %v", err))
}
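// Note (editorial): b is passed to snappy.Encode as both destination and
// source. That is safe here because len(b) is always smaller than
// snappy.MaxEncodedLen(len(b)), so Encode allocates a fresh output buffer
// instead of writing over its input.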
return entry.Type(), snappy.Encode(b, b)
}


@@ -0,0 +1,632 @@
package tsm1
/*
A TSM file is composed of four sections: header, blocks, index and the footer.
┌────────┬────────────────────────────────────┬─────────────┬──────────────┐
│ Header │ Blocks │ Index │ Footer │
│5 bytes │ N bytes │ N bytes │ 4 bytes │
└────────┴────────────────────────────────────┴─────────────┴──────────────┘
Header is composed of a magic number to identify the file type and a version
number.
┌───────────────────┐
│ Header │
├─────────┬─────────┤
│ Magic │ Version │
│ 4 bytes │ 1 byte │
└─────────┴─────────┘
Blocks are sequences of pairs of CRC32 and data. The block data is opaque to the
file. The CRC32 is used for block level error detection. The length of the blocks
is stored in the index.
┌───────────────────────────────────────────────────────────┐
│ Blocks │
├───────────────────┬───────────────────┬───────────────────┤
│ Block 1 │ Block 2 │ Block N │
├─────────┬─────────┼─────────┬─────────┼─────────┬─────────┤
│ CRC │ Data │ CRC │ Data │ CRC │ Data │
│ 4 bytes │ N bytes │ 4 bytes │ N bytes │ 4 bytes │ N bytes │
└─────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
Following the blocks is the index for the blocks in the file. The index is
composed of a sequence of index entries ordered lexicographically by key and
then by time. Each index entry starts with a key length and key followed by a
count of the number of blocks for that key. Each block entry is composed of
the min and max time for the block, the offset into the file where the block
is located, and the size of the block.
The index structure can provide efficient access to all blocks as well as the
ability to determine the cost associated with accessing a given key. Given a key
and timestamp, we can determine whether a file contains the block for that
timestamp as well as where that block resides and how much data to read to
retrieve the block. If we know we need to read all or multiple blocks in a
file, we can use the size to determine how much to read in a given IO.
┌────────────────────────────────────────────────────────────────────────────┐
│ Index │
├─────────┬─────────┬──────┬───────┬─────────┬─────────┬────────┬────────┬───┤
│ Key Len │ Key │ Type │ Count │Min Time │Max Time │ Offset │ Size │...│
│ 2 bytes │ N bytes │1 byte│2 bytes│ 8 bytes │ 8 bytes │8 bytes │4 bytes │ │
└─────────┴─────────┴──────┴───────┴─────────┴─────────┴────────┴────────┴───┘
The last section is the footer that stores the offset of the start of the index.
┌─────────┐
│ Footer │
├─────────┤
│Index Ofs│
│ 8 bytes │
└─────────┘
*/
import (
"bufio"
"bytes"
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"os"
"sort"
"sync"
"time"
)
const (
// MagicNumber is written as the first 4 bytes of a data file to
// identify the file as a tsm1 formatted file
MagicNumber uint32 = 0x16D116D1
// Version indicates the version of the TSM file format.
Version byte = 1
// Size in bytes of an index entry
indexEntrySize = 28
// Size in bytes used to store the count of index entries for a key
indexCountSize = 2
// Size in bytes used to store the type of block encoded
indexTypeSize = 1
// Max number of blocks for a given key that can exist in a single file
maxIndexEntries = (1 << (indexCountSize * 8)) - 1
// max length of a key in an index entry (measurement + tags)
maxKeyLength = (1 << (2 * 8)) - 1
)
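// readIndexOffset is an illustrative sketch added for exposition; it is not
// part of the upstream file. It shows how the footer described above is used:
// the final 8 bytes of a TSM file hold the big-endian offset of the start of
// the index section.
func readIndexOffset(b []byte) (int64, error) {
if len(b) < 8 {
return 0, fmt.Errorf("tsm file too small to contain a footer")
}
return int64(binary.BigEndian.Uint64(b[len(b)-8:])), nil
}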
var (
//ErrNoValues is returned when TSMWriter.WriteIndex is called and there are no values to write.
ErrNoValues = fmt.Errorf("no values written")
// ErrTSMClosed is returned when performing an operation against a closed TSM file.
ErrTSMClosed = fmt.Errorf("tsm file closed")
// ErrMaxKeyLengthExceeded is returned when attempting to write a key that is too long.
ErrMaxKeyLengthExceeded = fmt.Errorf("max key length exceeded")
// ErrMaxBlocksExceeded is returned when attempting to write a block past the allowed number.
ErrMaxBlocksExceeded = fmt.Errorf("max blocks exceeded")
)
// TSMWriter writes TSM formatted key and values.
type TSMWriter interface {
// Write writes a new block for key containing the given values. Writes append
// blocks in the order that the Write function is called. The caller is
// responsible for ensuring keys and blocks are sorted appropriately.
// Values are encoded as a full block. The caller is responsible for
// ensuring a fixed number of values are encoded in each block as well as
// ensuring the Values are sorted. The first and last timestamp values are
// used as the minimum and maximum values for the index entry.
Write(key string, values Values) error
// WriteBlock writes a new block for key containing the bytes in block. WriteBlock appends
// blocks in the order that the WriteBlock function is called. The caller is
// responsible for ensuring keys and blocks are sorted appropriately, and that the
// block and index information is correct for the block. The minTime and maxTime
// timestamp values are used as the minimum and maximum values for the index entry.
WriteBlock(key string, minTime, maxTime int64, block []byte) error
// WriteIndex finishes the TSM write streams and writes the index.
WriteIndex() error
// Flush flushes all pending changes to the underlying file resources.
Flush() error
// Close closes any underlying file resources.
Close() error
// Size returns the current size in bytes of the file.
Size() uint32
}
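// A minimal usage sketch (illustrative only; error handling elided): write
// blocks with keys in sorted order, then write the index, then close, as the
// tests below do:
//
// w, _ := NewTSMWriter(f)
// _ = w.Write("cpu", values) // keys must arrive in sorted order
// _ = w.WriteIndex() // finalize the file by writing the index
// _ = w.Close()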
// IndexWriter writes a TSMIndex.
type IndexWriter interface {
// Add records a new block entry for a key in the index.
Add(key string, blockType byte, minTime, maxTime int64, offset int64, size uint32)
// Entries returns all index entries for a key.
Entries(key string) []IndexEntry
// Keys returns the unique set of keys in the index.
Keys() []string
// KeyCount returns the count of unique keys in the index.
KeyCount() int
// Size returns the size of the current index in bytes.
Size() uint32
// MarshalBinary returns a byte slice encoded version of the index.
MarshalBinary() ([]byte, error)
// WriteTo writes the index contents to a writer.
WriteTo(w io.Writer) (int64, error)
}
// IndexEntry is the index information for a given block in a TSM file.
type IndexEntry struct {
// The min and max time of all points stored in the block.
MinTime, MaxTime int64
// The absolute position in the file where this block is located.
Offset int64
// The size in bytes of the block in the file.
Size uint32
}
// UnmarshalBinary decodes an IndexEntry from a byte slice.
func (e *IndexEntry) UnmarshalBinary(b []byte) error {
if len(b) != indexEntrySize {
return fmt.Errorf("unmarshalBinary: short buf: %v != %v", indexEntrySize, len(b))
}
e.MinTime = int64(binary.BigEndian.Uint64(b[:8]))
e.MaxTime = int64(binary.BigEndian.Uint64(b[8:16]))
e.Offset = int64(binary.BigEndian.Uint64(b[16:24]))
e.Size = binary.BigEndian.Uint32(b[24:28])
return nil
}
// AppendTo writes a binary-encoded version of IndexEntry to b, allocating
// and returning a new slice, if necessary.
func (e *IndexEntry) AppendTo(b []byte) []byte {
if len(b) < indexEntrySize {
if cap(b) < indexEntrySize {
b = make([]byte, indexEntrySize)
} else {
b = b[:indexEntrySize]
}
}
binary.BigEndian.PutUint64(b[:8], uint64(e.MinTime))
binary.BigEndian.PutUint64(b[8:16], uint64(e.MaxTime))
binary.BigEndian.PutUint64(b[16:24], uint64(e.Offset))
binary.BigEndian.PutUint32(b[24:28], uint32(e.Size))
return b
}
// Contains returns true if this IndexEntry may contain values for the given time.
// The min and max times are inclusive.
func (e *IndexEntry) Contains(t int64) bool {
return e.MinTime <= t && e.MaxTime >= t
}
// OverlapsTimeRange returns true if the entry's time range overlaps the given min and max times.
func (e *IndexEntry) OverlapsTimeRange(min, max int64) bool {
return e.MinTime <= max && e.MaxTime >= min
}
// String returns a string representation of the entry.
func (e *IndexEntry) String() string {
return fmt.Sprintf("min=%s max=%s ofs=%d siz=%d",
time.Unix(0, e.MinTime).UTC(), time.Unix(0, e.MaxTime).UTC(), e.Offset, e.Size)
}
// NewIndexWriter returns a new IndexWriter.
func NewIndexWriter() IndexWriter {
return &directIndex{
blocks: map[string]*indexEntries{},
}
}
// directIndex is a simple in-memory index implementation for a TSM file. The full index
// must fit in memory.
type directIndex struct {
mu sync.RWMutex
size uint32
blocks map[string]*indexEntries
}
func (d *directIndex) Add(key string, blockType byte, minTime, maxTime int64, offset int64, size uint32) {
d.mu.Lock()
defer d.mu.Unlock()
entries := d.blocks[key]
if entries == nil {
entries = &indexEntries{
Type: blockType,
}
d.blocks[key] = entries
// size of the key stored in the index
d.size += uint32(2 + len(key))
// size of the count of entries stored in the index
d.size += indexCountSize
}
entries.entries = append(entries.entries, IndexEntry{
MinTime: minTime,
MaxTime: maxTime,
Offset: offset,
Size: size,
})
// size of the encoded index entry
d.size += indexEntrySize
}
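// entries returns the index entries for key. Callers must hold the read lock.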
func (d *directIndex) entries(key string) []IndexEntry {
entries := d.blocks[key]
if entries == nil {
return nil
}
return entries.entries
}
func (d *directIndex) Entries(key string) []IndexEntry {
d.mu.RLock()
defer d.mu.RUnlock()
return d.entries(key)
}
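// Entry returns the first index entry for key that contains the time t, or nil if none does.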
func (d *directIndex) Entry(key string, t int64) *IndexEntry {
d.mu.RLock()
defer d.mu.RUnlock()
entries := d.entries(key)
for _, entry := range entries {
if entry.Contains(t) {
return &entry
}
}
return nil
}
func (d *directIndex) Keys() []string {
d.mu.RLock()
defer d.mu.RUnlock()
var keys []string
for k := range d.blocks {
keys = append(keys, k)
}
sort.Strings(keys)
return keys
}
func (d *directIndex) KeyCount() int {
d.mu.RLock()
n := len(d.blocks)
d.mu.RUnlock()
return n
}
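// addEntries merges entries for key into the index. Callers must hold the write lock.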
func (d *directIndex) addEntries(key string, entries *indexEntries) {
existing := d.blocks[key]
if existing == nil {
d.blocks[key] = entries
return
}
existing.entries = append(existing.entries, entries.entries...)
}
func (d *directIndex) WriteTo(w io.Writer) (int64, error) {
d.mu.RLock()
defer d.mu.RUnlock()
// Index blocks are written sorted by key
keys := make([]string, 0, len(d.blocks))
for k := range d.blocks {
keys = append(keys, k)
}
sort.Strings(keys)
var (
n int
err error
buf [5]byte
N int64
)
// For each key, individual entries are sorted by time
for _, key := range keys {
entries := d.blocks[key]
if entries.Len() > maxIndexEntries {
return N, fmt.Errorf("key '%s' exceeds max index entries: %d > %d", key, entries.Len(), maxIndexEntries)
}
sort.Sort(entries)
binary.BigEndian.PutUint16(buf[0:2], uint16(len(key)))
buf[2] = entries.Type
binary.BigEndian.PutUint16(buf[3:5], uint16(entries.Len()))
// Append the key length and key
if n, err = w.Write(buf[0:2]); err != nil {
return int64(n) + N, fmt.Errorf("write: writer key length error: %v", err)
}
N += int64(n)
if n, err = io.WriteString(w, key); err != nil {
return int64(n) + N, fmt.Errorf("write: writer key error: %v", err)
}
N += int64(n)
// Append the block type and count
if n, err = w.Write(buf[2:5]); err != nil {
return int64(n) + N, fmt.Errorf("write: writer block type and count error: %v", err)
}
N += int64(n)
// Append each index entry for all blocks for this key
var n64 int64
if n64, err = entries.WriteTo(w); err != nil {
return n64 + N, fmt.Errorf("write: writer entries error: %v", err)
}
N += n64
}
return N, nil
}
func (d *directIndex) MarshalBinary() ([]byte, error) {
var b bytes.Buffer
if _, err := d.WriteTo(&b); err != nil {
return nil, err
}
return b.Bytes(), nil
}
func (d *directIndex) UnmarshalBinary(b []byte) error {
d.mu.Lock()
defer d.mu.Unlock()
d.size = uint32(len(b))
var pos int
for pos < len(b) {
n, key, err := readKey(b[pos:])
if err != nil {
return fmt.Errorf("readIndex: read key error: %v", err)
}
pos += n
var entries indexEntries
n, err = readEntries(b[pos:], &entries)
if err != nil {
return fmt.Errorf("readIndex: read entries error: %v", err)
}
pos += n
d.addEntries(string(key), &entries)
}
return nil
}
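// Size returns the encoded size of the index in bytes.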
func (d *directIndex) Size() uint32 {
return d.size
}
// tsmWriter writes keys and values in the TSM format
type tsmWriter struct {
wrapped io.Writer
w *bufio.Writer
index IndexWriter
n int64
}
// NewTSMWriter returns a new TSMWriter writing to w.
func NewTSMWriter(w io.Writer) (TSMWriter, error) {
index := &directIndex{
blocks: map[string]*indexEntries{},
}
return &tsmWriter{wrapped: w, w: bufio.NewWriterSize(w, 1024*1024), index: index}, nil
}
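// writeHeader writes the magic number and version byte to the buffered writer
// and records the number of bytes written.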
func (t *tsmWriter) writeHeader() error {
var buf [5]byte
binary.BigEndian.PutUint32(buf[0:4], MagicNumber)
buf[4] = Version
n, err := t.w.Write(buf[:])
if err != nil {
return err
}
t.n = int64(n)
return nil
}
// Write writes a new block containing key and values.
func (t *tsmWriter) Write(key string, values Values) error {
if len(key) > maxKeyLength {
return ErrMaxKeyLengthExceeded
}
// Nothing to write
if len(values) == 0 {
return nil
}
// Write header only after we have some data to write.
if t.n == 0 {
if err := t.writeHeader(); err != nil {
return err
}
}
block, err := values.Encode(nil)
if err != nil {
return err
}
blockType, err := BlockType(block)
if err != nil {
return err
}
var checksum [crc32.Size]byte
binary.BigEndian.PutUint32(checksum[:], crc32.ChecksumIEEE(block))
_, err = t.w.Write(checksum[:])
if err != nil {
return err
}
n, err := t.w.Write(block)
if err != nil {
return err
}
n += len(checksum)
// Record this block in index
t.index.Add(key, blockType, values[0].UnixNano(), values[len(values)-1].UnixNano(), t.n, uint32(n))
// Increment file position pointer
t.n += int64(n)
return nil
}
// WriteBlock writes block for the given key and time range to the TSM file. If the write
// exceeds max entries for a given key, ErrMaxBlocksExceeded is returned. This indicates
// that the index is now full for this key and no future writes to this key will succeed.
func (t *tsmWriter) WriteBlock(key string, minTime, maxTime int64, block []byte) error {
if len(key) > maxKeyLength {
return ErrMaxKeyLengthExceeded
}
// Nothing to write
if len(block) == 0 {
return nil
}
blockType, err := BlockType(block)
if err != nil {
return err
}
// Write header only after we have some data to write.
if t.n == 0 {
if err := t.writeHeader(); err != nil {
return err
}
}
var checksum [crc32.Size]byte
binary.BigEndian.PutUint32(checksum[:], crc32.ChecksumIEEE(block))
_, err = t.w.Write(checksum[:])
if err != nil {
return err
}
n, err := t.w.Write(block)
if err != nil {
return err
}
n += len(checksum)
// Record this block in index
t.index.Add(key, blockType, minTime, maxTime, t.n, uint32(n))
// Increment file position pointer (checksum + block len)
t.n += int64(n)
if len(t.index.Entries(key)) >= maxIndexEntries {
return ErrMaxBlocksExceeded
}
return nil
}
// WriteIndex writes the index section of the file. If there are no index entries to write,
// this returns ErrNoValues.
func (t *tsmWriter) WriteIndex() error {
indexPos := t.n
if t.index.KeyCount() == 0 {
return ErrNoValues
}
// Write the index
if _, err := t.index.WriteTo(t.w); err != nil {
return err
}
var buf [8]byte
binary.BigEndian.PutUint64(buf[:], uint64(indexPos))
// Write the index position as the file footer
_, err := t.w.Write(buf[:])
return err
}
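// Flush flushes any buffered data to the underlying writer and, if the
// wrapped writer is an *os.File, syncs it to disk.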
func (t *tsmWriter) Flush() error {
if err := t.w.Flush(); err != nil {
return err
}
if f, ok := t.wrapped.(*os.File); ok {
if err := f.Sync(); err != nil {
return err
}
}
return nil
}
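// Close flushes pending writes and closes the underlying writer when it
// implements io.Closer.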
func (t *tsmWriter) Close() error {
if err := t.Flush(); err != nil {
return err
}
if c, ok := t.wrapped.(io.Closer); ok {
return c.Close()
}
return nil
}
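// Size returns the bytes written to the file so far plus the current size of
// the in-memory index.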
func (t *tsmWriter) Size() uint32 {
return uint32(t.n) + t.index.Size()
}
// verifyVersion verifies that the reader's bytes are a TSM byte
// stream of the correct version (1)
func verifyVersion(r io.ReadSeeker) error {
_, err := r.Seek(0, 0)
if err != nil {
return fmt.Errorf("init: failed to seek: %v", err)
}
var b [4]byte
_, err = io.ReadFull(r, b[:])
if err != nil {
return fmt.Errorf("init: error reading magic number of file: %v", err)
}
if binary.BigEndian.Uint32(b[:]) != MagicNumber {
return fmt.Errorf("can only read from tsm file")
}
_, err = io.ReadFull(r, b[:1])
if err != nil {
return fmt.Errorf("init: error reading version: %v", err)
}
if b[0] != Version {
return fmt.Errorf("init: file is version %b. expected %b", b[0], Version)
}
return nil
}


@@ -0,0 +1,653 @@
package tsm1_test
import (
"bytes"
"encoding/binary"
"io"
"io/ioutil"
"os"
"testing"
"github.com/influxdata/influxdb/tsdb/engine/tsm1"
)
func TestTSMWriter_Write_Empty(t *testing.T) {
var b bytes.Buffer
w, err := tsm1.NewTSMWriter(&b)
if err != nil {
t.Fatalf("unexpected error created writer: %v", err)
}
if err := w.WriteIndex(); err != tsm1.ErrNoValues {
t.Fatalf("unexpected error closing: %v", err)
}
if got, exp := len(b.Bytes()), 0; got > exp {
t.Fatalf("file size mismatch: got %v, exp %v", got, exp)
}
}
func TestTSMWriter_Write_NoValues(t *testing.T) {
var b bytes.Buffer
w, err := tsm1.NewTSMWriter(&b)
if err != nil {
t.Fatalf("unexpected error created writer: %v", err)
}
if err := w.Write("foo", []tsm1.Value{}); err != nil {
t.Fatalf("unexpected error writing: %v", err)
}
if err := w.WriteIndex(); err != tsm1.ErrNoValues {
t.Fatalf("unexpected error closing: %v", err)
}
if got, exp := len(b.Bytes()), 0; got > exp {
t.Fatalf("file size mismatch: got %v, exp %v", got, exp)
}
}
func TestTSMWriter_Write_Single(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w, err := tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error creating writer: %v", err)
}
values := []tsm1.Value{tsm1.NewValue(0, 1.0)}
if err := w.Write("cpu", values); err != nil {
t.Fatalf("unexpected error writing: %v", err)
}
if err := w.WriteIndex(); err != nil {
t.Fatalf("unexpected error writing index: %v", err)
}
if err := w.Close(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
fd, err := os.Open(f.Name())
if err != nil {
t.Fatalf("unexpected error open file: %v", err)
}
b, err := ioutil.ReadAll(fd)
if err != nil {
t.Fatalf("unexpected error reading: %v", err)
}
if got, exp := len(b), 5; got < exp {
t.Fatalf("file size mismatch: got %v, exp %v", got, exp)
}
if got := binary.BigEndian.Uint32(b[0:4]); got != tsm1.MagicNumber {
t.Fatalf("magic number mismatch: got %v, exp %v", got, tsm1.MagicNumber)
}
if _, err := fd.Seek(0, io.SeekStart); err != nil {
t.Fatalf("unexpected error seeking: %v", err)
}
r, err := tsm1.NewTSMReader(fd)
if err != nil {
t.Fatalf("unexpected error created reader: %v", err)
}
defer r.Close()
readValues, err := r.ReadAll("cpu")
if err != nil {
t.Fatalf("unexpected error readin: %v", err)
}
if len(readValues) != len(values) {
t.Fatalf("read values length mismatch: got %v, exp %v", len(readValues), len(values))
}
for i, v := range values {
if v.Value() != readValues[i].Value() {
t.Fatalf("read value mismatch(%d): got %v, exp %d", i, readValues[i].Value(), v.Value())
}
}
}
func TestTSMWriter_Write_Multiple(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w, err := tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error creating writer: %v", err)
}
var data = []struct {
key string
values []tsm1.Value
}{
{"cpu", []tsm1.Value{tsm1.NewValue(0, 1.0)}},
{"mem", []tsm1.Value{tsm1.NewValue(1, 2.0)}},
}
for _, d := range data {
if err := w.Write(d.key, d.values); err != nil {
t.Fatalf("unexpected error writing: %v", err)
}
}
if err := w.WriteIndex(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
if err := w.Close(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
fd, err := os.Open(f.Name())
if err != nil {
t.Fatalf("unexpected error open file: %v", err)
}
r, err := tsm1.NewTSMReader(fd)
if err != nil {
t.Fatalf("unexpected error created reader: %v", err)
}
defer r.Close()
for _, d := range data {
readValues, err := r.ReadAll(d.key)
if err != nil {
t.Fatalf("unexpected error readin: %v", err)
}
if exp := len(d.values); exp != len(readValues) {
t.Fatalf("read values length mismatch: got %v, exp %v", len(readValues), exp)
}
for i, v := range d.values {
if v.Value() != readValues[i].Value() {
t.Fatalf("read value mismatch(%d): got %v, exp %d", i, readValues[i].Value(), v.Value())
}
}
}
}
func TestTSMWriter_Write_MultipleKeyValues(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w, err := tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error creating writer: %v", err)
}
var data = []struct {
key string
values []tsm1.Value
}{
{"cpu", []tsm1.Value{
tsm1.NewValue(0, 1.0),
tsm1.NewValue(1, 2.0)},
},
{"mem", []tsm1.Value{
tsm1.NewValue(0, 1.5),
tsm1.NewValue(1, 2.5)},
},
}
for _, d := range data {
if err := w.Write(d.key, d.values); err != nil {
t.Fatalf("unexpected error writing: %v", err)
}
}
if err := w.WriteIndex(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
if err := w.Close(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
fd, err := os.Open(f.Name())
if err != nil {
t.Fatalf("unexpected error open file: %v", err)
}
r, err := tsm1.NewTSMReader(fd)
if err != nil {
t.Fatalf("unexpected error created reader: %v", err)
}
defer r.Close()
for _, d := range data {
readValues, err := r.ReadAll(d.key)
if err != nil {
t.Fatalf("unexpected error readin: %v", err)
}
if exp := len(d.values); exp != len(readValues) {
t.Fatalf("read values length mismatch: got %v, exp %v", len(readValues), exp)
}
for i, v := range d.values {
if v.Value() != readValues[i].Value() {
t.Fatalf("read value mismatch(%d): got %v, exp %d", i, readValues[i].Value(), v.Value())
}
}
}
}
// Tests that keys written in reverse order can be read back.
func TestTSMWriter_Write_ReverseKeys(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w, err := tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error creating writer: %v", err)
}
var data = []struct {
key string
values []tsm1.Value
}{
{"mem", []tsm1.Value{
tsm1.NewValue(0, 1.5),
tsm1.NewValue(1, 2.5)},
},
{"cpu", []tsm1.Value{
tsm1.NewValue(0, 1.0),
tsm1.NewValue(1, 2.0)},
},
}
for _, d := range data {
if err := w.Write(d.key, d.values); err != nil {
t.Fatalf("unexpected error writing: %v", err)
}
}
if err := w.WriteIndex(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
if err := w.Close(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
fd, err := os.Open(f.Name())
if err != nil {
t.Fatalf("unexpected error open file: %v", err)
}
r, err := tsm1.NewTSMReader(fd)
if err != nil {
t.Fatalf("unexpected error created reader: %v", err)
}
defer r.Close()
for _, d := range data {
readValues, err := r.ReadAll(d.key)
if err != nil {
t.Fatalf("unexpected error readin: %v", err)
}
if exp := len(d.values); exp != len(readValues) {
t.Fatalf("read values length mismatch: got %v, exp %v", len(readValues), exp)
}
for i, v := range d.values {
if v.Value() != readValues[i].Value() {
t.Fatalf("read value mismatch(%d): got %v, exp %d", i, readValues[i].Value(), v.Value())
}
}
}
}
// Tests that multiple writes to the same key can be read back in full.
func TestTSMWriter_Write_SameKey(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w, err := tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error creating writer: %v", err)
}
var data = []struct {
key string
values []tsm1.Value
}{
{"cpu", []tsm1.Value{
tsm1.NewValue(0, 1.0),
tsm1.NewValue(1, 2.0)},
},
{"cpu", []tsm1.Value{
tsm1.NewValue(2, 3.0),
tsm1.NewValue(3, 4.0)},
},
}
for _, d := range data {
if err := w.Write(d.key, d.values); err != nil {
t.Fatalf("unexpected error writing: %v", err)
}
}
if err := w.WriteIndex(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
if err := w.Close(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
fd, err := os.Open(f.Name())
if err != nil {
t.Fatalf("unexpected error open file: %v", err)
}
r, err := tsm1.NewTSMReader(fd)
if err != nil {
t.Fatalf("unexpected error created reader: %v", err)
}
defer r.Close()
values := append(data[0].values, data[1].values...)
readValues, err := r.ReadAll("cpu")
if err != nil {
t.Fatalf("unexpected error readin: %v", err)
}
if exp := len(values); exp != len(readValues) {
t.Fatalf("read values length mismatch: got %v, exp %v", len(readValues), exp)
}
for i, v := range values {
if v.Value() != readValues[i].Value() {
t.Fatalf("read value mismatch(%d): got %v, exp %d", i, readValues[i].Value(), v.Value())
}
}
}
// Tests that calling Read returns all the values for the block matching the key
// and timestamp
func TestTSMWriter_Read_Multiple(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w, err := tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error creating writer: %v", err)
}
var data = []struct {
key string
values []tsm1.Value
}{
{"cpu", []tsm1.Value{
tsm1.NewValue(0, 1.0),
tsm1.NewValue(1, 2.0)},
},
{"cpu", []tsm1.Value{
tsm1.NewValue(2, 3.0),
tsm1.NewValue(3, 4.0)},
},
}
for _, d := range data {
if err := w.Write(d.key, d.values); err != nil {
t.Fatalf("unexpected error writing: %v", err)
}
}
if err := w.WriteIndex(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
if err := w.Close(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
fd, err := os.Open(f.Name())
if err != nil {
t.Fatalf("unexpected error open file: %v", err)
}
r, err := tsm1.NewTSMReader(fd)
if err != nil {
t.Fatalf("unexpected error created reader: %v", err)
}
defer r.Close()
for _, values := range data {
// Try the first timestamp
readValues, err := r.Read("cpu", values.values[0].UnixNano())
if err != nil {
t.Fatalf("unexpected error readin: %v", err)
}
if exp := len(values.values); exp != len(readValues) {
t.Fatalf("read values length mismatch: got %v, exp %v", len(readValues), exp)
}
for i, v := range values.values {
if v.Value() != readValues[i].Value() {
t.Fatalf("read value mismatch(%d): got %v, exp %d", i, readValues[i].Value(), v.Value())
}
}
// Try the last timestamp too
readValues, err = r.Read("cpu", values.values[1].UnixNano())
if err != nil {
t.Fatalf("unexpected error readin: %v", err)
}
if exp := len(values.values); exp != len(readValues) {
t.Fatalf("read values length mismatch: got %v, exp %v", len(readValues), exp)
}
for i, v := range values.values {
if v.Value() != readValues[i].Value() {
t.Fatalf("read value mismatch(%d): got %v, exp %d", i, readValues[i].Value(), v.Value())
}
}
}
}
func TestTSMWriter_WriteBlock_Empty(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w, err := tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error creating writer: %v", err)
}
if err := w.WriteBlock("cpu", 0, 0, nil); err != nil {
t.Fatalf("unexpected error writing block: %v", err)
}
if err := w.WriteIndex(); err != tsm1.ErrNoValues {
t.Fatalf("unexpected error closing: %v", err)
}
fd, err := os.Open(f.Name())
if err != nil {
t.Fatalf("unexpected error open file: %v", err)
}
defer fd.Close()
b, err := ioutil.ReadAll(fd)
if err != nil {
t.Fatalf("unexpected error read all: %v", err)
}
if got, exp := len(b), 0; got > exp {
t.Fatalf("file size mismatch: got %v, exp %v", got, exp)
}
}
func TestTSMWriter_WriteBlock_Multiple(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w, err := tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error creating writer: %v", err)
}
var data = []struct {
key string
values []tsm1.Value
}{
{"cpu", []tsm1.Value{tsm1.NewValue(0, 1.0)}},
{"mem", []tsm1.Value{tsm1.NewValue(1, 2.0)}},
}
for _, d := range data {
if err := w.Write(d.key, d.values); err != nil {
t.Fatalf("unexpected error writing: %v", err)
}
}
if err := w.WriteIndex(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
if err := w.Close(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
fd, err := os.Open(f.Name())
if err != nil {
t.Fatalf("unexpected error open file: %v", err)
}
defer fd.Close()
b, err := ioutil.ReadAll(fd)
if err != nil {
t.Fatalf("unexpected error read all: %v", err)
}
if got, exp := len(b), 5; got < exp {
t.Fatalf("file size mismatch: got %v, exp %v", got, exp)
}
if got := binary.BigEndian.Uint32(b[0:4]); got != tsm1.MagicNumber {
t.Fatalf("magic number mismatch: got %v, exp %v", got, tsm1.MagicNumber)
}
if _, err := fd.Seek(0, io.SeekStart); err != nil {
t.Fatalf("error seeking: %v", err)
}
// Create reader for that file
r, err := tsm1.NewTSMReader(fd)
if err != nil {
t.Fatalf("unexpected error created reader: %v", err)
}
f = MustTempFile(dir)
w, err = tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error creating writer: %v", err)
}
iter := r.BlockIterator()
for iter.Next() {
key, minTime, maxTime, _, _, b, err := iter.Read()
if err != nil {
t.Fatalf("unexpected error reading block: %v", err)
}
if err := w.WriteBlock(key, minTime, maxTime, b); err != nil {
t.Fatalf("unexpected error writing block: %v", err)
}
}
if err := w.WriteIndex(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
if err := w.Close(); err != nil {
t.Fatalf("unexpected error closing: %v", err)
}
fd, err = os.Open(f.Name())
if err != nil {
t.Fatalf("unexpected error open file: %v", err)
}
// Now create a reader to verify the written blocks matches the originally
// written file using Write
r, err = tsm1.NewTSMReader(fd)
if err != nil {
t.Fatalf("unexpected error created reader: %v", err)
}
defer r.Close()
for _, d := range data {
readValues, err := r.ReadAll(d.key)
if err != nil {
t.Fatalf("unexpected error readin: %v", err)
}
if exp := len(d.values); exp != len(readValues) {
t.Fatalf("read values length mismatch: got %v, exp %v", len(readValues), exp)
}
for i, v := range d.values {
if v.Value() != readValues[i].Value() {
t.Fatalf("read value mismatch(%d): got %v, exp %d", i, readValues[i].Value(), v.Value())
}
}
}
}
func TestTSMWriter_WriteBlock_MaxKey(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
w, err := tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error creating writer: %v", err)
}
var key string
for i := 0; i < 100000; i++ {
key += "a"
}
if err := w.WriteBlock(key, 0, 0, nil); err != tsm1.ErrMaxKeyLengthExceeded {
t.Fatalf("expected max key length error writing key: %v", err)
}
}
func TestTSMWriter_Write_MaxKey(t *testing.T) {
dir := MustTempDir()
defer os.RemoveAll(dir)
f := MustTempFile(dir)
defer f.Close()
w, err := tsm1.NewTSMWriter(f)
if err != nil {
t.Fatalf("unexpected error created writer: %v", err)
}
var key string
for i := 0; i < 100000; i++ {
key += "a"
}
if err := w.Write(key, []tsm1.Value{tsm1.NewValue(0, 1.0)}); err != tsm1.ErrMaxKeyLengthExceeded {
t.Fatalf("expected max key length error writing key: %v", err)
}
}

129
vendor/github.com/influxdata/influxdb/tsdb/index.go generated vendored Normal file

@@ -0,0 +1,129 @@
package tsdb
import (
"fmt"
"os"
"regexp"
"sort"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/pkg/estimator"
"github.com/uber-go/zap"
)
type Index interface {
Open() error
Close() error
WithLogger(zap.Logger)
MeasurementExists(name []byte) (bool, error)
MeasurementNamesByExpr(expr influxql.Expr) ([][]byte, error)
MeasurementNamesByRegex(re *regexp.Regexp) ([][]byte, error)
DropMeasurement(name []byte) error
ForEachMeasurementName(fn func(name []byte) error) error
InitializeSeries(key, name []byte, tags models.Tags) error
CreateSeriesIfNotExists(key, name []byte, tags models.Tags) error
CreateSeriesListIfNotExists(keys, names [][]byte, tags []models.Tags) error
DropSeries(key []byte) error
SeriesSketches() (estimator.Sketch, estimator.Sketch, error)
MeasurementsSketches() (estimator.Sketch, estimator.Sketch, error)
SeriesN() int64
HasTagKey(name, key []byte) (bool, error)
TagSets(name []byte, options influxql.IteratorOptions) ([]*influxql.TagSet, error)
MeasurementTagKeysByExpr(name []byte, expr influxql.Expr) (map[string]struct{}, error)
MeasurementTagKeyValuesByExpr(name []byte, keys []string, expr influxql.Expr, keysSorted bool) ([][]string, error)
ForEachMeasurementTagKey(name []byte, fn func(key []byte) error) error
TagKeyCardinality(name, key []byte) int
// InfluxQL system iterators
MeasurementSeriesKeysByExpr(name []byte, condition influxql.Expr) ([][]byte, error)
ForEachMeasurementSeriesByExpr(name []byte, expr influxql.Expr, fn func(tags models.Tags) error) error
SeriesPointIterator(opt influxql.IteratorOptions) (influxql.Iterator, error)
// Sets a shared fieldset from the engine.
SetFieldSet(fs *MeasurementFieldSet)
// Creates hard links inside path for snapshotting.
SnapshotTo(path string) error
// To be removed w/ tsi1.
SetFieldName(measurement []byte, name string)
AssignShard(k string, shardID uint64)
UnassignShard(k string, shardID uint64) error
RemoveShard(shardID uint64)
Type() string
}
// IndexFormat represents the format for an index.
type IndexFormat int
const (
// InMemFormat is the format used by the original in-memory shared index.
InMemFormat IndexFormat = 1
// TSI1Format is the format used by the tsi1 index.
TSI1Format IndexFormat = 2
)
// NewIndexFunc creates a new index.
type NewIndexFunc func(id uint64, database, path string, options EngineOptions) Index
// newIndexFuncs is a lookup of index constructors by name.
var newIndexFuncs = make(map[string]NewIndexFunc)
// RegisterIndex registers a storage index initializer by name.
func RegisterIndex(name string, fn NewIndexFunc) {
if _, ok := newIndexFuncs[name]; ok {
panic("index already registered: " + name)
}
newIndexFuncs[name] = fn
}
// RegisteredIndexes returns the slice of currently registered indexes.
func RegisteredIndexes() []string {
a := make([]string, 0, len(newIndexFuncs))
for k := range newIndexFuncs {
a = append(a, k)
}
sort.Strings(a)
return a
}
// NewIndex returns an instance of an index based on its format.
// If the path does not exist then the DefaultFormat is used.
func NewIndex(id uint64, database, path string, options EngineOptions) (Index, error) {
format := options.IndexVersion
// Use the default format unless the index directory already exists, in which case it is a tsi1 index.
_, err := os.Stat(path)
if os.IsNotExist(err) {
// nop, use default
} else if err != nil {
return nil, err
} else if err == nil {
format = "tsi1"
}
// Lookup index by format.
fn := newIndexFuncs[format]
if fn == nil {
return nil, fmt.Errorf("invalid index format: %q", format)
}
return fn(id, database, path, options), nil
}
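// MustOpenIndex returns a new, open index, panicking if it cannot be created or opened.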
func MustOpenIndex(id uint64, database, path string, options EngineOptions) Index {
idx, err := NewIndex(id, database, path, options)
if err != nil {
panic(err)
} else if err := idx.Open(); err != nil {
panic(err)
}
return idx
}


@@ -0,0 +1,6 @@
package index // import "github.com/influxdata/influxdb/tsdb/index"
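// The blank imports below register the inmem and tsi1 index implementations
// with tsdb via their package init functions.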
import (
_ "github.com/influxdata/influxdb/tsdb/index/inmem"
_ "github.com/influxdata/influxdb/tsdb/index/tsi1"
)


@@ -0,0 +1,988 @@
/*
Package inmem implements a shared, in-memory index for each database.
The in-memory index is the original index implementation and provides fast
access to index data. However, it also forces high memory usage for large
datasets and can cause OOM errors.
Index is the shared index structure that provides most of the functionality.
However, ShardIndex is a light per-shard wrapper that adapts this original
shared index format to the new per-shard format.
*/
package inmem
import (
"errors"
"fmt"
"regexp"
"sort"
"sync"
// "sync/atomic"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/pkg/bytesutil"
"github.com/influxdata/influxdb/pkg/escape"
"github.com/influxdata/influxdb/pkg/estimator"
"github.com/influxdata/influxdb/pkg/estimator/hll"
"github.com/influxdata/influxdb/tsdb"
"github.com/uber-go/zap"
)
// IndexName is the name of this index.
const IndexName = "inmem"
func init() {
tsdb.NewInmemIndex = func(name string) (interface{}, error) { return NewIndex(name), nil }
tsdb.RegisterIndex(IndexName, func(id uint64, database, path string, opt tsdb.EngineOptions) tsdb.Index {
return NewShardIndex(id, database, path, opt)
})
}
// Index is the in memory index of a collection of measurements, time
// series, and their tags. Exported functions are goroutine safe while
// un-exported functions assume the caller will use the appropriate locks.
type Index struct {
mu sync.RWMutex
database string
// In-memory metadata index, built on load and updated when new series come in
measurements map[string]*Measurement // measurement name to object and index
series map[string]*Series // map series key to the Series object
lastID uint64 // last used series ID; IDs exist in memory only for this shard
seriesSketch, seriesTSSketch *hll.Plus
measurementsSketch, measurementsTSSketch *hll.Plus
}
// NewIndex returns a new initialized Index.
func NewIndex(database string) *Index {
index := &Index{
database: database,
measurements: make(map[string]*Measurement),
series: make(map[string]*Series),
}
index.seriesSketch = hll.NewDefaultPlus()
index.seriesTSSketch = hll.NewDefaultPlus()
index.measurementsSketch = hll.NewDefaultPlus()
index.measurementsTSSketch = hll.NewDefaultPlus()
return index
}
func (i *Index) Type() string { return IndexName }
func (i *Index) Open() (err error) { return nil }
func (i *Index) Close() error { return nil }
func (i *Index) WithLogger(zap.Logger) {}
// Series returns a series by key.
func (i *Index) Series(key []byte) (*Series, error) {
i.mu.RLock()
s := i.series[string(key)]
i.mu.RUnlock()
return s, nil
}
// SeriesSketches returns the sketches for the series.
func (i *Index) SeriesSketches() (estimator.Sketch, estimator.Sketch, error) {
i.mu.RLock()
defer i.mu.RUnlock()
return i.seriesSketch.Clone(), i.seriesTSSketch.Clone(), nil
}
// SeriesN returns the number of unique non-tombstoned series in the index.
// Since indexes are not shared across shards, the count returned by SeriesN
// cannot be combined with other shards' counts.
func (i *Index) SeriesN() int64 {
i.mu.RLock()
n := int64(len(i.series))
i.mu.RUnlock()
return n
}
// Measurement returns the measurement object from the index by the name
func (i *Index) Measurement(name []byte) (*Measurement, error) {
i.mu.RLock()
defer i.mu.RUnlock()
return i.measurements[string(name)], nil
}
// MeasurementExists returns true if the measurement exists.
func (i *Index) MeasurementExists(name []byte) (bool, error) {
i.mu.RLock()
defer i.mu.RUnlock()
return i.measurements[string(name)] != nil, nil
}
// MeasurementsSketches returns the sketches for the measurements.
func (i *Index) MeasurementsSketches() (estimator.Sketch, estimator.Sketch, error) {
i.mu.RLock()
defer i.mu.RUnlock()
return i.measurementsSketch.Clone(), i.measurementsTSSketch.Clone(), nil
}
// MeasurementsByName returns a list of measurements.
func (i *Index) MeasurementsByName(names [][]byte) ([]*Measurement, error) {
i.mu.RLock()
defer i.mu.RUnlock()
a := make([]*Measurement, 0, len(names))
for _, name := range names {
if m := i.measurements[string(name)]; m != nil {
a = append(a, m)
}
}
return a, nil
}
// CreateSeriesIfNotExists adds the series for the given measurement to the
// index and sets its ID, or returns the existing series object.
func (i *Index) CreateSeriesIfNotExists(shardID uint64, key, name []byte, tags models.Tags, opt *tsdb.EngineOptions, ignoreLimits bool) error {
i.mu.RLock()
// if there is a series for this id, it's already been added
ss := i.series[string(key)]
i.mu.RUnlock()
if ss != nil {
ss.AssignShard(shardID)
return nil
}
// get or create the measurement index
m := i.CreateMeasurementIndexIfNotExists(name)
i.mu.Lock()
// Check for the series again under a write lock
ss = i.series[string(key)]
if ss != nil {
i.mu.Unlock()
ss.AssignShard(shardID)
return nil
}
// Verify that the series will not exceed limit.
if !ignoreLimits {
if max := opt.Config.MaxSeriesPerDatabase; max > 0 && len(i.series)+1 > max {
i.mu.Unlock()
return errMaxSeriesPerDatabaseExceeded
}
}
// Set the in-memory ID for query processing on this shard.
// The series key and tags are cloned to prevent a memory leak.
series := NewSeries([]byte(string(key)), tags.Clone())
series.ID = i.lastID + 1
i.lastID++
series.SetMeasurement(m)
i.series[string(key)] = series
m.AddSeries(series)
series.AssignShard(shardID)
// Add the series to the series sketch.
i.seriesSketch.Add(key)
i.mu.Unlock()
return nil
}
// CreateMeasurementIndexIfNotExists creates or retrieves an in memory index
// object for the measurement
func (i *Index) CreateMeasurementIndexIfNotExists(name []byte) *Measurement {
name = escape.Unescape(name)
// See if the measurement exists using a read-lock
i.mu.RLock()
m := i.measurements[string(name)]
if m != nil {
i.mu.RUnlock()
return m
}
i.mu.RUnlock()
// Doesn't exist, so lock the index to create it
i.mu.Lock()
defer i.mu.Unlock()
// Check again in case the measurement was created in between the time we
// released our read-lock and acquired the write lock.
m = i.measurements[string(name)]
if m == nil {
m = NewMeasurement(i.database, string(name))
i.measurements[string(name)] = m
// Add the measurement to the measurements sketch.
i.measurementsSketch.Add([]byte(name))
}
return m
}
// HasTagKey returns true if tag key exists.
func (i *Index) HasTagKey(name, key []byte) (bool, error) {
i.mu.RLock()
mm := i.measurements[string(name)]
i.mu.RUnlock()
if mm == nil {
return false, nil
}
return mm.HasTagKey(string(key)), nil
}
// HasTagValue returns true if tag value exists.
func (i *Index) HasTagValue(name, key, value []byte) bool {
i.mu.RLock()
mm := i.measurements[string(name)]
i.mu.RUnlock()
if mm == nil {
return false
}
return mm.HasTagKeyValue(key, value)
}
// TagValueN returns the cardinality of a tag value.
func (i *Index) TagValueN(name, key []byte) int {
i.mu.RLock()
mm := i.measurements[string(name)]
i.mu.RUnlock()
if mm == nil {
return 0
}
return mm.CardinalityBytes(key)
}
// MeasurementTagKeysByExpr returns an ordered set of tag keys filtered by an expression.
func (i *Index) MeasurementTagKeysByExpr(name []byte, expr influxql.Expr) (map[string]struct{}, error) {
i.mu.RLock()
mm := i.measurements[string(name)]
i.mu.RUnlock()
if mm == nil {
return nil, nil
}
return mm.TagKeysByExpr(expr)
}
// MeasurementTagKeyValuesByExpr returns a set of tag values filtered by an expression.
//
// See tsm1.Engine.MeasurementTagKeyValuesByExpr for a fuller description of this
// method.
func (i *Index) MeasurementTagKeyValuesByExpr(name []byte, keys []string, expr influxql.Expr, keysSorted bool) ([][]string, error) {
i.mu.RLock()
mm := i.measurements[string(name)]
i.mu.RUnlock()
if mm == nil || len(keys) == 0 {
return nil, nil
}
results := make([][]string, len(keys))
// If we haven't been provided sorted keys, then we need to sort them.
if !keysSorted {
sort.Sort(sort.StringSlice(keys))
}
ids, _, _ := mm.WalkWhereForSeriesIds(expr)
if ids.Len() == 0 && expr == nil {
for ki, key := range keys {
values := mm.TagValues(key)
sort.Sort(sort.StringSlice(values))
results[ki] = values
}
return results, nil
}
// This is the case where we have filtered series by some WHERE condition.
// We only care about the tag values for the keys given the
// filtered set of series ids.
keyIdxs := make(map[string]int, len(keys))
for ki, key := range keys {
keyIdxs[key] = ki
}
resultSet := make([]stringSet, len(keys))
for i := 0; i < len(resultSet); i++ {
resultSet[i] = newStringSet()
}
// Iterate all series to collect tag values.
for _, id := range ids {
s := mm.SeriesByID(id)
if s == nil {
continue
}
// Iterate the tag keys we're interested in and collect values
// from this series, if they exist.
for _, t := range s.Tags() {
if idx, ok := keyIdxs[string(t.Key)]; ok {
resultSet[idx].add(string(t.Value))
} else if string(t.Key) > keys[len(keys)-1] {
// The tag key is > the largest key we're interested in.
break
}
}
}
for i, s := range resultSet {
results[i] = s.list()
}
return results, nil
}
// ForEachMeasurementTagKey iterates over all tag keys for a measurement.
func (i *Index) ForEachMeasurementTagKey(name []byte, fn func(key []byte) error) error {
// Ensure we do not hold a lock on the index while fn executes in case fn tries
// to acquire a lock on the index again. If another goroutine has Lock, this will
// deadlock.
i.mu.RLock()
mm := i.measurements[string(name)]
i.mu.RUnlock()
if mm == nil {
return nil
}
for _, key := range mm.TagKeys() {
if err := fn([]byte(key)); err != nil {
return err
}
}
return nil
}
// TagKeyCardinality returns the number of values for a measurement/tag key.
func (i *Index) TagKeyCardinality(name, key []byte) int {
i.mu.RLock()
mm := i.measurements[string(name)]
i.mu.RUnlock()
if mm == nil {
return 0
}
return mm.CardinalityBytes(key)
}
// TagsForSeries returns the tag map for the passed in series
func (i *Index) TagsForSeries(key string) (models.Tags, error) {
i.mu.RLock()
ss := i.series[key]
i.mu.RUnlock()
if ss == nil {
return nil, nil
}
return ss.Tags(), nil
}
// MeasurementNamesByExpr takes an expression containing only tags and returns a
// list of matching measurement names.
func (i *Index) MeasurementNamesByExpr(expr influxql.Expr) ([][]byte, error) {
i.mu.RLock()
defer i.mu.RUnlock()
// Return all measurement names if no expression is provided.
if expr == nil {
a := make([][]byte, 0, len(i.measurements))
for name := range i.measurements {
a = append(a, []byte(name))
}
bytesutil.Sort(a)
return a, nil
}
return i.measurementNamesByExpr(expr)
}
func (i *Index) measurementNamesByExpr(expr influxql.Expr) ([][]byte, error) {
if expr == nil {
return nil, nil
}
switch e := expr.(type) {
case *influxql.BinaryExpr:
switch e.Op {
case influxql.EQ, influxql.NEQ, influxql.EQREGEX, influxql.NEQREGEX:
tag, ok := e.LHS.(*influxql.VarRef)
if !ok {
return nil, fmt.Errorf("left side of '%s' must be a tag key", e.Op.String())
}
tf := &TagFilter{
Op: e.Op,
Key: tag.Val,
}
if influxql.IsRegexOp(e.Op) {
re, ok := e.RHS.(*influxql.RegexLiteral)
if !ok {
return nil, fmt.Errorf("right side of '%s' must be a regular expression", e.Op.String())
}
tf.Regex = re.Val
} else {
s, ok := e.RHS.(*influxql.StringLiteral)
if !ok {
return nil, fmt.Errorf("right side of '%s' must be a tag value string", e.Op.String())
}
tf.Value = s.Val
}
// Match on name, if specified.
if tag.Val == "_name" {
return i.measurementNamesByNameFilter(tf.Op, tf.Value, tf.Regex), nil
} else if influxql.IsSystemName(tag.Val) {
return nil, nil
}
return i.measurementNamesByTagFilters(tf), nil
case influxql.OR, influxql.AND:
lhs, err := i.measurementNamesByExpr(e.LHS)
if err != nil {
return nil, err
}
rhs, err := i.measurementNamesByExpr(e.RHS)
if err != nil {
return nil, err
}
if e.Op == influxql.OR {
return bytesutil.Union(lhs, rhs), nil
}
return bytesutil.Intersect(lhs, rhs), nil
default:
return nil, fmt.Errorf("invalid tag comparison operator")
}
case *influxql.ParenExpr:
return i.measurementNamesByExpr(e.Expr)
}
return nil, fmt.Errorf("%#v", expr)
}
// measurementNamesByNameFilter returns the sorted measurements matching a name.
func (i *Index) measurementNamesByNameFilter(op influxql.Token, val string, regex *regexp.Regexp) [][]byte {
var names [][]byte
for _, m := range i.measurements {
var matched bool
switch op {
case influxql.EQ:
matched = m.Name == val
case influxql.NEQ:
matched = m.Name != val
case influxql.EQREGEX:
matched = regex.MatchString(m.Name)
case influxql.NEQREGEX:
matched = !regex.MatchString(m.Name)
}
if !matched {
continue
}
names = append(names, []byte(m.Name))
}
bytesutil.Sort(names)
return names
}
// measurementNamesByTagFilters returns the sorted measurements matching the filters on tag values.
func (i *Index) measurementNamesByTagFilters(filter *TagFilter) [][]byte {
// Build a list of measurements matching the filters.
var names [][]byte
var tagMatch bool
// Iterate through all measurements in the database.
for _, m := range i.measurements {
tagVals := m.SeriesByTagKeyValue(filter.Key)
if tagVals == nil {
continue
}
tagMatch = false
// If the operator is non-regex, only check the specified value.
if filter.Op == influxql.EQ || filter.Op == influxql.NEQ {
if _, ok := tagVals[filter.Value]; ok {
tagMatch = true
}
} else {
// Else, the operator is a regex and we have to check all tag
// values against the regular expression.
for tagVal := range tagVals {
if filter.Regex.MatchString(tagVal) {
tagMatch = true
continue
}
}
}
//
// XNOR gate
//
// tags match | operation is EQ | measurement matches
// --------------------------------------------------
// True | True | True
// True | False | False
// False | True | False
// False | False | True
if tagMatch == (filter.Op == influxql.EQ || filter.Op == influxql.EQREGEX) {
names = append(names, []byte(m.Name))
continue
}
}
bytesutil.Sort(names)
return names
}
// MeasurementNamesByRegex returns the measurements that match the regex.
func (i *Index) MeasurementNamesByRegex(re *regexp.Regexp) ([][]byte, error) {
i.mu.RLock()
defer i.mu.RUnlock()
var matches [][]byte
for _, m := range i.measurements {
if re.MatchString(m.Name) {
matches = append(matches, []byte(m.Name))
}
}
return matches, nil
}
// DropMeasurement removes the measurement and all of its underlying
// series from the database index
func (i *Index) DropMeasurement(name []byte) error {
i.mu.Lock()
defer i.mu.Unlock()
return i.dropMeasurement(string(name))
}
func (i *Index) dropMeasurement(name string) error {
// Update the tombstone sketch.
i.measurementsTSSketch.Add([]byte(name))
m := i.measurements[name]
if m == nil {
return nil
}
delete(i.measurements, name)
for _, s := range m.SeriesByIDMap() {
delete(i.series, s.Key)
i.seriesTSSketch.Add([]byte(s.Key))
}
return nil
}
// DropSeries removes the series key and its tags from the index.
func (i *Index) DropSeries(key []byte) error {
if key == nil {
return nil
}
i.mu.Lock()
k := string(key)
series := i.series[k]
if series == nil {
i.mu.Unlock()
return nil
}
// Update the tombstone sketch.
i.seriesTSSketch.Add([]byte(k))
// Remove from the index.
delete(i.series, k)
// Remove the measurement's reference.
series.Measurement().DropSeries(series)
// If the measurement no longer has any series, remove it as well.
if !series.Measurement().HasSeries() {
i.dropMeasurement(series.Measurement().Name)
}
i.mu.Unlock()
return nil
}
// ForEachMeasurementSeriesByExpr iterates over all series in a measurement filtered by an expression.
func (i *Index) ForEachMeasurementSeriesByExpr(name []byte, expr influxql.Expr, fn func(tags models.Tags) error) error {
i.mu.RLock()
mm := i.measurements[string(name)]
i.mu.RUnlock()
if mm == nil {
return nil
}
if err := mm.ForEachSeriesByExpr(expr, fn); err != nil {
return err
}
return nil
}
// TagSets returns a list of tag sets.
func (i *Index) TagSets(shardID uint64, name []byte, opt influxql.IteratorOptions) ([]*influxql.TagSet, error) {
i.mu.RLock()
defer i.mu.RUnlock()
mm := i.measurements[string(name)]
if mm == nil {
return nil, nil
}
tagSets, err := mm.TagSets(shardID, opt)
if err != nil {
return nil, err
}
return tagSets, nil
}
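// SeriesKeys returns the keys of all series in the index, in no particular order.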
func (i *Index) SeriesKeys() []string {
i.mu.RLock()
s := make([]string, 0, len(i.series))
for k := range i.series {
s = append(s, k)
}
i.mu.RUnlock()
return s
}
// SetFieldSet sets a shared field set from the engine.
func (i *Index) SetFieldSet(*tsdb.MeasurementFieldSet) {}
// SetFieldName adds a field name to a measurement.
func (i *Index) SetFieldName(measurement []byte, name string) {
m := i.CreateMeasurementIndexIfNotExists(measurement)
m.SetFieldName(name)
}
// ForEachMeasurementName iterates over each measurement name.
func (i *Index) ForEachMeasurementName(fn func(name []byte) error) error {
i.mu.RLock()
defer i.mu.RUnlock()
mms := make(Measurements, 0, len(i.measurements))
for _, m := range i.measurements {
mms = append(mms, m)
}
sort.Sort(mms)
for _, m := range mms {
if err := fn([]byte(m.Name)); err != nil {
return err
}
}
return nil
}
func (i *Index) MeasurementSeriesKeysByExpr(name []byte, condition influxql.Expr) ([][]byte, error) {
i.mu.RLock()
defer i.mu.RUnlock()
m := i.measurements[string(name)]
if m == nil {
return nil, nil
}
// Return all series if no condition specified.
if condition == nil {
return m.SeriesKeys(), nil
}
// Get series IDs that match the WHERE clause.
ids, filters, err := m.WalkWhereForSeriesIds(condition)
if err != nil {
return nil, err
}
// Delete boolean literal true filter expressions.
// These are returned for `WHERE tagKey = 'tagVal'` type expressions and are okay.
filters.DeleteBoolLiteralTrues()
// Check for unsupported field filters.
// Any remaining filters means there were fields (e.g., `WHERE value = 1.2`).
if filters.Len() > 0 {
return nil, errors.New("fields not supported in WHERE clause during deletion")
}
return m.SeriesKeysByID(ids), nil
}
// SeriesPointIterator returns an influxql iterator over all series.
func (i *Index) SeriesPointIterator(opt influxql.IteratorOptions) (influxql.Iterator, error) {
// Read and sort all measurements.
mms := make(Measurements, 0, len(i.measurements))
for _, mm := range i.measurements {
mms = append(mms, mm)
}
sort.Sort(mms)
return &seriesPointIterator{
mms: mms,
point: influxql.FloatPoint{
Aux: make([]interface{}, len(opt.Aux)),
},
opt: opt,
}, nil
}
// SnapshotTo is a no-op since this is an in-memory index.
func (i *Index) SnapshotTo(path string) error { return nil }
// AssignShard updates the index to indicate that series k exists in the given shardID.
func (i *Index) AssignShard(k string, shardID uint64) {
ss, _ := i.Series([]byte(k))
if ss != nil {
ss.AssignShard(shardID)
}
}
// UnassignShard updates the index to indicate that series k does not exist in
// the given shardID.
func (i *Index) UnassignShard(k string, shardID uint64) error {
ss, _ := i.Series([]byte(k))
if ss != nil {
if ss.Assigned(shardID) {
// Remove the shard from any series
ss.UnassignShard(shardID)
// If this series no longer has shards assigned, remove the series
if ss.ShardN() == 0 {
// Remove the series key from the index.
return i.DropSeries([]byte(k))
}
}
}
return nil
}
// RemoveShard removes all references to shardID from any series or measurements
// in the index. If the shard was the only owner of data for the series, the series
// is removed from the index.
func (i *Index) RemoveShard(shardID uint64) {
for _, k := range i.SeriesKeys() {
i.UnassignShard(k, shardID)
}
}
// assignExistingSeries assigns the existing series to shardID and returns the series, names and tags that
// do not exist yet.
func (i *Index) assignExistingSeries(shardID uint64, keys, names [][]byte, tagsSlice []models.Tags) ([][]byte, [][]byte, []models.Tags) {
i.mu.RLock()
var n int
for j, key := range keys {
if ss, ok := i.series[string(key)]; !ok {
keys[n] = keys[j]
names[n] = names[j]
tagsSlice[n] = tagsSlice[j]
n++
} else {
ss.AssignShard(shardID)
}
}
i.mu.RUnlock()
return keys[:n], names[:n], tagsSlice[:n]
}
// Ensure index implements interface.
var _ tsdb.Index = &ShardIndex{}
// ShardIndex represents a shim between the TSDB index interface and the shared
// in-memory index. This is required because per-shard in-memory indexes will
// grow the heap size too large.
type ShardIndex struct {
*Index
id uint64 // shard id
opt tsdb.EngineOptions
}
// CreateSeriesListIfNotExists creates the given series, in bulk, if they don't already exist.
func (idx *ShardIndex) CreateSeriesListIfNotExists(keys, names [][]byte, tagsSlice []models.Tags) error {
keys, names, tagsSlice = idx.assignExistingSeries(idx.id, keys, names, tagsSlice)
if len(keys) == 0 {
return nil
}
var reason string
var dropped int
var droppedKeys map[string]struct{}
// Ensure that no tags go over the maximum cardinality.
if maxValuesPerTag := idx.opt.Config.MaxValuesPerTag; maxValuesPerTag > 0 {
var n int
outer:
for i, name := range names {
tags := tagsSlice[i]
for _, tag := range tags {
// Skip if the tag value already exists.
if idx.HasTagValue(name, tag.Key, tag.Value) {
continue
}
// Read cardinality. Skip if we're below the threshold.
n := idx.TagValueN(name, tag.Key)
if n < maxValuesPerTag {
continue
}
dropped++
reason = fmt.Sprintf("max-values-per-tag limit exceeded (%d/%d): measurement=%q tag=%q value=%q",
n, maxValuesPerTag, name, string(tag.Key), string(tag.Value))
if droppedKeys == nil {
droppedKeys = make(map[string]struct{})
}
droppedKeys[string(keys[i])] = struct{}{}
continue outer
}
// Increment success count if all checks complete.
keys[n], names[n], tagsSlice[n] = keys[i], names[i], tagsSlice[i]
n++
}
// Slice to only include successful points.
keys, names, tagsSlice = keys[:n], names[:n], tagsSlice[:n]
}
// Write
for i := range keys {
if err := idx.CreateSeriesIfNotExists(keys[i], names[i], tagsSlice[i]); err == errMaxSeriesPerDatabaseExceeded {
dropped++
reason = fmt.Sprintf("max-series-per-database limit exceeded: (%d)", idx.opt.Config.MaxSeriesPerDatabase)
if droppedKeys == nil {
droppedKeys = make(map[string]struct{})
}
droppedKeys[string(keys[i])] = struct{}{}
continue
} else if err != nil {
return err
}
}
// Report partial writes back to shard.
if dropped > 0 {
return &tsdb.PartialWriteError{
Reason: reason,
Dropped: dropped,
DroppedKeys: droppedKeys,
}
}
return nil
}
// InitializeSeries is called during startup.
// This works the same as CreateSeriesIfNotExists except it ignores limit errors.
func (i *ShardIndex) InitializeSeries(key, name []byte, tags models.Tags) error {
return i.Index.CreateSeriesIfNotExists(i.id, key, name, tags, &i.opt, true)
}
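// CreateSeriesIfNotExists creates the series for this shard, enforcing the configured limits.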
func (i *ShardIndex) CreateSeriesIfNotExists(key, name []byte, tags models.Tags) error {
return i.Index.CreateSeriesIfNotExists(i.id, key, name, tags, &i.opt, false)
}
// TagSets returns a list of tag sets based on series filtering.
func (i *ShardIndex) TagSets(name []byte, opt influxql.IteratorOptions) ([]*influxql.TagSet, error) {
return i.Index.TagSets(i.id, name, opt)
}
// NewShardIndex returns a new index for a shard.
func NewShardIndex(id uint64, database, path string, opt tsdb.EngineOptions) tsdb.Index {
return &ShardIndex{
Index: opt.InmemIndex.(*Index),
id: id,
opt: opt,
}
}
// seriesPointIterator emits series as influxql points.
type seriesPointIterator struct {
mms Measurements
keys struct {
buf []string
i int
}
point influxql.FloatPoint // reusable point
opt influxql.IteratorOptions
}
// Stats returns stats about the points processed.
func (itr *seriesPointIterator) Stats() influxql.IteratorStats { return influxql.IteratorStats{} }
// Close closes the iterator.
func (itr *seriesPointIterator) Close() error { return nil }
// Next emits the next point in the iterator.
func (itr *seriesPointIterator) Next() (*influxql.FloatPoint, error) {
for {
// Load next measurement's keys if there are no more remaining.
if itr.keys.i >= len(itr.keys.buf) {
if err := itr.nextKeys(); err != nil {
return nil, err
}
if len(itr.keys.buf) == 0 {
return nil, nil
}
}
// Read the next key.
key := itr.keys.buf[itr.keys.i]
itr.keys.i++
// Write auxiliary fields.
for i, f := range itr.opt.Aux {
switch f.Val {
case "key":
itr.point.Aux[i] = key
}
}
return &itr.point, nil
}
}
// nextKeys reads all keys for the next measurement.
func (itr *seriesPointIterator) nextKeys() error {
for {
// Ensure previous keys are cleared out.
itr.keys.i, itr.keys.buf = 0, itr.keys.buf[:0]
// Read next measurement.
if len(itr.mms) == 0 {
return nil
}
mm := itr.mms[0]
itr.mms = itr.mms[1:]
// Read all series keys.
ids, err := mm.SeriesIDsAllOrByExpr(itr.opt.Condition)
if err != nil {
return err
} else if len(ids) == 0 {
continue
}
itr.keys.buf = mm.AppendSeriesKeysByID(itr.keys.buf, ids)
sort.Strings(itr.keys.buf)
return nil
}
}
// errMaxSeriesPerDatabaseExceeded is a marker error returned during series creation
// to indicate that a new series would exceed the limits of the database.
var errMaxSeriesPerDatabaseExceeded = errors.New("max series per database exceeded")

File diff suppressed because it is too large


@@ -0,0 +1,258 @@
package inmem_test
import (
"fmt"
"strings"
"testing"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/tsdb/index/inmem"
)
// Test comparing SeriesIDs for equality.
func TestSeriesIDs_Equals(t *testing.T) {
ids1 := inmem.SeriesIDs([]uint64{1, 2, 3})
ids2 := inmem.SeriesIDs([]uint64{1, 2, 3})
ids3 := inmem.SeriesIDs([]uint64{4, 5, 6})
if !ids1.Equals(ids2) {
t.Fatal("expected ids1 == ids2")
} else if ids1.Equals(ids3) {
t.Fatal("expected ids1 != ids3")
}
}
// Test intersecting sets of SeriesIDs.
func TestSeriesIDs_Intersect(t *testing.T) {
// Test swapping l & r, all branches of if-else, and exit loop when 'j < len(r)'
ids1 := inmem.SeriesIDs([]uint64{1, 3, 4, 5, 6})
ids2 := inmem.SeriesIDs([]uint64{1, 2, 3, 7})
exp := inmem.SeriesIDs([]uint64{1, 3})
got := ids1.Intersect(ids2)
if !exp.Equals(got) {
t.Fatalf("exp=%v, got=%v", exp, got)
}
// Test exit for loop when 'i < len(l)'
ids1 = inmem.SeriesIDs([]uint64{1})
ids2 = inmem.SeriesIDs([]uint64{1, 2})
exp = inmem.SeriesIDs([]uint64{1})
got = ids1.Intersect(ids2)
if !exp.Equals(got) {
t.Fatalf("exp=%v, got=%v", exp, got)
}
}
// Test union sets of SeriesIDs.
func TestSeriesIDs_Union(t *testing.T) {
// Test all branches of if-else, exit loop because of 'j < len(r)', and append remainder from left.
ids1 := inmem.SeriesIDs([]uint64{1, 2, 3, 7})
ids2 := inmem.SeriesIDs([]uint64{1, 3, 4, 5, 6})
exp := inmem.SeriesIDs([]uint64{1, 2, 3, 4, 5, 6, 7})
got := ids1.Union(ids2)
if !exp.Equals(got) {
t.Fatalf("exp=%v, got=%v", exp, got)
}
// Test exit because of 'i < len(l)' and append remainder from right.
ids1 = inmem.SeriesIDs([]uint64{1})
ids2 = inmem.SeriesIDs([]uint64{1, 2})
exp = inmem.SeriesIDs([]uint64{1, 2})
got = ids1.Union(ids2)
if !exp.Equals(got) {
t.Fatalf("exp=%v, got=%v", exp, got)
}
}
// Test removing one set of SeriesIDs from another.
func TestSeriesIDs_Reject(t *testing.T) {
// Test all branches of if-else, exit loop because of 'j < len(r)', and append remainder from left.
ids1 := inmem.SeriesIDs([]uint64{1, 2, 3, 7})
ids2 := inmem.SeriesIDs([]uint64{1, 3, 4, 5, 6})
exp := inmem.SeriesIDs([]uint64{2, 7})
got := ids1.Reject(ids2)
if !exp.Equals(got) {
t.Fatalf("exp=%v, got=%v", exp, got)
}
// Test exit because of 'i < len(l)'.
ids1 = inmem.SeriesIDs([]uint64{1})
ids2 = inmem.SeriesIDs([]uint64{1, 2})
exp = inmem.SeriesIDs{}
got = ids1.Reject(ids2)
if !exp.Equals(got) {
t.Fatalf("exp=%v, got=%v", exp, got)
}
}
func TestMeasurement_AppendSeriesKeysByID_Missing(t *testing.T) {
m := inmem.NewMeasurement("foo", "cpu")
var dst []string
dst = m.AppendSeriesKeysByID(dst, []uint64{1})
if exp, got := 0, len(dst); exp != got {
t.Fatalf("series len mismatch: exp %v, got %v", exp, got)
}
}
func TestMeasurement_AppendSeriesKeysByID_Exists(t *testing.T) {
m := inmem.NewMeasurement("foo", "cpu")
s := inmem.NewSeries([]byte("cpu,host=foo"), models.Tags{models.NewTag([]byte("host"), []byte("foo"))})
s.ID = 1
m.AddSeries(s)
var dst []string
dst = m.AppendSeriesKeysByID(dst, []uint64{1})
if exp, got := 1, len(dst); exp != got {
t.Fatalf("series len mismatch: exp %v, got %v", exp, got)
}
if exp, got := "cpu,host=foo", dst[0]; exp != got {
t.Fatalf("series mismatch: exp %v, got %v", exp, got)
}
}
func TestMeasurement_TagsSet_Deadlock(t *testing.T) {
m := inmem.NewMeasurement("foo", "cpu")
s1 := inmem.NewSeries([]byte("cpu,host=foo"), models.Tags{models.NewTag([]byte("host"), []byte("foo"))})
s1.ID = 1
m.AddSeries(s1)
s2 := inmem.NewSeries([]byte("cpu,host=bar"), models.Tags{models.NewTag([]byte("host"), []byte("bar"))})
s2.ID = 2
m.AddSeries(s2)
m.DropSeries(s1)
// This was deadlocking
m.TagSets(1, influxql.IteratorOptions{})
if got, exp := len(m.SeriesIDs()), 1; got != exp {
t.Fatalf("series count mismatch: got %v, exp %v", got, exp)
}
}
func TestMeasurement_ForEachSeriesByExpr_Deadlock(t *testing.T) {
m := inmem.NewMeasurement("foo", "cpu")
s1 := inmem.NewSeries([]byte("cpu,host=foo"), models.Tags{models.NewTag([]byte("host"), []byte("foo"))})
s1.ID = 1
m.AddSeries(s1)
s2 := inmem.NewSeries([]byte("cpu,host=bar"), models.Tags{models.NewTag([]byte("host"), []byte("bar"))})
s2.ID = 2
m.AddSeries(s2)
m.DropSeries(s1)
// This was deadlocking
m.ForEachSeriesByExpr(nil, func(tags models.Tags) error {
return nil
})
if got, exp := len(m.SeriesIDs()), 1; got != exp {
t.Fatalf("series count mismatch: got %v, exp %v", got, exp)
}
}
func BenchmarkMeasurement_SeriesIDForExp_EQRegex(b *testing.B) {
m := inmem.NewMeasurement("foo", "cpu")
for i := 0; i < 100000; i++ {
s := inmem.NewSeries([]byte("cpu"), models.Tags{models.NewTag(
[]byte("host"),
[]byte(fmt.Sprintf("host%d", i)))})
s.ID = uint64(i)
m.AddSeries(s)
}
if exp, got := 100000, len(m.SeriesKeys()); exp != got {
b.Fatalf("series count mismatch: exp %v got %v", exp, got)
}
stmt, err := influxql.NewParser(strings.NewReader(`SELECT * FROM cpu WHERE host =~ /host\d+/`)).ParseStatement()
if err != nil {
b.Fatalf("invalid statement: %s", err)
}
selectStmt := stmt.(*influxql.SelectStatement)
b.ResetTimer()
for i := 0; i < b.N; i++ {
ids := m.IDsForExpr(selectStmt.Condition.(*influxql.BinaryExpr))
if exp, got := 100000, len(ids); exp != got {
b.Fatalf("series count mismatch: exp %v got %v", exp, got)
}
}
}
func BenchmarkMeasurement_SeriesIDForExp_NERegex(b *testing.B) {
m := inmem.NewMeasurement("foo", "cpu")
for i := 0; i < 100000; i++ {
s := inmem.NewSeries([]byte("cpu"), models.Tags{models.Tag{
Key: []byte("host"),
Value: []byte(fmt.Sprintf("host%d", i))}})
s.ID = uint64(i)
m.AddSeries(s)
}
if exp, got := 100000, len(m.SeriesKeys()); exp != got {
b.Fatalf("series count mismatch: exp %v got %v", exp, got)
}
stmt, err := influxql.NewParser(strings.NewReader(`SELECT * FROM cpu WHERE host !~ /foo\d+/`)).ParseStatement()
if err != nil {
b.Fatalf("invalid statement: %s", err)
}
selectStmt := stmt.(*influxql.SelectStatement)
b.ResetTimer()
for i := 0; i < b.N; i++ {
ids := m.IDsForExpr(selectStmt.Condition.(*influxql.BinaryExpr))
if exp, got := 100000, len(ids); exp != got {
b.Fatalf("series count mismatch: exp %v got %v", exp, got)
}
}
}
func benchmarkTagSets(b *testing.B, n int, opt influxql.IteratorOptions) {
m := inmem.NewMeasurement("foo", "m")
for i := 0; i < n; i++ {
tags := map[string]string{"tag1": "value1", "tag2": "value2"}
s := inmem.NewSeries([]byte("m,tag1=value1,tag2=value2"), models.NewTags(tags))
s.ID = uint64(i)
s.AssignShard(0)
m.AddSeries(s)
}
// warm caches
m.TagSets(0, opt)
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
m.TagSets(0, opt)
}
}
func BenchmarkMeasurement_TagSetsNoDimensions_1000(b *testing.B) {
benchmarkTagSets(b, 1000, influxql.IteratorOptions{})
}
func BenchmarkMeasurement_TagSetsDimensions_1000(b *testing.B) {
benchmarkTagSets(b, 1000, influxql.IteratorOptions{Dimensions: []string{"tag1", "tag2"}})
}
func BenchmarkMeasurement_TagSetsNoDimensions_100000(b *testing.B) {
benchmarkTagSets(b, 100000, influxql.IteratorOptions{})
}
func BenchmarkMeasurement_TagSetsDimensions_100000(b *testing.B) {
benchmarkTagSets(b, 100000, influxql.IteratorOptions{Dimensions: []string{"tag1", "tag2"}})
}


@@ -0,0 +1,71 @@
package internal
import (
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/pkg/bloom"
"github.com/influxdata/influxdb/pkg/estimator"
"github.com/influxdata/influxdb/tsdb/index/tsi1"
)
// File is a mock implementation of a tsi1.File.
type File struct {
Closef func() error
Pathf func() string
IDf func() int
Levelf func() int
Measurementf func(name []byte) tsi1.MeasurementElem
MeasurementIteratorf func() tsi1.MeasurementIterator
HasSeriesf func(name []byte, tags models.Tags, buf []byte) (exists, tombstoned bool)
Seriesf func(name []byte, tags models.Tags) tsi1.SeriesElem
SeriesNf func() uint64
TagKeyf func(name, key []byte) tsi1.TagKeyElem
TagKeyIteratorf func(name []byte) tsi1.TagKeyIterator
TagValuef func(name, key, value []byte) tsi1.TagValueElem
TagValueIteratorf func(name, key []byte) tsi1.TagValueIterator
SeriesIteratorf func() tsi1.SeriesIterator
MeasurementSeriesIteratorf func(name []byte) tsi1.SeriesIterator
TagKeySeriesIteratorf func(name, key []byte) tsi1.SeriesIterator
TagValueSeriesIteratorf func(name, key, value []byte) tsi1.SeriesIterator
MergeSeriesSketchesf func(s, t estimator.Sketch) error
MergeMeasurementsSketchesf func(s, t estimator.Sketch) error
Retainf func()
Releasef func()
Filterf func() *bloom.Filter
}
func (f *File) Close() error { return f.Closef() }
func (f *File) Path() string { return f.Pathf() }
func (f *File) ID() int { return f.IDf() }
func (f *File) Level() int { return f.Levelf() }
func (f *File) Measurement(name []byte) tsi1.MeasurementElem { return f.Measurementf(name) }
func (f *File) MeasurementIterator() tsi1.MeasurementIterator { return f.MeasurementIteratorf() }
func (f *File) HasSeries(name []byte, tags models.Tags, buf []byte) (exists, tombstoned bool) {
return f.HasSeriesf(name, tags, buf)
}
func (f *File) Series(name []byte, tags models.Tags) tsi1.SeriesElem { return f.Seriesf(name, tags) }
func (f *File) SeriesN() uint64 { return f.SeriesNf() }
func (f *File) TagKey(name, key []byte) tsi1.TagKeyElem { return f.TagKeyf(name, key) }
func (f *File) TagKeyIterator(name []byte) tsi1.TagKeyIterator { return f.TagKeyIteratorf(name) }
func (f *File) TagValue(name, key, value []byte) tsi1.TagValueElem {
return f.TagValuef(name, key, value)
}
func (f *File) TagValueIterator(name, key []byte) tsi1.TagValueIterator {
return f.TagValueIteratorf(name, key)
}
func (f *File) SeriesIterator() tsi1.SeriesIterator { return f.SeriesIteratorf() }
func (f *File) MeasurementSeriesIterator(name []byte) tsi1.SeriesIterator {
return f.MeasurementSeriesIteratorf(name)
}
func (f *File) TagKeySeriesIterator(name, key []byte) tsi1.SeriesIterator {
return f.TagKeySeriesIteratorf(name, key)
}
func (f *File) TagValueSeriesIterator(name, key, value []byte) tsi1.SeriesIterator {
return f.TagValueSeriesIteratorf(name, key, value)
}
func (f *File) MergeSeriesSketches(s, t estimator.Sketch) error { return f.MergeSeriesSketchesf(s, t) }
func (f *File) MergeMeasurementsSketches(s, t estimator.Sketch) error {
return f.MergeMeasurementsSketchesf(s, t)
}
func (f *File) Retain() { f.Retainf() }
func (f *File) Release() { f.Releasef() }
func (f *File) Filter() *bloom.Filter { return f.Filterf() }


@@ -0,0 +1,238 @@
/*
Package tsi1 provides a memory-mapped index implementation that supports
high cardinality series.
Overview
The top-level object in tsi1 is the Index. It is the primary access point from
the rest of the system. The Index is composed of LogFile and IndexFile objects.
Log files are small write-ahead log files that record new series immediately
in the order that they are received. The data within the file is indexed
in-memory so it can be quickly accessed. When the system is restarted, this log
file is replayed and the in-memory representation is rebuilt.
Index files also contain series information; however, they are highly indexed
so that reads can be performed quickly. Index files are built through a process
called compaction where a log file or multiple index files are merged together.
Operations
The index can perform many tasks related to series, measurement, & tag data.
All data is inserted by adding a series to the index. When adding a series,
the measurement, tag keys, and tag values are all extracted and indexed
separately.
Once a series has been added, it can be removed in several ways. First, the
individual series can be removed. Second, it can be removed as part of a bulk
operation by deleting the entire measurement.
The query engine needs to be able to look up series in a variety of ways such
as by measurement name, by tag value, or by using regular expressions. The
index provides an API to iterate over subsets of series and perform set
operations such as unions and intersections.
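As an illustrative sketch (not part of the original documentation), iterators
from multiple files compose with the set helpers defined in this package:
	// Union of two series iterators, with tombstoned series filtered out.
	itr := FilterUndeletedSeriesIterator(MergeSeriesIterators(a, b))
	// Intersection and union of filtered iterators.
	itr = IntersectSeriesIterators(litr, ritr)
	itr = UnionSeriesIterators(litr, ritr)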
Log File Layout
The write-ahead file that series are initially inserted into appends all new
operations sequentially. It is composed of a series of log entries. An entry
contains a flag to specify the operation type, the measurement name, the tag
set, and a checksum.
┏━━━━━━━━━LogEntry━━━━━━━━━┓
┃ ┌──────────────────────┐ ┃
┃ │ Flag │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Measurement │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Key/Value │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Key/Value │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Key/Value │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Checksum │ ┃
┃ └──────────────────────┘ ┃
┗━━━━━━━━━━━━━━━━━━━━━━━━━━┛
When the log file is replayed, if the checksum is incorrect or the entry is
incomplete (because of a partially failed write) then the log is truncated.
Index File Layout
The index file is composed of 3 main block types: one series block, one or more
tag blocks, and one measurement block. At the end of the index file is a
trailer that records metadata such as the offsets to these blocks.
Series Block Layout
The series block stores raw series keys in sorted order. It also provides hash
indexes so that series can be looked up quickly. Hash indexes are inserted
periodically so that memory size is limited at write time. Once all the series
and hash indexes have been written, a list of index entries is written so that
hash indexes can be looked up via binary search.
The end of the block contains two HyperLogLog++ sketches which track the
estimated number of created series and deleted series. After the sketches is
a trailer which contains metadata about the block.
┏━━━━━━━SeriesBlock━━━━━━━━┓
┃ ┌──────────────────────┐ ┃
┃ │ Series Key │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Series Key │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Series Key │ ┃
┃ ├──────────────────────┤ ┃
┃ │ │ ┃
┃ │ Hash Index │ ┃
┃ │ │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Series Key │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Series Key │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Series Key │ ┃
┃ ├──────────────────────┤ ┃
┃ │ │ ┃
┃ │ Hash Index │ ┃
┃ │ │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Index Entries │ ┃
┃ ├──────────────────────┤ ┃
┃ │ HLL Sketches │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Trailer │ ┃
┃ └──────────────────────┘ ┃
┗━━━━━━━━━━━━━━━━━━━━━━━━━━┛
Tag Block Layout
After the series block is one or more tag blocks. One of these blocks exists
for every measurement in the index file. The block is structured as a sorted
list of values for each key and then a sorted list of keys. Each of these lists
has its own hash index for fast direct lookups.
┏━━━━━━━━Tag Block━━━━━━━━━┓
┃ ┌──────────────────────┐ ┃
┃ │ Value │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Value │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Value │ ┃
┃ ├──────────────────────┤ ┃
┃ │ │ ┃
┃ │ Hash Index │ ┃
┃ │ │ ┃
┃ └──────────────────────┘ ┃
┃ ┌──────────────────────┐ ┃
┃ │ Value │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Value │ ┃
┃ ├──────────────────────┤ ┃
┃ │ │ ┃
┃ │ Hash Index │ ┃
┃ │ │ ┃
┃ └──────────────────────┘ ┃
┃ ┌──────────────────────┐ ┃
┃ │ Key │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Key │ ┃
┃ ├──────────────────────┤ ┃
┃ │ │ ┃
┃ │ Hash Index │ ┃
┃ │ │ ┃
┃ └──────────────────────┘ ┃
┃ ┌──────────────────────┐ ┃
┃ │ Trailer │ ┃
┃ └──────────────────────┘ ┃
┗━━━━━━━━━━━━━━━━━━━━━━━━━━┛
Each entry for values contains a sorted list of offsets for series keys that use
that value. Series iterators can be built around a single tag key value or
multiple iterators can be merged with set operators such as union or
intersection.
Measurement block
The measurement block stores a sorted list of measurements, their associated
series offsets, and the offset to their tag block. This allows all series for
a measurement to be traversed quickly and it allows fast direct lookups of
measurements and their tags.
This block also contains HyperLogLog++ sketches for new and deleted
measurements.
┏━━━━Measurement Block━━━━━┓
┃ ┌──────────────────────┐ ┃
┃ │ Measurement │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Measurement │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Measurement │ ┃
┃ ├──────────────────────┤ ┃
┃ │ │ ┃
┃ │ Hash Index │ ┃
┃ │ │ ┃
┃ ├──────────────────────┤ ┃
┃ │ HLL Sketches │ ┃
┃ ├──────────────────────┤ ┃
┃ │ Trailer │ ┃
┃ └──────────────────────┘ ┃
┗━━━━━━━━━━━━━━━━━━━━━━━━━━┛
Manifest file
The index is simply an ordered set of log and index files. These files can be
merged together or rewritten but their order must always be the same. This is
because series, measurements, & tags can be marked as deleted (aka tombstoned)
and this action needs to be tracked in time order.
Whenever the set of active files is changed, a manifest file is written to
track the set. The manifest specifies the ordering of files and, on startup,
all files not in the manifest are removed from the index directory.
Compacting index files
Compaction is the process of taking files and merging them together into a
single file. There are two stages of compaction within TSI.
First, once log files exceed a size threshold then they are compacted into an
index file. This threshold is relatively small because log files must maintain
their index in the heap, which TSI tries to avoid. Small log files are also very
quick to convert into an index file so this is done aggressively.
Second, once a contiguous set of index files exceed a factor (e.g. 10x) then
they are all merged together into a single index file and the old files are
discarded. Because all blocks are written in sorted order, the new index file
can be streamed, minimizing memory use.
Concurrency
Index files are immutable, so they do not require fine-grained locks. However,
compactions require that we track which files are in use so that they are not
discarded too soon. This is done by using reference counting with file sets.
A file set is simply an ordered list of index files. When the current file set
is obtained from the index, a counter is incremented to track its usage. Once
the user is done with the file set, it is released and the counter is
decremented. A file cannot be removed from the file system until this counter
returns to zero.
Besides the reference counting, there are no other locking mechanisms when
reading or writing index files. Log files, however, do require a lock whenever
they are accessed. This is another reason to minimize log file size.
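A minimal usage sketch of the reference-counting contract (mirroring the test
helpers used elsewhere in this repository; idx is assumed to expose
RetainFileSet):
	fs := idx.RetainFileSet() // increments the ref count on every file
	defer fs.Release()        // releases the files once the caller is done
	itr := fs.SeriesIterator()
	for e := itr.Next(); e != nil; e = itr.Next() {
		// read series while the underlying files cannot be deleted
	}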
*/
package tsi1


@@ -0,0 +1,998 @@
package tsi1
import (
"bytes"
"errors"
"fmt"
"regexp"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/pkg/bloom"
"github.com/influxdata/influxdb/pkg/bytesutil"
"github.com/influxdata/influxdb/pkg/estimator"
"github.com/influxdata/influxdb/pkg/estimator/hll"
"github.com/influxdata/influxdb/tsdb"
)
// FileSet represents a collection of files.
type FileSet struct {
levels []CompactionLevel
files []File
filters []*bloom.Filter // per-level filters
}
// NewFileSet returns a new instance of FileSet.
func NewFileSet(levels []CompactionLevel, files []File) (*FileSet, error) {
fs := &FileSet{levels: levels, files: files}
if err := fs.buildFilters(); err != nil {
return nil, err
}
return fs, nil
}
// Close closes all the files in the file set.
func (fs FileSet) Close() error {
var err error
for _, f := range fs.files {
if e := f.Close(); e != nil && err == nil {
err = e
}
}
return err
}
// Retain adds a reference count to all files.
func (fs *FileSet) Retain() {
for _, f := range fs.files {
f.Retain()
}
}
// Release removes a reference count from all files.
func (fs *FileSet) Release() {
for _, f := range fs.files {
f.Release()
}
}
// Prepend returns a new file set with f added at the beginning.
func (fs *FileSet) Prepend(f File) (*FileSet, error) {
return NewFileSet(fs.levels, append([]File{f}, fs.files...))
}
// MustReplace swaps a list of files for a single file and returns a new file set.
// The caller should always guarantee that the files exist and are contiguous.
func (fs *FileSet) MustReplace(oldFiles []File, newFile File) *FileSet {
assert(len(oldFiles) > 0, "cannot replace empty files")
// Find index of first old file.
var i int
for ; i < len(fs.files); i++ {
if fs.files[i] == oldFiles[0] {
break
} else if i == len(fs.files)-1 {
panic("first replacement file not found")
}
}
// Ensure all old files are contiguous.
for j := range oldFiles {
if fs.files[i+j] != oldFiles[j] {
panic(fmt.Sprintf("cannot replace non-contiguous files: subset=%+v, fileset=%+v", Files(oldFiles).IDs(), Files(fs.files).IDs()))
}
}
// Copy to new fileset.
other := make([]File, len(fs.files)-len(oldFiles)+1)
copy(other[:i], fs.files[:i])
other[i] = newFile
copy(other[i+1:], fs.files[i+len(oldFiles):])
fs, err := NewFileSet(fs.levels, other)
if err != nil {
panic("cannot build file set: " + err.Error())
}
return fs
}
// MaxID returns the highest file identifier.
func (fs *FileSet) MaxID() int {
var max int
for _, f := range fs.files {
if i := f.ID(); i > max {
max = i
}
}
return max
}
// Files returns all files in the set.
func (fs *FileSet) Files() []File {
return fs.files
}
// LogFiles returns all log files from the file set.
func (fs *FileSet) LogFiles() []*LogFile {
var a []*LogFile
for _, f := range fs.files {
if f, ok := f.(*LogFile); ok {
a = append(a, f)
}
}
return a
}
// IndexFiles returns all index files from the file set.
func (fs *FileSet) IndexFiles() []*IndexFile {
var a []*IndexFile
for _, f := range fs.files {
if f, ok := f.(*IndexFile); ok {
a = append(a, f)
}
}
return a
}
// LastContiguousIndexFilesByLevel returns the last contiguous files by level.
// These can be used by the compaction scheduler.
func (fs *FileSet) LastContiguousIndexFilesByLevel(level int) []*IndexFile {
if level == 0 {
return nil
}
var a []*IndexFile
for i := len(fs.files) - 1; i >= 0; i-- {
f := fs.files[i]
// Ignore files above level, stop on files below level.
if level < f.Level() {
continue
} else if level > f.Level() {
break
}
a = append([]*IndexFile{f.(*IndexFile)}, a...)
}
return a
}
// SeriesIterator returns an iterator over all series in the index.
func (fs *FileSet) SeriesIterator() SeriesIterator {
a := make([]SeriesIterator, 0, len(fs.files))
for _, f := range fs.files {
itr := f.SeriesIterator()
if itr == nil {
continue
}
a = append(a, itr)
}
return FilterUndeletedSeriesIterator(MergeSeriesIterators(a...))
}
// Measurement returns a measurement by name.
func (fs *FileSet) Measurement(name []byte) MeasurementElem {
for _, f := range fs.files {
if e := f.Measurement(name); e == nil {
continue
} else if e.Deleted() {
return nil
} else {
return e
}
}
return nil
}
// MeasurementIterator returns an iterator over all measurements in the index.
func (fs *FileSet) MeasurementIterator() MeasurementIterator {
a := make([]MeasurementIterator, 0, len(fs.files))
for _, f := range fs.files {
itr := f.MeasurementIterator()
if itr != nil {
a = append(a, itr)
}
}
return FilterUndeletedMeasurementIterator(MergeMeasurementIterators(a...))
}
// MeasurementSeriesIterator returns an iterator over all non-tombstoned series
// in the index for the provided measurement.
func (fs *FileSet) MeasurementSeriesIterator(name []byte) SeriesIterator {
a := make([]SeriesIterator, 0, len(fs.files))
for _, f := range fs.files {
itr := f.MeasurementSeriesIterator(name)
if itr != nil {
a = append(a, itr)
}
}
return FilterUndeletedSeriesIterator(MergeSeriesIterators(a...))
}
// TagKeyIterator returns an iterator over all tag keys for a measurement.
func (fs *FileSet) TagKeyIterator(name []byte) TagKeyIterator {
a := make([]TagKeyIterator, 0, len(fs.files))
for _, f := range fs.files {
itr := f.TagKeyIterator(name)
if itr != nil {
a = append(a, itr)
}
}
return MergeTagKeyIterators(a...)
}
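// Illustrative predicate forms handled by MeasurementTagKeysByExpr below (a
// sketch, not from the original source); only the "_tagKey" system variable
// is considered:
//
//	_tagKey = 'region'                   // exact key match
//	_tagKey =~ /^reg/                    // regex key match
//	_tagKey = 'a' OR _tagKey = 'b'       // union of both key sets
//	_tagKey = 'a' AND _tagKey =~ /b/     // intersection of both key sets
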
// MeasurementTagKeysByExpr extracts the tag keys wanted by the expression.
func (fs *FileSet) MeasurementTagKeysByExpr(name []byte, expr influxql.Expr) (map[string]struct{}, error) {
switch e := expr.(type) {
case *influxql.BinaryExpr:
switch e.Op {
case influxql.EQ, influxql.NEQ, influxql.EQREGEX, influxql.NEQREGEX:
tag, ok := e.LHS.(*influxql.VarRef)
if !ok {
return nil, fmt.Errorf("left side of '%s' must be a tag key", e.Op.String())
} else if tag.Val != "_tagKey" {
return nil, nil
}
if influxql.IsRegexOp(e.Op) {
re, ok := e.RHS.(*influxql.RegexLiteral)
if !ok {
return nil, fmt.Errorf("right side of '%s' must be a regular expression", e.Op.String())
}
return fs.tagKeysByFilter(name, e.Op, nil, re.Val), nil
}
s, ok := e.RHS.(*influxql.StringLiteral)
if !ok {
return nil, fmt.Errorf("right side of '%s' must be a tag value string", e.Op.String())
}
return fs.tagKeysByFilter(name, e.Op, []byte(s.Val), nil), nil
case influxql.AND, influxql.OR:
lhs, err := fs.MeasurementTagKeysByExpr(name, e.LHS)
if err != nil {
return nil, err
}
rhs, err := fs.MeasurementTagKeysByExpr(name, e.RHS)
if err != nil {
return nil, err
}
if lhs != nil && rhs != nil {
if e.Op == influxql.OR {
return unionStringSets(lhs, rhs), nil
}
return intersectStringSets(lhs, rhs), nil
} else if lhs != nil {
return lhs, nil
} else if rhs != nil {
return rhs, nil
}
return nil, nil
default:
return nil, fmt.Errorf("invalid operator")
}
case *influxql.ParenExpr:
return fs.MeasurementTagKeysByExpr(name, e.Expr)
}
return nil, fmt.Errorf("%#v", expr)
}
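// Illustrative call of tagValuesByKeyAndExpr below (a sketch; the variable
// names are hypothetical):
//
//	sets, err := fs.tagValuesByKeyAndExpr(
//		[]byte("cpu"),
//		[]string{"host", "region"}, // keys must be in ascending order
//		cond, fieldset,
//	)
//	// sets[0] holds the "host" values, sets[1] the "region" values.
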
// tagValuesByKeyAndExpr retrieves tag values for the provided tag keys.
//
// tagValuesByKeyAndExpr returns sets of values for each key, indexable by the
// position of the tag key in the keys argument.
//
// N.B. tagValuesByKeyAndExpr relies on keys being sorted in ascending
// lexicographic order.
func (fs *FileSet) tagValuesByKeyAndExpr(name []byte, keys []string, expr influxql.Expr, fieldset *tsdb.MeasurementFieldSet) ([]map[string]struct{}, error) {
itr, err := fs.seriesByExprIterator(name, expr, fieldset.Fields(string(name)))
if err != nil {
return nil, err
} else if itr == nil {
return nil, nil
}
keyIdxs := make(map[string]int, len(keys))
for ki, key := range keys {
keyIdxs[key] = ki
// Check that keys are in order.
if ki > 0 && key < keys[ki-1] {
return nil, fmt.Errorf("keys %v are not in ascending order", keys)
}
}
resultSet := make([]map[string]struct{}, len(keys))
for i := 0; i < len(resultSet); i++ {
resultSet[i] = make(map[string]struct{})
}
// Iterate all series to collect tag values.
for e := itr.Next(); e != nil; e = itr.Next() {
for _, t := range e.Tags() {
if idx, ok := keyIdxs[string(t.Key)]; ok {
resultSet[idx][string(t.Value)] = struct{}{}
} else if string(t.Key) > keys[len(keys)-1] {
// The tag key is > the largest key we're interested in.
break
}
}
}
return resultSet, nil
}
// tagKeysByFilter will filter the tag keys for the measurement.
func (fs *FileSet) tagKeysByFilter(name []byte, op influxql.Token, val []byte, regex *regexp.Regexp) map[string]struct{} {
ss := make(map[string]struct{})
itr := fs.TagKeyIterator(name)
for e := itr.Next(); e != nil; e = itr.Next() {
var matched bool
switch op {
case influxql.EQ:
matched = bytes.Equal(e.Key(), val)
case influxql.NEQ:
matched = !bytes.Equal(e.Key(), val)
case influxql.EQREGEX:
matched = regex.Match(e.Key())
case influxql.NEQREGEX:
matched = !regex.Match(e.Key())
}
if !matched {
continue
}
ss[string(e.Key())] = struct{}{}
}
return ss
}
// TagKeySeriesIterator returns a series iterator for all values across a single key.
func (fs *FileSet) TagKeySeriesIterator(name, key []byte) SeriesIterator {
a := make([]SeriesIterator, 0, len(fs.files))
for _, f := range fs.files {
itr := f.TagKeySeriesIterator(name, key)
if itr != nil {
a = append(a, itr)
}
}
return FilterUndeletedSeriesIterator(MergeSeriesIterators(a...))
}
// HasTagKey returns true if the tag key exists.
func (fs *FileSet) HasTagKey(name, key []byte) bool {
for _, f := range fs.files {
if e := f.TagKey(name, key); e != nil {
return !e.Deleted()
}
}
return false
}
// HasTagValue returns true if the tag value exists.
func (fs *FileSet) HasTagValue(name, key, value []byte) bool {
for _, f := range fs.files {
if e := f.TagValue(name, key, value); e != nil {
return !e.Deleted()
}
}
return false
}
// TagValueIterator returns a value iterator for a tag key.
func (fs *FileSet) TagValueIterator(name, key []byte) TagValueIterator {
a := make([]TagValueIterator, 0, len(fs.files))
for _, f := range fs.files {
itr := f.TagValueIterator(name, key)
if itr != nil {
a = append(a, itr)
}
}
return MergeTagValueIterators(a...)
}
// TagValueSeriesIterator returns a series iterator for a single tag value.
func (fs *FileSet) TagValueSeriesIterator(name, key, value []byte) SeriesIterator {
a := make([]SeriesIterator, 0, len(fs.files))
for _, f := range fs.files {
itr := f.TagValueSeriesIterator(name, key, value)
if itr != nil {
a = append(a, itr)
}
}
return FilterUndeletedSeriesIterator(MergeSeriesIterators(a...))
}
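// Illustrative dispatch table for MatchTagValueSeriesIterator below (derived
// from its four helper methods; not part of the original source):
//
//	matches | regex matches "" | result
//	--------+------------------+----------------------------------------------
//	true    | true             | measurement series minus non-matching values
//	true    | false            | union of series for matching values
//	false   | true             | union of series for non-matching values
//	false   | false            | measurement series minus matching values
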
// MatchTagValueSeriesIterator returns a series iterator for tags which match value.
// If matches is false, it returns an iterator for tags which do not match value.
func (fs *FileSet) MatchTagValueSeriesIterator(name, key []byte, value *regexp.Regexp, matches bool) SeriesIterator {
matchEmpty := value.MatchString("")
if matches {
if matchEmpty {
return FilterUndeletedSeriesIterator(fs.matchTagValueEqualEmptySeriesIterator(name, key, value))
}
return FilterUndeletedSeriesIterator(fs.matchTagValueEqualNotEmptySeriesIterator(name, key, value))
}
if matchEmpty {
return FilterUndeletedSeriesIterator(fs.matchTagValueNotEqualEmptySeriesIterator(name, key, value))
}
return FilterUndeletedSeriesIterator(fs.matchTagValueNotEqualNotEmptySeriesIterator(name, key, value))
}
func (fs *FileSet) matchTagValueEqualEmptySeriesIterator(name, key []byte, value *regexp.Regexp) SeriesIterator {
vitr := fs.TagValueIterator(name, key)
if vitr == nil {
return fs.MeasurementSeriesIterator(name)
}
var itrs []SeriesIterator
for e := vitr.Next(); e != nil; e = vitr.Next() {
if !value.Match(e.Value()) {
itrs = append(itrs, fs.TagValueSeriesIterator(name, key, e.Value()))
}
}
return DifferenceSeriesIterators(
fs.MeasurementSeriesIterator(name),
MergeSeriesIterators(itrs...),
)
}
func (fs *FileSet) matchTagValueEqualNotEmptySeriesIterator(name, key []byte, value *regexp.Regexp) SeriesIterator {
vitr := fs.TagValueIterator(name, key)
if vitr == nil {
return nil
}
var itrs []SeriesIterator
for e := vitr.Next(); e != nil; e = vitr.Next() {
if value.Match(e.Value()) {
itrs = append(itrs, fs.TagValueSeriesIterator(name, key, e.Value()))
}
}
return MergeSeriesIterators(itrs...)
}
func (fs *FileSet) matchTagValueNotEqualEmptySeriesIterator(name, key []byte, value *regexp.Regexp) SeriesIterator {
vitr := fs.TagValueIterator(name, key)
if vitr == nil {
return nil
}
var itrs []SeriesIterator
for e := vitr.Next(); e != nil; e = vitr.Next() {
if !value.Match(e.Value()) {
itrs = append(itrs, fs.TagValueSeriesIterator(name, key, e.Value()))
}
}
return MergeSeriesIterators(itrs...)
}
func (fs *FileSet) matchTagValueNotEqualNotEmptySeriesIterator(name, key []byte, value *regexp.Regexp) SeriesIterator {
vitr := fs.TagValueIterator(name, key)
if vitr == nil {
return fs.MeasurementSeriesIterator(name)
}
var itrs []SeriesIterator
for e := vitr.Next(); e != nil; e = vitr.Next() {
if value.Match(e.Value()) {
itrs = append(itrs, fs.TagValueSeriesIterator(name, key, e.Value()))
}
}
return DifferenceSeriesIterators(
fs.MeasurementSeriesIterator(name),
MergeSeriesIterators(itrs...),
)
}
func (fs *FileSet) MeasurementNamesByExpr(expr influxql.Expr) ([][]byte, error) {
// Return filtered list if expression exists.
if expr != nil {
return fs.measurementNamesByExpr(expr)
}
// Iterate over all measurements if no condition exists.
var names [][]byte
itr := fs.MeasurementIterator()
for e := itr.Next(); e != nil; e = itr.Next() {
names = append(names, e.Name())
}
return names, nil
}
func (fs *FileSet) measurementNamesByExpr(expr influxql.Expr) ([][]byte, error) {
if expr == nil {
return nil, nil
}
switch e := expr.(type) {
case *influxql.BinaryExpr:
switch e.Op {
case influxql.EQ, influxql.NEQ, influxql.EQREGEX, influxql.NEQREGEX:
tag, ok := e.LHS.(*influxql.VarRef)
if !ok {
return nil, fmt.Errorf("left side of '%s' must be a tag key", e.Op.String())
}
// Retrieve value or regex expression from RHS.
var value string
var regex *regexp.Regexp
if influxql.IsRegexOp(e.Op) {
re, ok := e.RHS.(*influxql.RegexLiteral)
if !ok {
return nil, fmt.Errorf("right side of '%s' must be a regular expression", e.Op.String())
}
regex = re.Val
} else {
s, ok := e.RHS.(*influxql.StringLiteral)
if !ok {
return nil, fmt.Errorf("right side of '%s' must be a tag value string", e.Op.String())
}
value = s.Val
}
// Match on name, if specified.
if tag.Val == "_name" {
return fs.measurementNamesByNameFilter(e.Op, value, regex), nil
} else if influxql.IsSystemName(tag.Val) {
return nil, nil
}
return fs.measurementNamesByTagFilter(e.Op, tag.Val, value, regex), nil
case influxql.OR, influxql.AND:
lhs, err := fs.measurementNamesByExpr(e.LHS)
if err != nil {
return nil, err
}
rhs, err := fs.measurementNamesByExpr(e.RHS)
if err != nil {
return nil, err
}
if e.Op == influxql.OR {
return bytesutil.Union(lhs, rhs), nil
}
return bytesutil.Intersect(lhs, rhs), nil
default:
return nil, fmt.Errorf("invalid tag comparison operator")
}
case *influxql.ParenExpr:
return fs.measurementNamesByExpr(e.Expr)
default:
return nil, fmt.Errorf("%#v", expr)
}
}
// measurementNamesByNameFilter returns matching measurement names in sorted order.
func (fs *FileSet) measurementNamesByNameFilter(op influxql.Token, val string, regex *regexp.Regexp) [][]byte {
var names [][]byte
itr := fs.MeasurementIterator()
for e := itr.Next(); e != nil; e = itr.Next() {
var matched bool
switch op {
case influxql.EQ:
matched = string(e.Name()) == val
case influxql.NEQ:
matched = string(e.Name()) != val
case influxql.EQREGEX:
matched = regex.Match(e.Name())
case influxql.NEQREGEX:
matched = !regex.Match(e.Name())
}
if matched {
names = append(names, e.Name())
}
}
bytesutil.Sort(names)
return names
}
func (fs *FileSet) measurementNamesByTagFilter(op influxql.Token, key, val string, regex *regexp.Regexp) [][]byte {
var names [][]byte
mitr := fs.MeasurementIterator()
for me := mitr.Next(); me != nil; me = mitr.Next() {
// If the operator is non-regex, only check the specified value.
var tagMatch bool
if op == influxql.EQ || op == influxql.NEQ {
if fs.HasTagValue(me.Name(), []byte(key), []byte(val)) {
tagMatch = true
}
} else {
// Else, the operator is a regex and we have to check all tag
// values against the regular expression.
vitr := fs.TagValueIterator(me.Name(), []byte(key))
if vitr != nil {
for ve := vitr.Next(); ve != nil; ve = vitr.Next() {
if regex.Match(ve.Value()) {
tagMatch = true
break
}
}
}
}
//
// XNOR gate
//
// tags match | operation is EQ | measurement matches
// --------------------------------------------------
// True | True | True
// True | False | False
// False | True | False
// False | False | True
if tagMatch == (op == influxql.EQ || op == influxql.EQREGEX) {
names = append(names, me.Name())
continue
}
}
bytesutil.Sort(names)
return names
}
// HasSeries returns true if the series exists and is not tombstoned.
func (fs *FileSet) HasSeries(name []byte, tags models.Tags, buf []byte) bool {
for _, f := range fs.files {
if exists, tombstoned := f.HasSeries(name, tags, buf); exists {
return !tombstoned
}
}
return false
}
// FilterNamesTags filters out any series which already exist. It modifies the
// provided slices of names and tags.
func (fs *FileSet) FilterNamesTags(names [][]byte, tagsSlice []models.Tags) ([][]byte, []models.Tags) {
buf := make([]byte, 4096)
// Filter across all log files.
// Log files obtain a read lock and should be done in bulk for performance.
for _, f := range fs.LogFiles() {
names, tagsSlice = f.FilterNamesTags(names, tagsSlice)
}
// Filter across remaining index files.
indexFiles := fs.IndexFiles()
newNames, newTagsSlice := names[:0], tagsSlice[:0]
for i := range names {
name, tags := names[i], tagsSlice[i]
currentLevel, skipLevel := -1, false
var exists, tombstoned bool
for j := 0; j < len(indexFiles); j++ {
f := indexFiles[j]
// Check for existence on the level when it changes.
if level := f.Level(); currentLevel != level {
currentLevel, skipLevel = level, false
if filter := fs.filters[level]; filter != nil {
if !filter.Contains(AppendSeriesKey(buf[:0], name, tags)) {
skipLevel = true
}
}
}
// Skip file if in level where it doesn't exist.
if skipLevel {
continue
}
// Stop once we find the series in a file.
if exists, tombstoned = f.HasSeries(name, tags, buf); exists {
break
}
}
// If the series doesn't exist or it has been tombstoned then add it.
if !exists || tombstoned {
newNames = append(newNames, name)
newTagsSlice = append(newTagsSlice, tags)
}
}
return newNames, newTagsSlice
}
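// Illustrative use of the sketch accessors below (a sketch, assuming
// estimator.Sketch exposes a Count method, as used elsewhere in this
// repository): the live count is estimated as created minus tombstoned.
//
//	s, ts, err := fs.SeriesSketches()
//	if err == nil {
//		live := int64(s.Count()) - int64(ts.Count())
//		_ = live
//	}
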
// SeriesSketches returns the merged series sketches for the FileSet.
func (fs *FileSet) SeriesSketches() (estimator.Sketch, estimator.Sketch, error) {
sketch, tsketch := hll.NewDefaultPlus(), hll.NewDefaultPlus()
// Iterate over all the files and merge the sketches into the result.
for _, f := range fs.files {
if err := f.MergeSeriesSketches(sketch, tsketch); err != nil {
return nil, nil, err
}
}
return sketch, tsketch, nil
}
// MeasurementsSketches returns the merged measurement sketches for the FileSet.
func (fs *FileSet) MeasurementsSketches() (estimator.Sketch, estimator.Sketch, error) {
sketch, tsketch := hll.NewDefaultPlus(), hll.NewDefaultPlus()
// Iterate over all the files and merge the sketches into the result.
for _, f := range fs.files {
if err := f.MergeMeasurementsSketches(sketch, tsketch); err != nil {
return nil, nil, err
}
}
return sketch, tsketch, nil
}
// MeasurementSeriesByExprIterator returns a series iterator for a measurement
// that is filtered by expr. If expr only contains time expressions then this
// call is equivalent to MeasurementSeriesIterator().
func (fs *FileSet) MeasurementSeriesByExprIterator(name []byte, expr influxql.Expr, fieldset *tsdb.MeasurementFieldSet) (SeriesIterator, error) {
// Return all series for the measurement if there are no tag expressions.
if expr == nil || influxql.OnlyTimeExpr(expr) {
return fs.MeasurementSeriesIterator(name), nil
}
return fs.seriesByExprIterator(name, expr, fieldset.CreateFieldsIfNotExists(name))
}
// MeasurementSeriesKeysByExpr returns a list of series keys matching expr.
func (fs *FileSet) MeasurementSeriesKeysByExpr(name []byte, expr influxql.Expr, fieldset *tsdb.MeasurementFieldSet) ([][]byte, error) {
// Create iterator for all matching series.
itr, err := fs.MeasurementSeriesByExprIterator(name, expr, fieldset)
if err != nil {
return nil, err
} else if itr == nil {
return nil, nil
}
// Iterate over all series and generate keys.
var keys [][]byte
for e := itr.Next(); e != nil; e = itr.Next() {
// Check for unsupported field filters.
// Any remaining filters mean there were fields (e.g., `WHERE value = 1.2`).
if e.Expr() != nil {
return nil, errors.New("fields not supported in WHERE clause during deletion")
}
keys = append(keys, models.MakeKey(e.Name(), e.Tags()))
}
return keys, nil
}
func (fs *FileSet) seriesByExprIterator(name []byte, expr influxql.Expr, mf *tsdb.MeasurementFields) (SeriesIterator, error) {
switch expr := expr.(type) {
case *influxql.BinaryExpr:
switch expr.Op {
case influxql.AND, influxql.OR:
// Get the series IDs and filter expressions for the LHS.
litr, err := fs.seriesByExprIterator(name, expr.LHS, mf)
if err != nil {
return nil, err
}
// Get the series IDs and filter expressions for the RHS.
ritr, err := fs.seriesByExprIterator(name, expr.RHS, mf)
if err != nil {
return nil, err
}
// Intersect iterators if expression is "AND".
if expr.Op == influxql.AND {
return IntersectSeriesIterators(litr, ritr), nil
}
// Union iterators if expression is "OR".
return UnionSeriesIterators(litr, ritr), nil
default:
return fs.seriesByBinaryExprIterator(name, expr, mf)
}
case *influxql.ParenExpr:
return fs.seriesByExprIterator(name, expr.Expr, mf)
default:
return nil, nil
}
}
// seriesByBinaryExprIterator returns a series iterator and a filtering expression.
func (fs *FileSet) seriesByBinaryExprIterator(name []byte, n *influxql.BinaryExpr, mf *tsdb.MeasurementFields) (SeriesIterator, error) {
// If this binary expression has another binary expression, then this
// is some expression math and we should just pass it to the underlying query.
if _, ok := n.LHS.(*influxql.BinaryExpr); ok {
return newSeriesExprIterator(fs.MeasurementSeriesIterator(name), n), nil
} else if _, ok := n.RHS.(*influxql.BinaryExpr); ok {
return newSeriesExprIterator(fs.MeasurementSeriesIterator(name), n), nil
}
// Retrieve the variable reference from the correct side of the expression.
key, ok := n.LHS.(*influxql.VarRef)
value := n.RHS
if !ok {
key, ok = n.RHS.(*influxql.VarRef)
if !ok {
return nil, fmt.Errorf("invalid expression: %s", n.String())
}
value = n.LHS
}
// For time literals, return all series and "true" as the filter.
if _, ok := value.(*influxql.TimeLiteral); ok || key.Val == "time" {
return newSeriesExprIterator(fs.MeasurementSeriesIterator(name), &influxql.BooleanLiteral{Val: true}), nil
}
// For fields, return all series from this measurement.
if key.Val != "_name" && ((key.Type == influxql.Unknown && mf.HasField(key.Val)) || key.Type == influxql.AnyField || (key.Type != influxql.Tag && key.Type != influxql.Unknown)) {
return newSeriesExprIterator(fs.MeasurementSeriesIterator(name), n), nil
} else if value, ok := value.(*influxql.VarRef); ok {
// Check if the RHS is a variable and if it is a field.
if value.Val != "_name" && ((value.Type == influxql.Unknown && mf.HasField(value.Val)) || key.Type == influxql.AnyField || (value.Type != influxql.Tag && value.Type != influxql.Unknown)) {
return newSeriesExprIterator(fs.MeasurementSeriesIterator(name), n), nil
}
}
// Create iterator based on value type.
switch value := value.(type) {
case *influxql.StringLiteral:
return fs.seriesByBinaryExprStringIterator(name, []byte(key.Val), []byte(value.Val), n.Op)
case *influxql.RegexLiteral:
return fs.seriesByBinaryExprRegexIterator(name, []byte(key.Val), value.Val, n.Op)
case *influxql.VarRef:
return fs.seriesByBinaryExprVarRefIterator(name, []byte(key.Val), value, n.Op)
default:
if n.Op == influxql.NEQ || n.Op == influxql.NEQREGEX {
return fs.MeasurementSeriesIterator(name), nil
}
return nil, nil
}
}
func (fs *FileSet) seriesByBinaryExprStringIterator(name, key, value []byte, op influxql.Token) (SeriesIterator, error) {
// Special handling for "_name" to match measurement name.
if bytes.Equal(key, []byte("_name")) {
if (op == influxql.EQ && bytes.Equal(value, name)) || (op == influxql.NEQ && !bytes.Equal(value, name)) {
return fs.MeasurementSeriesIterator(name), nil
}
return nil, nil
}
if op == influxql.EQ {
// Match a specific value.
if len(value) != 0 {
return fs.TagValueSeriesIterator(name, key, value), nil
}
// Return all measurement series that have no values from this tag key.
return DifferenceSeriesIterators(
fs.MeasurementSeriesIterator(name),
fs.TagKeySeriesIterator(name, key),
), nil
}
// Return all measurement series without this tag value.
if len(value) != 0 {
return DifferenceSeriesIterators(
fs.MeasurementSeriesIterator(name),
fs.TagValueSeriesIterator(name, key, value),
), nil
}
// Return all series across all values of this tag key.
return fs.TagKeySeriesIterator(name, key), nil
}
func (fs *FileSet) seriesByBinaryExprRegexIterator(name, key []byte, value *regexp.Regexp, op influxql.Token) (SeriesIterator, error) {
// Special handling for "_name" to match measurement name.
if bytes.Equal(key, []byte("_name")) {
match := value.Match(name)
if (op == influxql.EQREGEX && match) || (op == influxql.NEQREGEX && !match) {
return newSeriesExprIterator(fs.MeasurementSeriesIterator(name), &influxql.BooleanLiteral{Val: true}), nil
}
return nil, nil
}
return fs.MatchTagValueSeriesIterator(name, key, value, op == influxql.EQREGEX), nil
}
func (fs *FileSet) seriesByBinaryExprVarRefIterator(name, key []byte, value *influxql.VarRef, op influxql.Token) (SeriesIterator, error) {
if op == influxql.EQ {
return IntersectSeriesIterators(
fs.TagKeySeriesIterator(name, key),
fs.TagKeySeriesIterator(name, []byte(value.Val)),
), nil
}
return DifferenceSeriesIterators(
fs.TagKeySeriesIterator(name, key),
fs.TagKeySeriesIterator(name, []byte(value.Val)),
), nil
}
// buildFilters builds a series existence filter for each compaction level.
func (fs *FileSet) buildFilters() error {
if len(fs.levels) == 0 {
fs.filters = nil
return nil
}
// Generate filters for each level.
fs.filters = make([]*bloom.Filter, len(fs.levels))
// Merge filters at each level.
for _, f := range fs.files {
level := f.Level()
// Skip if file has no bloom filter.
if f.Filter() == nil {
continue
}
// Initialize a filter if it doesn't exist.
if fs.filters[level] == nil {
lvl := fs.levels[level]
fs.filters[level] = bloom.NewFilter(lvl.M, lvl.K)
}
// Merge filter.
if err := fs.filters[level].Merge(f.Filter()); err != nil {
return err
}
}
return nil
}
// File represents a log or index file.
type File interface {
Close() error
Path() string
ID() int
Level() int
Measurement(name []byte) MeasurementElem
MeasurementIterator() MeasurementIterator
HasSeries(name []byte, tags models.Tags, buf []byte) (exists, tombstoned bool)
Series(name []byte, tags models.Tags) SeriesElem
SeriesN() uint64
TagKey(name, key []byte) TagKeyElem
TagKeyIterator(name []byte) TagKeyIterator
TagValue(name, key, value []byte) TagValueElem
TagValueIterator(name, key []byte) TagValueIterator
// Series iteration.
SeriesIterator() SeriesIterator
MeasurementSeriesIterator(name []byte) SeriesIterator
TagKeySeriesIterator(name, key []byte) SeriesIterator
TagValueSeriesIterator(name, key, value []byte) SeriesIterator
// Sketches for cardinality estimation
MergeSeriesSketches(s, t estimator.Sketch) error
MergeMeasurementsSketches(s, t estimator.Sketch) error
// Series existence bloom filter.
Filter() *bloom.Filter
// Reference counting.
Retain()
Release()
}
type Files []File
func (a Files) IDs() []int {
ids := make([]int, len(a))
for i := range a {
ids[i] = a[i].ID()
}
return ids
}


@@ -0,0 +1,324 @@
package tsi1_test
import (
"fmt"
"testing"
"github.com/influxdata/influxdb/models"
)
// Ensure fileset can return an iterator over all series in the index.
func TestFileSet_SeriesIterator(t *testing.T) {
idx := MustOpenIndex()
defer idx.Close()
// Create initial set of series.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
{Name: []byte("mem"), Tags: models.NewTags(map[string]string{"region": "east"})},
}); err != nil {
t.Fatal(err)
}
// Verify initial set of series.
idx.Run(t, func(t *testing.T) {
fs := idx.RetainFileSet()
defer fs.Release()
itr := fs.SeriesIterator()
if itr == nil {
t.Fatal("expected iterator")
}
if e := itr.Next(); string(e.Name()) != `cpu` || e.Tags().String() != `[{region east}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); string(e.Name()) != `cpu` || e.Tags().String() != `[{region west}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); string(e.Name()) != `mem` || e.Tags().String() != `[{region east}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil series: %s/%s", e.Name(), e.Tags().String())
}
})
// Add more series.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("disk")},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "north"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
}); err != nil {
t.Fatal(err)
}
// Verify additional series.
idx.Run(t, func(t *testing.T) {
fs := idx.RetainFileSet()
defer fs.Release()
itr := fs.SeriesIterator()
if itr == nil {
t.Fatal("expected iterator")
}
if e := itr.Next(); string(e.Name()) != `cpu` || e.Tags().String() != `[{region east}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); string(e.Name()) != `cpu` || e.Tags().String() != `[{region north}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); string(e.Name()) != `cpu` || e.Tags().String() != `[{region west}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); string(e.Name()) != `disk` || len(e.Tags()) != 0 {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); string(e.Name()) != `mem` || e.Tags().String() != `[{region east}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil series: %s/%s", e.Name(), e.Tags().String())
}
})
}
// Ensure fileset can return an iterator over all series for one measurement.
func TestFileSet_MeasurementSeriesIterator(t *testing.T) {
idx := MustOpenIndex()
defer idx.Close()
// Create initial set of series.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
{Name: []byte("mem"), Tags: models.NewTags(map[string]string{"region": "east"})},
}); err != nil {
t.Fatal(err)
}
// Verify initial set of series.
idx.Run(t, func(t *testing.T) {
fs := idx.RetainFileSet()
defer fs.Release()
itr := fs.MeasurementSeriesIterator([]byte("cpu"))
if itr == nil {
t.Fatal("expected iterator")
}
if e := itr.Next(); string(e.Name()) != `cpu` || e.Tags().String() != `[{region east}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); string(e.Name()) != `cpu` || e.Tags().String() != `[{region west}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil series: %s/%s", e.Name(), e.Tags().String())
}
})
// Add more series.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("disk")},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "north"})},
}); err != nil {
t.Fatal(err)
}
// Verify additional series.
idx.Run(t, func(t *testing.T) {
fs := idx.RetainFileSet()
defer fs.Release()
itr := fs.MeasurementSeriesIterator([]byte("cpu"))
if itr == nil {
t.Fatalf("expected iterator")
}
if e := itr.Next(); string(e.Name()) != `cpu` || e.Tags().String() != `[{region east}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); string(e.Name()) != `cpu` || e.Tags().String() != `[{region north}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); string(e.Name()) != `cpu` || e.Tags().String() != `[{region west}]` {
t.Fatalf("unexpected series: %s/%s", e.Name(), e.Tags().String())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil series: %s/%s", e.Name(), e.Tags().String())
}
})
}
// Ensure fileset can return an iterator over all measurements for the index.
func TestFileSet_MeasurementIterator(t *testing.T) {
idx := MustOpenIndex()
defer idx.Close()
// Create initial set of series.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("cpu")},
{Name: []byte("mem")},
}); err != nil {
t.Fatal(err)
}
// Verify initial set of series.
idx.Run(t, func(t *testing.T) {
fs := idx.RetainFileSet()
defer fs.Release()
itr := fs.MeasurementIterator()
if itr == nil {
t.Fatal("expected iterator")
}
if e := itr.Next(); string(e.Name()) != `cpu` {
t.Fatalf("unexpected measurement: %s", e.Name())
} else if e := itr.Next(); string(e.Name()) != `mem` {
t.Fatalf("unexpected measurement: %s", e.Name())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil measurement: %s", e.Name())
}
})
// Add more series.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("disk"), Tags: models.NewTags(map[string]string{"foo": "bar"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "north", "x": "y"})},
}); err != nil {
t.Fatal(err)
}
// Verify additional series.
idx.Run(t, func(t *testing.T) {
fs := idx.RetainFileSet()
defer fs.Release()
itr := fs.MeasurementIterator()
if itr == nil {
t.Fatal("expected iterator")
}
if e := itr.Next(); string(e.Name()) != `cpu` {
t.Fatalf("unexpected measurement: %s", e.Name())
} else if e := itr.Next(); string(e.Name()) != `disk` {
t.Fatalf("unexpected measurement: %s", e.Name())
} else if e := itr.Next(); string(e.Name()) != `mem` {
t.Fatalf("unexpected measurement: %s", e.Name())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil measurement: %s", e.Name())
}
})
}
// Ensure fileset can return an iterator over all keys for one measurement.
func TestFileSet_TagKeyIterator(t *testing.T) {
idx := MustOpenIndex()
defer idx.Close()
// Create initial set of series.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west", "type": "gpu"})},
{Name: []byte("mem"), Tags: models.NewTags(map[string]string{"region": "east", "misc": "other"})},
}); err != nil {
t.Fatal(err)
}
// Verify initial set of series.
idx.Run(t, func(t *testing.T) {
fs := idx.RetainFileSet()
defer fs.Release()
itr := fs.TagKeyIterator([]byte("cpu"))
if itr == nil {
t.Fatalf("expected iterator")
}
if e := itr.Next(); string(e.Key()) != `region` {
t.Fatalf("unexpected key: %s", e.Key())
} else if e := itr.Next(); string(e.Key()) != `type` {
t.Fatalf("unexpected key: %s", e.Key())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil key: %s", e.Key())
}
})
// Add more series.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("disk"), Tags: models.NewTags(map[string]string{"foo": "bar"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "north", "x": "y"})},
}); err != nil {
t.Fatal(err)
}
// Verify additional series.
idx.Run(t, func(t *testing.T) {
fs := idx.RetainFileSet()
defer fs.Release()
itr := fs.TagKeyIterator([]byte("cpu"))
if itr == nil {
t.Fatal("expected iterator")
}
if e := itr.Next(); string(e.Key()) != `region` {
t.Fatalf("unexpected key: %s", e.Key())
} else if e := itr.Next(); string(e.Key()) != `type` {
t.Fatalf("unexpected key: %s", e.Key())
} else if e := itr.Next(); string(e.Key()) != `x` {
t.Fatalf("unexpected key: %s", e.Key())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil key: %s", e.Key())
}
})
}
var (
byteSliceResult [][]byte
tagsSliceResult []models.Tags
)
func BenchmarkFileset_FilterNamesTags(b *testing.B) {
idx := MustOpenIndex()
defer idx.Close()
allNames := make([][]byte, 0, 2000*1000)
allTags := make([]models.Tags, 0, 2000*1000)
for i := 0; i < 2000; i++ {
for j := 0; j < 1000; j++ {
name := []byte(fmt.Sprintf("measurement-%d", i))
tags := models.NewTags(map[string]string{"host": fmt.Sprintf("server-%d", j)})
allNames = append(allNames, name)
allTags = append(allTags, tags)
}
}
if err := idx.CreateSeriesListIfNotExists(nil, allNames, allTags); err != nil {
b.Fatal(err)
}
// idx.CheckFastCompaction()
fs := idx.RetainFileSet()
defer fs.Release()
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer()
names := [][]byte{
[]byte("foo"),
[]byte("measurement-222"), // filtered
[]byte("measurement-222"), // kept (tags won't match)
[]byte("measurements-1"),
[]byte("measurement-900"), // filtered
[]byte("measurement-44444"),
[]byte("bar"),
}
tags := []models.Tags{
nil,
models.NewTags(map[string]string{"host": "server-297"}), // filtered
models.NewTags(map[string]string{"host": "wrong"}),
nil,
models.NewTags(map[string]string{"host": "server-1026"}), // filtered
models.NewTags(map[string]string{"host": "server-23"}), // kept (measurement won't match)
models.NewTags(map[string]string{"host": "zoo"}),
}
b.StartTimer()
byteSliceResult, tagsSliceResult = fs.FilterNamesTags(names, tags)
}
}

File diff suppressed because it is too large


@@ -0,0 +1,439 @@
package tsi1
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"sync"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/pkg/bloom"
"github.com/influxdata/influxdb/pkg/estimator"
"github.com/influxdata/influxdb/pkg/mmap"
)
// IndexFileVersion is the current TSI1 index file version.
const IndexFileVersion = 1
// FileSignature represents a magic number at the header of the index file.
const FileSignature = "TSI1"
// IndexFile field size constants.
const (
// IndexFile trailer fields
IndexFileVersionSize = 2
SeriesBlockOffsetSize = 8
SeriesBlockSizeSize = 8
MeasurementBlockOffsetSize = 8
MeasurementBlockSizeSize = 8
IndexFileTrailerSize = IndexFileVersionSize +
SeriesBlockOffsetSize +
SeriesBlockSizeSize +
MeasurementBlockOffsetSize +
MeasurementBlockSizeSize
)
// IndexFile errors.
var (
ErrInvalidIndexFile = errors.New("invalid index file")
ErrUnsupportedIndexFileVersion = errors.New("unsupported index file version")
)
// IndexFile represents a collection of measurement, tag, and series data.
type IndexFile struct {
wg sync.WaitGroup // ref count
data []byte
// Components
sblk SeriesBlock
tblks map[string]*TagBlock // tag blocks by measurement name
mblk MeasurementBlock
// Sortable identifier & filepath to the log file.
level int
id int
// Counters
seriesN int64 // Number of unique series in this indexFile.
// Compaction tracking.
mu sync.RWMutex
compacting bool
// Path to data file.
path string
}
// NewIndexFile returns a new instance of IndexFile.
func NewIndexFile() *IndexFile {
return &IndexFile{}
}
// Open memory maps the data file at the file's path.
func (f *IndexFile) Open() error {
// Extract identifier from path name.
f.id, f.level = ParseFilename(f.Path())
data, err := mmap.Map(f.Path())
if err != nil {
return err
}
return f.UnmarshalBinary(data)
}
// Close unmaps the data file.
func (f *IndexFile) Close() error {
// Wait until all references are released.
f.wg.Wait()
f.sblk = SeriesBlock{}
f.tblks = nil
f.mblk = MeasurementBlock{}
f.seriesN = 0
return mmap.Unmap(f.data)
}
// ID returns the file sequence identifier.
func (f *IndexFile) ID() int { return f.id }
// Path returns the file path.
func (f *IndexFile) Path() string { return f.path }
// SetPath sets the file's path.
func (f *IndexFile) SetPath(path string) { f.path = path }
// Level returns the compaction level for the file.
func (f *IndexFile) Level() int { return f.level }
// Filter returns the series existence filter for the file.
func (f *IndexFile) Filter() *bloom.Filter { return f.sblk.filter }
// Retain adds a reference count to the file.
func (f *IndexFile) Retain() { f.wg.Add(1) }
// Release removes a reference count from the file.
func (f *IndexFile) Release() { f.wg.Done() }
// Size returns the size of the index file, in bytes.
func (f *IndexFile) Size() int64 { return int64(len(f.data)) }
// Compacting returns true if the file is being compacted.
func (f *IndexFile) Compacting() bool {
f.mu.RLock()
v := f.compacting
f.mu.RUnlock()
return v
}
// setCompacting sets whether the index file is being compacted.
func (f *IndexFile) setCompacting(v bool) {
f.mu.Lock()
f.compacting = v
f.mu.Unlock()
}
// UnmarshalBinary opens an index from data.
// The byte slice is retained, so the underlying data must remain valid
// (e.g. memory-mapped) for the life of the index file.
func (f *IndexFile) UnmarshalBinary(data []byte) error {
// Ensure magic number exists at the beginning.
if len(data) < len(FileSignature) {
return io.ErrShortBuffer
} else if !bytes.Equal(data[:len(FileSignature)], []byte(FileSignature)) {
return ErrInvalidIndexFile
}
// Read index file trailer.
t, err := ReadIndexFileTrailer(data)
if err != nil {
return err
}
// Slice measurement block data.
buf := data[t.MeasurementBlock.Offset:]
buf = buf[:t.MeasurementBlock.Size]
// Unmarshal measurement block.
if err := f.mblk.UnmarshalBinary(buf); err != nil {
return err
}
// Unmarshal each tag block.
f.tblks = make(map[string]*TagBlock)
itr := f.mblk.Iterator()
for m := itr.Next(); m != nil; m = itr.Next() {
e := m.(*MeasurementBlockElem)
// Slice tag block data.
buf := data[e.tagBlock.offset:]
buf = buf[:e.tagBlock.size]
// Unmarshal tag block.
var tblk TagBlock
if err := tblk.UnmarshalBinary(buf); err != nil {
return err
}
f.tblks[string(e.name)] = &tblk
}
// Slice series list data.
buf = data[t.SeriesBlock.Offset:]
buf = buf[:t.SeriesBlock.Size]
// Unmarshal series list.
if err := f.sblk.UnmarshalBinary(buf); err != nil {
return err
}
// Save reference to entire data block.
f.data = data
return nil
}
// Measurement returns a measurement element.
func (f *IndexFile) Measurement(name []byte) MeasurementElem {
e, ok := f.mblk.Elem(name)
if !ok {
return nil
}
return &e
}
// MeasurementN returns the number of measurements in the file.
func (f *IndexFile) MeasurementN() (n uint64) {
mitr := f.mblk.Iterator()
for me := mitr.Next(); me != nil; me = mitr.Next() {
n++
}
return n
}
// TagValueIterator returns a value iterator for a tag key, or nil if the
// measurement or key does not exist.
func (f *IndexFile) TagValueIterator(name, key []byte) TagValueIterator {
tblk := f.tblks[string(name)]
if tblk == nil {
return nil
}
// Find key element.
ke := tblk.TagKeyElem(key)
if ke == nil {
return nil
}
// Return an iterator over the key's values.
return ke.TagValueIterator()
}
// TagKeySeriesIterator returns a series iterator for a tag key, merging the
// series of every value under that key. Returns nil if the measurement or
// key does not exist.
func (f *IndexFile) TagKeySeriesIterator(name, key []byte) SeriesIterator {
tblk := f.tblks[string(name)]
if tblk == nil {
return nil
}
// Find key element.
ke := tblk.TagKeyElem(key)
if ke == nil {
return nil
}
// Merge all value series iterators together.
vitr := ke.TagValueIterator()
var itrs []SeriesIterator
for ve := vitr.Next(); ve != nil; ve = vitr.Next() {
sitr := &rawSeriesIDIterator{data: ve.(*TagBlockValueElem).series.data}
itrs = append(itrs, newSeriesDecodeIterator(&f.sblk, sitr))
}
return MergeSeriesIterators(itrs...)
}
// TagValueSeriesIterator returns a series iterator for a tag value, or nil
// if the measurement, key, or value does not exist.
func (f *IndexFile) TagValueSeriesIterator(name, key, value []byte) SeriesIterator {
tblk := f.tblks[string(name)]
if tblk == nil {
return nil
}
// Find value element.
ve := tblk.TagValueElem(key, value)
if ve == nil {
return nil
}
// Create an iterator over value's series.
return newSeriesDecodeIterator(
&f.sblk,
&rawSeriesIDIterator{
n: ve.(*TagBlockValueElem).series.n,
data: ve.(*TagBlockValueElem).series.data,
},
)
}
// TagKey returns a tag key.
func (f *IndexFile) TagKey(name, key []byte) TagKeyElem {
tblk := f.tblks[string(name)]
if tblk == nil {
return nil
}
return tblk.TagKeyElem(key)
}
// TagValue returns a tag value.
func (f *IndexFile) TagValue(name, key, value []byte) TagValueElem {
tblk := f.tblks[string(name)]
if tblk == nil {
return nil
}
return tblk.TagValueElem(key, value)
}
// HasSeries returns flags indicating if the series exists and if it is tombstoned.
func (f *IndexFile) HasSeries(name []byte, tags models.Tags, buf []byte) (exists, tombstoned bool) {
return f.sblk.HasSeries(name, tags, buf)
}
// Series returns the series element for a name/tags combination. The
// element's Deleted method reports whether the series has been tombstoned.
func (f *IndexFile) Series(name []byte, tags models.Tags) SeriesElem {
return f.sblk.Series(name, tags)
}
// TagValueElem returns an element for a measurement/tag/value.
func (f *IndexFile) TagValueElem(name, key, value []byte) TagValueElem {
tblk, ok := f.tblks[string(name)]
if !ok {
return nil
}
return tblk.TagValueElem(key, value)
}
// MeasurementIterator returns an iterator over all measurements.
func (f *IndexFile) MeasurementIterator() MeasurementIterator {
return f.mblk.Iterator()
}
// TagKeyIterator returns an iterator over all tag keys for a measurement.
func (f *IndexFile) TagKeyIterator(name []byte) TagKeyIterator {
blk := f.tblks[string(name)]
if blk == nil {
return nil
}
return blk.TagKeyIterator()
}
// MeasurementSeriesIterator returns an iterator over a measurement's series.
func (f *IndexFile) MeasurementSeriesIterator(name []byte) SeriesIterator {
return &seriesDecodeIterator{
itr: f.mblk.seriesIDIterator(name),
sblk: &f.sblk,
}
}
// MergeMeasurementsSketches merges the index file's measurement sketches into
// the provided sketches.
func (f *IndexFile) MergeMeasurementsSketches(s, t estimator.Sketch) error {
if err := s.Merge(f.mblk.sketch); err != nil {
return err
}
return t.Merge(f.mblk.tSketch)
}
// SeriesN returns the total number of non-tombstoned series for the index file.
func (f *IndexFile) SeriesN() uint64 {
return uint64(f.sblk.seriesN - f.sblk.tombstoneN)
}
// SeriesIterator returns an iterator over all series.
func (f *IndexFile) SeriesIterator() SeriesIterator {
return f.sblk.SeriesIterator()
}
// MergeSeriesSketches merges the index file's series sketches into the provided
// sketches.
func (f *IndexFile) MergeSeriesSketches(s, t estimator.Sketch) error {
if err := s.Merge(f.sblk.sketch); err != nil {
return err
}
return t.Merge(f.sblk.tsketch)
}
// ReadIndexFileTrailer returns the index file trailer from data.
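// The trailer occupies the final IndexFileTrailerSize bytes of the file:
//
//	series block offset (8) | series block size (8) |
//	measurement block offset (8) | measurement block size (8) | version (2)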
func ReadIndexFileTrailer(data []byte) (IndexFileTrailer, error) {
var t IndexFileTrailer
// Read version.
t.Version = int(binary.BigEndian.Uint16(data[len(data)-IndexFileVersionSize:]))
if t.Version != IndexFileVersion {
return t, ErrUnsupportedIndexFileVersion
}
// Slice trailer data.
buf := data[len(data)-IndexFileTrailerSize:]
// Read series list info.
t.SeriesBlock.Offset = int64(binary.BigEndian.Uint64(buf[0:SeriesBlockOffsetSize]))
buf = buf[SeriesBlockOffsetSize:]
t.SeriesBlock.Size = int64(binary.BigEndian.Uint64(buf[0:SeriesBlockSizeSize]))
buf = buf[SeriesBlockSizeSize:]
// Read measurement block info.
t.MeasurementBlock.Offset = int64(binary.BigEndian.Uint64(buf[0:MeasurementBlockOffsetSize]))
buf = buf[MeasurementBlockOffsetSize:]
t.MeasurementBlock.Size = int64(binary.BigEndian.Uint64(buf[0:MeasurementBlockSizeSize]))
buf = buf[MeasurementBlockSizeSize:]
return t, nil
}
// IndexFileTrailer represents meta data written to the end of the index file.
type IndexFileTrailer struct {
Version int
SeriesBlock struct {
Offset int64
Size int64
}
MeasurementBlock struct {
Offset int64
Size int64
}
}
// WriteTo writes the trailer to w.
func (t *IndexFileTrailer) WriteTo(w io.Writer) (n int64, err error) {
// Write series list info.
if err := writeUint64To(w, uint64(t.SeriesBlock.Offset), &n); err != nil {
return n, err
} else if err := writeUint64To(w, uint64(t.SeriesBlock.Size), &n); err != nil {
return n, err
}
// Write measurement block info.
if err := writeUint64To(w, uint64(t.MeasurementBlock.Offset), &n); err != nil {
return n, err
} else if err := writeUint64To(w, uint64(t.MeasurementBlock.Size), &n); err != nil {
return n, err
}
// Write index file encoding version.
if err := writeUint16To(w, IndexFileVersion, &n); err != nil {
return n, err
}
return n, nil
}
// FormatIndexFileName generates an index filename for the given index.
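// For example, FormatIndexFileName(1, 2) yields "L2-00000001" followed by
// IndexFileExt (e.g. "L2-00000001.tsi", assuming the extension is ".tsi").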
func FormatIndexFileName(id, level int) string {
return fmt.Sprintf("L%d-%08d%s", level, id, IndexFileExt)
}

View File

@@ -0,0 +1,154 @@
package tsi1_test
import (
"bytes"
"testing"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/tsdb/index/tsi1"
)
// Ensure a simple index file can be built and opened.
func TestCreateIndexFile(t *testing.T) {
f, err := CreateIndexFile([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
{Name: []byte("mem"), Tags: models.NewTags(map[string]string{"region": "east"})},
})
if err != nil {
t.Fatal(err)
}
if e := f.TagValueElem([]byte("cpu"), []byte("region"), []byte("west")); e == nil {
t.Fatal("expected element")
} else if n := e.(*tsi1.TagBlockValueElem).SeriesN(); n != 1 {
t.Fatalf("unexpected series count: %d", n)
}
}
// Ensure index file generation can be successfully built.
func TestGenerateIndexFile(t *testing.T) {
// Build generated index file.
f, err := GenerateIndexFile(10, 3, 4)
if err != nil {
t.Fatal(err)
}
// Verify that tag/value series can be fetched.
if e := f.TagValueElem([]byte("measurement0"), []byte("key0"), []byte("value0")); e == nil {
t.Fatal("expected element")
} else if n := e.(*tsi1.TagBlockValueElem).SeriesN(); n == 0 {
t.Fatal("expected series")
}
}
func BenchmarkIndexFile_TagValueSeries(b *testing.B) {
b.Run("M=1,K=2,V=3", func(b *testing.B) {
benchmarkIndexFile_TagValueSeries(b, MustFindOrGenerateIndexFile(1, 2, 3))
})
b.Run("M=10,K=5,V=5", func(b *testing.B) {
benchmarkIndexFile_TagValueSeries(b, MustFindOrGenerateIndexFile(10, 5, 5))
})
b.Run("M=10,K=7,V=5", func(b *testing.B) {
benchmarkIndexFile_TagValueSeries(b, MustFindOrGenerateIndexFile(10, 7, 7))
})
}
func benchmarkIndexFile_TagValueSeries(b *testing.B, idx *tsi1.IndexFile) {
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
if e := idx.TagValueElem([]byte("measurement0"), []byte("key0"), []byte("value0")); e == nil {
b.Fatal("expected element")
} else if e.(*tsi1.TagBlockValueElem).SeriesN() == 0 {
b.Fatal("expected series")
}
}
}
// CreateIndexFile creates an index file with a given set of series.
func CreateIndexFile(series []Series) (*tsi1.IndexFile, error) {
lf, err := CreateLogFile(series)
if err != nil {
return nil, err
}
// Write index file to buffer.
var buf bytes.Buffer
if _, err := lf.CompactTo(&buf, M, K); err != nil {
return nil, err
}
// Load index file from buffer.
var f tsi1.IndexFile
if err := f.UnmarshalBinary(buf.Bytes()); err != nil {
return nil, err
}
return &f, nil
}
// GenerateIndexFile generates an index file from a set of series based on the count arguments.
// Total series returned will equal measurementN * tagN * valueN.
func GenerateIndexFile(measurementN, tagN, valueN int) (*tsi1.IndexFile, error) {
// Generate a new log file first.
lf, err := GenerateLogFile(measurementN, tagN, valueN)
if err != nil {
return nil, err
}
// Compact log file to buffer.
var buf bytes.Buffer
if _, err := lf.CompactTo(&buf, M, K); err != nil {
return nil, err
}
// Load index file from buffer.
var f tsi1.IndexFile
if err := f.UnmarshalBinary(buf.Bytes()); err != nil {
return nil, err
}
return &f, nil
}
func MustGenerateIndexFile(measurementN, tagN, valueN int) *tsi1.IndexFile {
f, err := GenerateIndexFile(measurementN, tagN, valueN)
if err != nil {
panic(err)
}
return f
}
var indexFileCache struct {
MeasurementN int
TagN int
ValueN int
IndexFile *tsi1.IndexFile
}
// MustFindOrGenerateIndexFile returns a cached index file or generates one if it doesn't exist.
func MustFindOrGenerateIndexFile(measurementN, tagN, valueN int) *tsi1.IndexFile {
// Use cache if fields match and the index file has been generated.
if indexFileCache.MeasurementN == measurementN &&
indexFileCache.TagN == tagN &&
indexFileCache.ValueN == valueN &&
indexFileCache.IndexFile != nil {
return indexFileCache.IndexFile
}
// Generate and cache.
indexFileCache.MeasurementN = measurementN
indexFileCache.TagN = tagN
indexFileCache.ValueN = valueN
indexFileCache.IndexFile = MustGenerateIndexFile(measurementN, tagN, valueN)
return indexFileCache.IndexFile
}
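// pow returns x raised to the power y by repeated multiplication,
// e.g. pow(2, 10) == 1024.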
func pow(x, y int) int {
r := 1
for i := 0; i < y; i++ {
r *= x
}
return r
}

View File

@@ -0,0 +1,362 @@
package tsi1
import (
"bufio"
"fmt"
"io"
"os"
"sort"
"time"
"github.com/influxdata/influxdb/pkg/estimator/hll"
"github.com/influxdata/influxdb/pkg/mmap"
)
// IndexFiles represents a layered set of index files.
type IndexFiles []*IndexFile
// IDs returns the ids for all index files.
func (p IndexFiles) IDs() []int {
a := make([]int, len(p))
for i, f := range p {
a[i] = f.ID()
}
return a
}
// Retain adds a reference count to all files.
func (p IndexFiles) Retain() {
for _, f := range p {
f.Retain()
}
}
// Release removes a reference count from all files.
func (p IndexFiles) Release() {
for _, f := range p {
f.Release()
}
}
// Files returns p as a list of File objects.
func (p IndexFiles) Files() []File {
other := make([]File, len(p))
for i, f := range p {
other[i] = f
}
return other
}
// MeasurementNames returns a sorted list of all measurement names for all files.
func (p *IndexFiles) MeasurementNames() [][]byte {
itr := p.MeasurementIterator()
var names [][]byte
for e := itr.Next(); e != nil; e = itr.Next() {
names = append(names, copyBytes(e.Name()))
}
sort.Sort(byteSlices(names))
return names
}
// MeasurementIterator returns an iterator that merges measurements across all files.
func (p IndexFiles) MeasurementIterator() MeasurementIterator {
a := make([]MeasurementIterator, 0, len(p))
for i := range p {
itr := p[i].MeasurementIterator()
if itr == nil {
continue
}
a = append(a, itr)
}
return MergeMeasurementIterators(a...)
}
// TagKeyIterator returns an iterator that merges tag keys across all files.
func (p *IndexFiles) TagKeyIterator(name []byte) (TagKeyIterator, error) {
a := make([]TagKeyIterator, 0, len(*p))
for _, f := range *p {
itr := f.TagKeyIterator(name)
if itr == nil {
continue
}
a = append(a, itr)
}
return MergeTagKeyIterators(a...), nil
}
// SeriesIterator returns an iterator that merges series across all files.
func (p IndexFiles) SeriesIterator() SeriesIterator {
a := make([]SeriesIterator, 0, len(p))
for _, f := range p {
itr := f.SeriesIterator()
if itr == nil {
continue
}
a = append(a, itr)
}
return MergeSeriesIterators(a...)
}
// MeasurementSeriesIterator returns an iterator that merges series across all files.
func (p IndexFiles) MeasurementSeriesIterator(name []byte) SeriesIterator {
a := make([]SeriesIterator, 0, len(p))
for _, f := range p {
itr := f.MeasurementSeriesIterator(name)
if itr == nil {
continue
}
a = append(a, itr)
}
return MergeSeriesIterators(a...)
}
// TagValueSeriesIterator returns an iterator that merges series across all files.
func (p IndexFiles) TagValueSeriesIterator(name, key, value []byte) SeriesIterator {
a := make([]SeriesIterator, 0, len(p))
for i := range p {
itr := p[i].TagValueSeriesIterator(name, key, value)
if itr != nil {
a = append(a, itr)
}
}
return MergeSeriesIterators(a...)
}
// CompactTo merges all index files and writes them to w.
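// The series block is written and re-mapped first so that the subsequent
// tag and measurement blocks can look up series offsets through info.sblk.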
func (p IndexFiles) CompactTo(w io.Writer, m, k uint64) (n int64, err error) {
var t IndexFileTrailer
// Wrap writer in buffered I/O.
bw := bufio.NewWriter(w)
// Setup context object to track shared data for this compaction.
var info indexCompactInfo
info.tagSets = make(map[string]indexTagSetPos)
// Write magic number.
if err := writeTo(bw, []byte(FileSignature), &n); err != nil {
return n, err
}
// Write combined series list.
t.SeriesBlock.Offset = n
if err := p.writeSeriesBlockTo(bw, m, k, &info, &n); err != nil {
return n, err
}
t.SeriesBlock.Size = n - t.SeriesBlock.Offset
// Flush buffer before re-mapping.
if err := bw.Flush(); err != nil {
return n, err
}
// Open series block as memory-mapped data.
sblk, data, err := mapIndexFileSeriesBlock(w)
if data != nil {
defer mmap.Unmap(data)
}
if err != nil {
return n, err
}
info.sblk = sblk
// Write tagset blocks in measurement order.
if err := p.writeTagsetsTo(bw, &info, &n); err != nil {
return n, err
}
// Write measurement block.
t.MeasurementBlock.Offset = n
if err := p.writeMeasurementBlockTo(bw, &info, &n); err != nil {
return n, err
}
t.MeasurementBlock.Size = n - t.MeasurementBlock.Offset
// Write trailer.
nn, err := t.WriteTo(bw)
n += nn
if err != nil {
return n, err
}
// Flush file.
if err := bw.Flush(); err != nil {
return n, err
}
return n, nil
}
func (p IndexFiles) writeSeriesBlockTo(w io.Writer, m, k uint64, info *indexCompactInfo, n *int64) error {
// Estimate series cardinality.
sketch := hll.NewDefaultPlus()
for _, f := range p {
if err := f.MergeSeriesSketches(sketch, sketch); err != nil {
return err
}
}
itr := p.SeriesIterator()
enc := NewSeriesBlockEncoder(w, uint32(sketch.Count()), m, k)
// Write all series.
for e := itr.Next(); e != nil; e = itr.Next() {
if err := enc.Encode(e.Name(), e.Tags(), e.Deleted()); err != nil {
return err
}
}
// Close and flush block.
err := enc.Close()
*n += int64(enc.N())
if err != nil {
return err
}
return nil
}
func (p IndexFiles) writeTagsetsTo(w io.Writer, info *indexCompactInfo, n *int64) error {
mitr := p.MeasurementIterator()
for m := mitr.Next(); m != nil; m = mitr.Next() {
if err := p.writeTagsetTo(w, m.Name(), info, n); err != nil {
return err
}
}
return nil
}
// writeTagsetTo writes a single tagset to w and saves the tagset offset.
func (p IndexFiles) writeTagsetTo(w io.Writer, name []byte, info *indexCompactInfo, n *int64) error {
var seriesKey []byte
kitr, err := p.TagKeyIterator(name)
if err != nil {
return err
}
enc := NewTagBlockEncoder(w)
for ke := kitr.Next(); ke != nil; ke = kitr.Next() {
// Encode key.
if err := enc.EncodeKey(ke.Key(), ke.Deleted()); err != nil {
return err
}
// Iterate over tag values.
vitr := ke.TagValueIterator()
for ve := vitr.Next(); ve != nil; ve = vitr.Next() {
// Merge all series together.
sitr := p.TagValueSeriesIterator(name, ke.Key(), ve.Value())
var seriesIDs []uint32
for se := sitr.Next(); se != nil; se = sitr.Next() {
seriesID, _ := info.sblk.Offset(se.Name(), se.Tags(), seriesKey[:0])
if seriesID == 0 {
return fmt.Errorf("expected series id: %s/%s", se.Name(), se.Tags().String())
}
seriesIDs = append(seriesIDs, seriesID)
}
sort.Sort(uint32Slice(seriesIDs))
// Encode value.
if err := enc.EncodeValue(ve.Value(), ve.Deleted(), seriesIDs); err != nil {
return err
}
}
}
// Save tagset offset to measurement.
pos := info.tagSets[string(name)]
pos.offset = *n
// Flush data to writer.
err = enc.Close()
*n += enc.N()
if err != nil {
return err
}
// Save tagset size to measurement.
pos.size = *n - pos.offset
info.tagSets[string(name)] = pos
return nil
}
func (p IndexFiles) writeMeasurementBlockTo(w io.Writer, info *indexCompactInfo, n *int64) error {
var seriesKey []byte
mw := NewMeasurementBlockWriter()
// Add measurement data & compute sketches.
mitr := p.MeasurementIterator()
for m := mitr.Next(); m != nil; m = mitr.Next() {
name := m.Name()
// Look-up series ids.
itr := p.MeasurementSeriesIterator(name)
var seriesIDs []uint32
for e := itr.Next(); e != nil; e = itr.Next() {
seriesID, _ := info.sblk.Offset(e.Name(), e.Tags(), seriesKey[:0])
if seriesID == 0 {
panic(fmt.Sprintf("expected series id: %s %s", e.Name(), e.Tags().String()))
}
seriesIDs = append(seriesIDs, seriesID)
}
sort.Sort(uint32Slice(seriesIDs))
// Add measurement to writer.
pos := info.tagSets[string(name)]
mw.Add(name, m.Deleted(), pos.offset, pos.size, seriesIDs)
}
// Flush data to writer.
nn, err := mw.WriteTo(w)
*n += nn
return err
}
// Stat returns the max index file size and the total file size for all index files.
func (p IndexFiles) Stat() (*IndexFilesInfo, error) {
var info IndexFilesInfo
for _, f := range p {
fi, err := os.Stat(f.Path())
if os.IsNotExist(err) {
continue
} else if err != nil {
return nil, err
}
if fi.Size() > info.MaxSize {
info.MaxSize = fi.Size()
}
if fi.ModTime().After(info.ModTime) {
info.ModTime = fi.ModTime()
}
info.Size += fi.Size()
}
return &info, nil
}
type IndexFilesInfo struct {
MaxSize int64 // largest file size
Size int64 // total file size
ModTime time.Time // last modified
}
// indexCompactInfo is a context object used for tracking position information
// during the compaction of index files.
type indexCompactInfo struct {
// Memory-mapped series block.
// Available after the series block has been written.
sblk *SeriesBlock
// Tracks offset/size for each measurement's tagset.
tagSets map[string]indexTagSetPos
}
// indexTagSetPos stores the offset/size of tagsets.
type indexTagSetPos struct {
offset int64
size int64
}

View File

@@ -0,0 +1,53 @@
package tsi1_test
import (
"bytes"
"testing"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/tsdb/index/tsi1"
)
// Ensure multiple index files can be compacted together.
func TestIndexFiles_WriteTo(t *testing.T) {
// Write first file.
f0, err := CreateIndexFile([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
{Name: []byte("mem"), Tags: models.NewTags(map[string]string{"region": "east"})},
})
if err != nil {
t.Fatal(err)
}
// Write second file.
f1, err := CreateIndexFile([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
{Name: []byte("disk"), Tags: models.NewTags(map[string]string{"region": "east"})},
})
if err != nil {
t.Fatal(err)
}
// Compact the two together and write out to a buffer.
var buf bytes.Buffer
a := tsi1.IndexFiles{f0, f1}
if n, err := a.CompactTo(&buf, M, K); err != nil {
t.Fatal(err)
} else if n == 0 {
t.Fatal("expected data written")
}
// Unmarshal buffer into a new index file.
var f tsi1.IndexFile
if err := f.UnmarshalBinary(buf.Bytes()); err != nil {
t.Fatal(err)
}
// Verify data in compacted file.
if e := f.TagValueElem([]byte("cpu"), []byte("region"), []byte("west")); e == nil {
t.Fatal("expected element")
} else if n := e.(*tsi1.TagBlockValueElem).SeriesN(); n != 1 {
t.Fatalf("unexpected series count: %d", n)
}
}

View File

@@ -0,0 +1,329 @@
package tsi1_test
import (
"fmt"
"os"
"reflect"
"regexp"
"testing"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/tsdb/index/tsi1"
)
// Bloom filter settings used in tests.
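// M is the filter size in bits and K is the number of hash functions, as
// passed to CompactTo when building series block bloom filters.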
const M, K = 4096, 6
// Ensure index can iterate over all measurement names.
func TestIndex_ForEachMeasurementName(t *testing.T) {
idx := MustOpenIndex()
defer idx.Close()
// Add series to index.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
{Name: []byte("mem"), Tags: models.NewTags(map[string]string{"region": "east"})},
}); err != nil {
t.Fatal(err)
}
// Verify measurements are returned.
idx.Run(t, func(t *testing.T) {
var names []string
if err := idx.ForEachMeasurementName(func(name []byte) error {
names = append(names, string(name))
return nil
}); err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(names, []string{"cpu", "mem"}) {
t.Fatalf("unexpected names: %#v", names)
}
})
// Add more series.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("disk")},
{Name: []byte("mem")},
}); err != nil {
t.Fatal(err)
}
// Verify new measurements.
idx.Run(t, func(t *testing.T) {
var names []string
if err := idx.ForEachMeasurementName(func(name []byte) error {
names = append(names, string(name))
return nil
}); err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(names, []string{"cpu", "disk", "mem"}) {
t.Fatalf("unexpected names: %#v", names)
}
})
}
// Ensure index can return whether a measurement exists.
func TestIndex_MeasurementExists(t *testing.T) {
idx := MustOpenIndex()
defer idx.Close()
// Add series to index.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
}); err != nil {
t.Fatal(err)
}
// Verify measurement exists.
idx.Run(t, func(t *testing.T) {
if v, err := idx.MeasurementExists([]byte("cpu")); err != nil {
t.Fatal(err)
} else if !v {
t.Fatal("expected measurement to exist")
}
})
// Delete one series.
if err := idx.DropSeries(models.MakeKey([]byte("cpu"), models.NewTags(map[string]string{"region": "east"}))); err != nil {
t.Fatal(err)
}
// Verify measurement still exists.
idx.Run(t, func(t *testing.T) {
if v, err := idx.MeasurementExists([]byte("cpu")); err != nil {
t.Fatal(err)
} else if !v {
t.Fatal("expected measurement to still exist")
}
})
// Delete second series.
if err := idx.DropSeries(models.MakeKey([]byte("cpu"), models.NewTags(map[string]string{"region": "west"}))); err != nil {
t.Fatal(err)
}
// Verify measurement is now deleted.
idx.Run(t, func(t *testing.T) {
if v, err := idx.MeasurementExists([]byte("cpu")); err != nil {
t.Fatal(err)
} else if v {
t.Fatal("expected measurement to be deleted")
}
})
}
// Ensure index can return a list of matching measurements.
func TestIndex_MeasurementNamesByExpr(t *testing.T) {
idx := MustOpenIndex()
defer idx.Close()
// Add series to index.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
{Name: []byte("disk"), Tags: models.NewTags(map[string]string{"region": "north"})},
{Name: []byte("mem"), Tags: models.NewTags(map[string]string{"region": "west", "country": "us"})},
}); err != nil {
t.Fatal(err)
}
// Retrieve measurements by expression
idx.Run(t, func(t *testing.T) {
t.Run("EQ", func(t *testing.T) {
names, err := idx.MeasurementNamesByExpr(influxql.MustParseExpr(`region = 'west'`))
if err != nil {
t.Fatal(err)
} else if !reflect.DeepEqual(names, [][]byte{[]byte("cpu"), []byte("mem")}) {
t.Fatalf("unexpected names: %v", names)
}
})
t.Run("NEQ", func(t *testing.T) {
names, err := idx.MeasurementNamesByExpr(influxql.MustParseExpr(`region != 'east'`))
if err != nil {
t.Fatal(err)
} else if !reflect.DeepEqual(names, [][]byte{[]byte("disk"), []byte("mem")}) {
t.Fatalf("unexpected names: %v", names)
}
})
t.Run("EQREGEX", func(t *testing.T) {
names, err := idx.MeasurementNamesByExpr(influxql.MustParseExpr(`region =~ /east|west/`))
if err != nil {
t.Fatal(err)
} else if !reflect.DeepEqual(names, [][]byte{[]byte("cpu"), []byte("mem")}) {
t.Fatalf("unexpected names: %v", names)
}
})
t.Run("NEQREGEX", func(t *testing.T) {
names, err := idx.MeasurementNamesByExpr(influxql.MustParseExpr(`country !~ /^u/`))
if err != nil {
t.Fatal(err)
} else if !reflect.DeepEqual(names, [][]byte{[]byte("cpu"), []byte("disk")}) {
t.Fatalf("unexpected names: %v", names)
}
})
})
}
// Ensure index can return a list of matching measurements.
func TestIndex_MeasurementNamesByRegex(t *testing.T) {
idx := MustOpenIndex()
defer idx.Close()
// Add series to index.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("cpu")},
{Name: []byte("disk")},
{Name: []byte("mem")},
}); err != nil {
t.Fatal(err)
}
// Retrieve measurements by regex.
idx.Run(t, func(t *testing.T) {
names, err := idx.MeasurementNamesByRegex(regexp.MustCompile(`cpu|mem`))
if err != nil {
t.Fatal(err)
} else if !reflect.DeepEqual(names, [][]byte{[]byte("cpu"), []byte("mem")}) {
t.Fatalf("unexpected names: %v", names)
}
})
}
// Ensure index can delete a measurement and all related keys, values, & series.
func TestIndex_DropMeasurement(t *testing.T) {
idx := MustOpenIndex()
defer idx.Close()
// Add series to index.
if err := idx.CreateSeriesSliceIfNotExists([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
{Name: []byte("disk"), Tags: models.NewTags(map[string]string{"region": "north"})},
{Name: []byte("mem"), Tags: models.NewTags(map[string]string{"region": "west", "country": "us"})},
}); err != nil {
t.Fatal(err)
}
// Drop measurement.
if err := idx.DropMeasurement([]byte("cpu")); err != nil {
t.Fatal(err)
}
// Verify data is gone in each stage.
idx.Run(t, func(t *testing.T) {
// Verify measurement is gone.
if v, err := idx.MeasurementExists([]byte("cpu")); err != nil {
t.Fatal(err)
} else if v {
t.Fatal("expected no measurement")
}
// Obtain file set to perform lower level checks.
fs := idx.RetainFileSet()
defer fs.Release()
// Verify tags & values are gone.
if e := fs.TagKeyIterator([]byte("cpu")).Next(); e != nil && !e.Deleted() {
t.Fatal("expected deleted tag key")
}
if itr := fs.TagValueIterator([]byte("cpu"), []byte("region")); itr != nil {
t.Fatal("expected nil tag value iterator")
}
})
}
// Index is a test wrapper for tsi1.Index.
type Index struct {
*tsi1.Index
}
// NewIndex returns a new instance of Index at a temporary path.
func NewIndex() *Index {
idx := &Index{Index: tsi1.NewIndex()}
idx.Path = MustTempDir()
return idx
}
// MustOpenIndex returns a new, open index. Panic on error.
func MustOpenIndex() *Index {
idx := NewIndex()
if err := idx.Open(); err != nil {
panic(err)
}
return idx
}
// Close closes and removes the index directory.
func (idx *Index) Close() error {
defer os.RemoveAll(idx.Path)
return idx.Index.Close()
}
// Reopen closes and opens the index.
func (idx *Index) Reopen() error {
if err := idx.Index.Close(); err != nil {
return err
}
path := idx.Path
idx.Index = tsi1.NewIndex()
idx.Path = path
if err := idx.Open(); err != nil {
return err
}
return nil
}
// Run executes a subtest for each of several different states:
//
// - Immediately
// - After reopen
// - After compaction
// - After reopen again
//
// The index should always respond in the same fashion regardless of
// how data is stored. This helper allows the index to be easily tested
// in all major states.
func (idx *Index) Run(t *testing.T, fn func(t *testing.T)) {
// Invoke immediately.
t.Run("state=initial", fn)
// Reopen and invoke again.
if err := idx.Reopen(); err != nil {
t.Fatalf("reopen error: %s", err)
}
t.Run("state=reopen", fn)
// TODO: Request a compaction.
// if err := idx.Compact(); err != nil {
// t.Fatalf("compact error: %s", err)
// }
// t.Run("state=post-compaction", fn)
// Reopen and invoke again.
if err := idx.Reopen(); err != nil {
t.Fatalf("post-compaction reopen error: %s", err)
}
t.Run("state=post-compaction-reopen", fn)
}
// CreateSeriesSliceIfNotExists creates multiple series at a time.
func (idx *Index) CreateSeriesSliceIfNotExists(a []Series) error {
for i, s := range a {
if err := idx.CreateSeriesIfNotExists(nil, s.Name, s.Tags); err != nil {
return fmt.Errorf("i=%d, name=%s, tags=%v, err=%s", i, s.Name, s.Tags, err)
}
}
return nil
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,339 @@
package tsi1_test
import (
"bytes"
"fmt"
"io/ioutil"
"math/rand"
"os"
"path/filepath"
"regexp"
"runtime/pprof"
"sort"
"testing"
"time"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/pkg/bloom"
"github.com/influxdata/influxdb/tsdb/index/tsi1"
)
// Ensure log file can append series.
func TestLogFile_AddSeries(t *testing.T) {
f := MustOpenLogFile()
defer f.Close()
// Add test data.
if err := f.AddSeries([]byte("mem"), models.Tags{{Key: []byte("host"), Value: []byte("serverA")}}); err != nil {
t.Fatal(err)
} else if err := f.AddSeries([]byte("cpu"), models.Tags{{Key: []byte("region"), Value: []byte("us-east")}}); err != nil {
t.Fatal(err)
} else if err := f.AddSeries([]byte("cpu"), models.Tags{{Key: []byte("region"), Value: []byte("us-west")}}); err != nil {
t.Fatal(err)
}
// Verify data.
itr := f.MeasurementIterator()
if e := itr.Next(); e == nil || string(e.Name()) != "cpu" {
t.Fatalf("unexpected measurement: %#v", e)
} else if e := itr.Next(); e == nil || string(e.Name()) != "mem" {
t.Fatalf("unexpected measurement: %#v", e)
} else if e := itr.Next(); e != nil {
t.Fatalf("expected eof, got: %#v", e)
}
// Reopen file and re-verify.
if err := f.Reopen(); err != nil {
t.Fatal(err)
}
// Verify data.
itr = f.MeasurementIterator()
if e := itr.Next(); e == nil || string(e.Name()) != "cpu" {
t.Fatalf("unexpected measurement: %#v", e)
} else if e := itr.Next(); e == nil || string(e.Name()) != "mem" {
t.Fatalf("unexpected measurement: %#v", e)
} else if e := itr.Next(); e != nil {
t.Fatalf("expected eof, got: %#v", e)
}
}
func TestLogFile_SeriesStoredInOrder(t *testing.T) {
f := MustOpenLogFile()
defer f.Close()
// Generate and add test data
tvm := make(map[string]struct{})
rand.Seed(time.Now().Unix())
for i := 0; i < 100; i++ {
tv := fmt.Sprintf("server-%d", rand.Intn(50)) // Encourage adding duplicate series.
tvm[tv] = struct{}{}
if err := f.AddSeries([]byte("mem"), models.Tags{models.NewTag([]byte("host"), []byte(tv))}); err != nil {
t.Fatal(err)
}
if err := f.AddSeries([]byte("cpu"), models.Tags{models.NewTag([]byte("host"), []byte(tv))}); err != nil {
t.Fatal(err)
}
}
// Sort the tag values so we know what order to expect.
tvs := make([]string, 0, len(tvm))
for tv := range tvm {
tvs = append(tvs, tv)
}
sort.Strings(tvs)
// Double the series values since we're adding them twice (two measurements)
tvs = append(tvs, tvs...)
// When we pull the series out via an iterator they should be in order.
itr := f.SeriesIterator()
if itr == nil {
t.Fatal("nil iterator")
}
mname := []string{"cpu", "mem"}
var j int
for i := 0; i < len(tvs); i++ {
serie := itr.Next()
if serie == nil {
t.Fatal("got nil series")
}
if got, exp := string(serie.Name()), mname[j]; got != exp {
t.Fatalf("[series %d] got %s, expected %s", i, got, exp)
}
if got, exp := string(serie.Tags()[0].Value), tvs[i]; got != exp {
t.Fatalf("[series %d] got %s, expected %s", i, got, exp)
}
if i == (len(tvs)/2)-1 {
// Next measurement
j++
}
}
}
// Ensure log file can delete an existing measurement.
func TestLogFile_DeleteMeasurement(t *testing.T) {
f := MustOpenLogFile()
defer f.Close()
// Add test data.
if err := f.AddSeries([]byte("mem"), models.Tags{{Key: []byte("host"), Value: []byte("serverA")}}); err != nil {
t.Fatal(err)
} else if err := f.AddSeries([]byte("cpu"), models.Tags{{Key: []byte("region"), Value: []byte("us-east")}}); err != nil {
t.Fatal(err)
} else if err := f.AddSeries([]byte("cpu"), models.Tags{{Key: []byte("region"), Value: []byte("us-west")}}); err != nil {
t.Fatal(err)
}
// Remove measurement.
if err := f.DeleteMeasurement([]byte("cpu")); err != nil {
t.Fatal(err)
}
// Verify data.
itr := f.MeasurementIterator()
if e := itr.Next(); string(e.Name()) != "cpu" || !e.Deleted() {
t.Fatalf("unexpected measurement: %s/%v", e.Name(), e.Deleted())
} else if e := itr.Next(); string(e.Name()) != "mem" || e.Deleted() {
t.Fatalf("unexpected measurement: %s/%v", e.Name(), e.Deleted())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected eof, got: %#v", e)
}
}
// LogFile is a test wrapper for tsi1.LogFile.
type LogFile struct {
*tsi1.LogFile
}
// NewLogFile returns a new instance of LogFile with a temporary file path.
func NewLogFile() *LogFile {
file, err := ioutil.TempFile("", "tsi1-log-file-")
if err != nil {
panic(err)
}
file.Close()
return &LogFile{LogFile: tsi1.NewLogFile(file.Name())}
}
// MustOpenLogFile returns a new, open instance of LogFile. Panic on error.
func MustOpenLogFile() *LogFile {
f := NewLogFile()
if err := f.Open(); err != nil {
panic(err)
}
return f
}
// Close closes the log file and removes it from disk.
func (f *LogFile) Close() error {
defer os.Remove(f.Path())
return f.LogFile.Close()
}
// Reopen closes and reopens the file.
func (f *LogFile) Reopen() error {
if err := f.LogFile.Close(); err != nil {
return err
}
if err := f.LogFile.Open(); err != nil {
return err
}
return nil
}
// CreateLogFile creates a new temporary log file and adds a list of series.
func CreateLogFile(series []Series) (*LogFile, error) {
f := MustOpenLogFile()
for _, serie := range series {
if err := f.AddSeries(serie.Name, serie.Tags); err != nil {
return nil, err
}
}
return f, nil
}
// GenerateLogFile generates a log file from a set of series based on the count arguments.
// Total series returned will equal measurementN * tagN * valueN.
func GenerateLogFile(measurementN, tagN, valueN int) (*LogFile, error) {
tagValueN := pow(valueN, tagN)
f := MustOpenLogFile()
for i := 0; i < measurementN; i++ {
name := []byte(fmt.Sprintf("measurement%d", i))
// Generate tag sets.
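// Each j in [0, valueN^tagN) is treated as a base-valueN number whose k-th
// digit selects the value for key k, so every key/value combination is
// generated exactly once.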
for j := 0; j < tagValueN; j++ {
var tags models.Tags
for k := 0; k < tagN; k++ {
key := []byte(fmt.Sprintf("key%d", k))
value := []byte(fmt.Sprintf("value%d", (j / pow(valueN, k) % valueN)))
tags = append(tags, models.NewTag(key, value))
}
if err := f.AddSeries(name, tags); err != nil {
return nil, err
}
}
}
return f, nil
}
func MustGenerateLogFile(measurementN, tagN, valueN int) *LogFile {
f, err := GenerateLogFile(measurementN, tagN, valueN)
if err != nil {
panic(err)
}
return f
}
func benchmarkLogFile_AddSeries(b *testing.B, measurementN, seriesKeyN, seriesValueN int) {
b.StopTimer()
f := MustOpenLogFile()
type Datum struct {
Name []byte
Tags models.Tags
}
// Pre-generate everything.
var (
data []Datum
series int
)
tagValueN := pow(seriesValueN, seriesKeyN)
for i := 0; i < measurementN; i++ {
name := []byte(fmt.Sprintf("measurement%d", i))
for j := 0; j < tagValueN; j++ {
var tags models.Tags
for k := 0; k < seriesKeyN; k++ {
key := []byte(fmt.Sprintf("key%d", k))
value := []byte(fmt.Sprintf("value%d", (j / pow(seriesValueN, k) % seriesValueN)))
tags = append(tags, models.NewTag(key, value))
}
data = append(data, Datum{Name: name, Tags: tags})
series += len(tags)
}
}
b.StartTimer()
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, d := range data {
if err := f.AddSeries(d.Name, d.Tags); err != nil {
b.Fatal(err)
}
}
}
}
func BenchmarkLogFile_AddSeries_100_1_1(b *testing.B) { benchmarkLogFile_AddSeries(b, 100, 1, 1) } // 100 series
func BenchmarkLogFile_AddSeries_1000_1_1(b *testing.B) { benchmarkLogFile_AddSeries(b, 1000, 1, 1) } // 1000 series
func BenchmarkLogFile_AddSeries_10000_1_1(b *testing.B) { benchmarkLogFile_AddSeries(b, 10000, 1, 1) } // 10000 series
func BenchmarkLogFile_AddSeries_100_2_10(b *testing.B) { benchmarkLogFile_AddSeries(b, 100, 2, 10) } // ~20K series
func BenchmarkLogFile_AddSeries_100000_1_1(b *testing.B) { benchmarkLogFile_AddSeries(b, 100000, 1, 1) } // ~100K series
func BenchmarkLogFile_AddSeries_100_3_7(b *testing.B) { benchmarkLogFile_AddSeries(b, 100, 3, 7) } // ~100K series
func BenchmarkLogFile_AddSeries_200_3_7(b *testing.B) { benchmarkLogFile_AddSeries(b, 200, 3, 7) } // ~200K series
func BenchmarkLogFile_AddSeries_200_4_7(b *testing.B) { benchmarkLogFile_AddSeries(b, 200, 4, 7) } // ~1.9M series
func BenchmarkLogFile_WriteTo(b *testing.B) {
for _, seriesN := range []int{1000, 10000, 100000, 1000000} {
name := fmt.Sprintf("series=%d", seriesN)
b.Run(name, func(b *testing.B) {
f := MustOpenLogFile()
defer f.Close()
// Estimate bloom filter size.
m, k := bloom.Estimate(uint64(seriesN), 0.02)
// Initialize log file with series data.
for i := 0; i < seriesN; i++ {
if err := f.AddSeries(
[]byte("cpu"),
models.Tags{
{Key: []byte("host"), Value: []byte(fmt.Sprintf("server-%d", i))},
{Key: []byte("location"), Value: []byte("us-west")},
},
); err != nil {
b.Fatal(err)
}
}
b.ResetTimer()
// Create cpu profile for each subtest.
MustStartCPUProfile(name)
defer pprof.StopCPUProfile()
// Compact log file.
for i := 0; i < b.N; i++ {
buf := bytes.NewBuffer(make([]byte, 0, 150*seriesN))
if _, err := f.CompactTo(buf, m, k); err != nil {
b.Fatal(err)
}
b.Logf("sz=%db", buf.Len())
}
})
}
}
// MustStartCPUProfile starts a cpu profile in a temporary path based on name.
func MustStartCPUProfile(name string) {
name = regexp.MustCompile(`\W+`).ReplaceAllString(name, "-")
// Open file and start pprof.
f, err := os.Create(filepath.Join("/tmp", fmt.Sprintf("cpu-%s.pprof", name)))
if err != nil {
panic(err)
}
if err := pprof.StartCPUProfile(f); err != nil {
panic(err)
}
}

View File

@@ -0,0 +1,600 @@
package tsi1
import (
"bytes"
"encoding/binary"
"errors"
"io"
"sort"
"github.com/influxdata/influxdb/pkg/estimator"
"github.com/influxdata/influxdb/pkg/estimator/hll"
"github.com/influxdata/influxdb/pkg/rhh"
)
// MeasurementBlockVersion is the version of the measurement block.
const MeasurementBlockVersion = 1
// Measurement flag constants.
const (
MeasurementTombstoneFlag = 0x01
)
// Measurement field size constants.
const (
// 1 byte offset for the block to ensure non-zero offsets.
MeasurementFillSize = 1
// Measurement trailer fields
MeasurementTrailerSize = 0 +
2 + // version
8 + 8 + // data offset/size
8 + 8 + // hash index offset/size
8 + 8 + // measurement sketch offset/size
8 + 8 // tombstone measurement sketch offset/size
// Measurement key block fields.
MeasurementNSize = 8
MeasurementOffsetSize = 8
)
// Measurement errors.
var (
ErrUnsupportedMeasurementBlockVersion = errors.New("unsupported measurement block version")
ErrMeasurementBlockSizeMismatch = errors.New("measurement block size mismatch")
)
// MeasurementBlock represents a collection of all measurements in an index.
type MeasurementBlock struct {
data []byte
hashData []byte
// Series block sketch and tombstone sketch for cardinality estimation.
// While we have exact counts for the block, these sketches allow us to
// estimate cardinality across multiple blocks (which might contain
// duplicate series).
sketch, tSketch estimator.Sketch
version int // block version
}
// Version returns the encoding version parsed from the data.
// Only valid after UnmarshalBinary() has been successfully invoked.
func (blk *MeasurementBlock) Version() int { return blk.version }
// Elem returns an element for a measurement.
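// The hash index uses Robin Hood hashing: probing stops as soon as the
// current probe distance exceeds the stored element's distance from its own
// ideal slot, because a matching key could never have been displaced further.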
func (blk *MeasurementBlock) Elem(name []byte) (e MeasurementBlockElem, ok bool) {
n := int64(binary.BigEndian.Uint64(blk.hashData[:MeasurementNSize]))
hash := rhh.HashKey(name)
pos := hash % n
// Track current distance
var d int64
for {
// Find offset of measurement.
offset := binary.BigEndian.Uint64(blk.hashData[MeasurementNSize+(pos*MeasurementOffsetSize):])
if offset == 0 {
return MeasurementBlockElem{}, false
}
// Evaluate name if offset is not empty.
if offset > 0 {
// Parse into element.
var e MeasurementBlockElem
e.UnmarshalBinary(blk.data[offset:])
// Return if name match.
if bytes.Equal(e.name, name) {
return e, true
}
// Check if we've exceeded the probe distance.
if d > rhh.Dist(rhh.HashKey(e.name), pos, n) {
return MeasurementBlockElem{}, false
}
}
// Move position forward.
pos = (pos + 1) % n
d++
if d > n {
return MeasurementBlockElem{}, false
}
}
}
// UnmarshalBinary unpacks data into the block. Block is not copied so data
// should be retained and unchanged after being passed into this function.
func (blk *MeasurementBlock) UnmarshalBinary(data []byte) error {
// Read trailer.
t, err := ReadMeasurementBlockTrailer(data)
if err != nil {
return err
}
// Save data section.
blk.data = data[t.Data.Offset:]
blk.data = blk.data[:t.Data.Size]
// Save hash index block.
blk.hashData = data[t.HashIndex.Offset:]
blk.hashData = blk.hashData[:t.HashIndex.Size]
// Initialise sketches. We're currently using HLL+.
var s, ts = hll.NewDefaultPlus(), hll.NewDefaultPlus()
if err := s.UnmarshalBinary(data[t.Sketch.Offset:][:t.Sketch.Size]); err != nil {
return err
}
blk.sketch = s
if err := ts.UnmarshalBinary(data[t.TSketch.Offset:][:t.TSketch.Size]); err != nil {
return err
}
blk.tSketch = ts
return nil
}
// Iterator returns an iterator over all measurements.
func (blk *MeasurementBlock) Iterator() MeasurementIterator {
return &blockMeasurementIterator{data: blk.data[MeasurementFillSize:]}
}
// seriesIDIterator returns an iterator for all series ids in a measurement.
func (blk *MeasurementBlock) seriesIDIterator(name []byte) seriesIDIterator {
// Find measurement element.
e, ok := blk.Elem(name)
if !ok {
return &rawSeriesIDIterator{}
}
return &rawSeriesIDIterator{n: e.series.n, data: e.series.data}
}
// blockMeasurementIterator iterates over a list of measurements in a block.
type blockMeasurementIterator struct {
elem MeasurementBlockElem
data []byte
}
// Next returns the next measurement. Returns nil when iterator is complete.
func (itr *blockMeasurementIterator) Next() MeasurementElem {
// Return nil when we run out of data.
if len(itr.data) == 0 {
return nil
}
// Unmarshal the element at the current position.
itr.elem.UnmarshalBinary(itr.data)
// Move the data forward past the record.
itr.data = itr.data[itr.elem.size:]
return &itr.elem
}
// rawSeriesIDIterator iterates over a list of raw series data.
type rawSeriesIDIterator struct {
prev uint32
n uint32
data []byte
}
// next returns the next decoded series.
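// IDs are stored as uvarint deltas from the previous ID, so the sorted
// list [5, 8, 9] is encoded as the deltas [5, 3, 1].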
func (itr *rawSeriesIDIterator) next() uint32 {
if len(itr.data) == 0 {
return 0
}
delta, n := binary.Uvarint(itr.data)
itr.data = itr.data[n:]
seriesID := itr.prev + uint32(delta)
itr.prev = seriesID
return seriesID
}
// MeasurementBlockTrailer represents meta data at the end of a MeasurementBlock.
type MeasurementBlockTrailer struct {
Version int // Encoding version
// Offset & size of data section.
Data struct {
Offset int64
Size int64
}
// Offset & size of hash map section.
HashIndex struct {
Offset int64
Size int64
}
// Offset and size of cardinality sketch for measurements.
Sketch struct {
Offset int64
Size int64
}
// Offset and size of cardinality sketch for tombstoned measurements.
TSketch struct {
Offset int64
Size int64
}
}
// ReadMeasurementBlockTrailer returns the block trailer from data.
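// The trailer occupies the final MeasurementTrailerSize bytes: four
// big-endian offset/size pairs (data, hash index, sketch, tombstone sketch)
// followed by a uint16 version.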
func ReadMeasurementBlockTrailer(data []byte) (MeasurementBlockTrailer, error) {
var t MeasurementBlockTrailer
// Read version (which is located in the last two bytes of the trailer).
t.Version = int(binary.BigEndian.Uint16(data[len(data)-2:]))
if t.Version != MeasurementBlockVersion {
return t, ErrUnsupportedMeasurementBlockVersion
}
// Slice trailer data.
buf := data[len(data)-MeasurementTrailerSize:]
// Read data section info.
t.Data.Offset, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
t.Data.Size, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
// Read measurement block info.
t.HashIndex.Offset, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
t.HashIndex.Size, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
// Read measurement sketch info.
t.Sketch.Offset, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
t.Sketch.Size, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
// Read tombstone measurement sketch info.
t.TSketch.Offset, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
t.TSketch.Size, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
return t, nil
}
// WriteTo writes the trailer to w.
func (t *MeasurementBlockTrailer) WriteTo(w io.Writer) (n int64, err error) {
// Write data section info.
if err := writeUint64To(w, uint64(t.Data.Offset), &n); err != nil {
return n, err
} else if err := writeUint64To(w, uint64(t.Data.Size), &n); err != nil {
return n, err
}
// Write hash index section info.
if err := writeUint64To(w, uint64(t.HashIndex.Offset), &n); err != nil {
return n, err
} else if err := writeUint64To(w, uint64(t.HashIndex.Size), &n); err != nil {
return n, err
}
// Write measurement sketch info.
if err := writeUint64To(w, uint64(t.Sketch.Offset), &n); err != nil {
return n, err
} else if err := writeUint64To(w, uint64(t.Sketch.Size), &n); err != nil {
return n, err
}
// Write tombstone measurement sketch info.
if err := writeUint64To(w, uint64(t.TSketch.Offset), &n); err != nil {
return n, err
} else if err := writeUint64To(w, uint64(t.TSketch.Size), &n); err != nil {
return n, err
}
// Write measurement block version.
if err := writeUint16To(w, MeasurementBlockVersion, &n); err != nil {
return n, err
}
return n, nil
}
// MeasurementBlockElem represents an internal measurement element.
type MeasurementBlockElem struct {
flag byte // flag
name []byte // measurement name
tagBlock struct {
offset int64
size int64
}
series struct {
n uint32 // series count
data []byte // serialized series data
}
// size in bytes, set after unmarshaling.
size int
}
// Name returns the measurement name.
func (e *MeasurementBlockElem) Name() []byte { return e.name }
// Deleted returns true if the tombstone flag is set.
func (e *MeasurementBlockElem) Deleted() bool {
return (e.flag & MeasurementTombstoneFlag) != 0
}
// TagBlockOffset returns the offset of the measurement's tag block.
func (e *MeasurementBlockElem) TagBlockOffset() int64 { return e.tagBlock.offset }
// TagBlockSize returns the size of the measurement's tag block.
func (e *MeasurementBlockElem) TagBlockSize() int64 { return e.tagBlock.size }
// SeriesData returns the raw series data.
func (e *MeasurementBlockElem) SeriesData() []byte { return e.series.data }
// SeriesN returns the number of series associated with the measurement.
func (e *MeasurementBlockElem) SeriesN() uint32 { return e.series.n }
// SeriesID returns series ID at an index.
func (e *MeasurementBlockElem) SeriesID(i int) uint32 {
return binary.BigEndian.Uint32(e.series.data[i*SeriesIDSize:])
}
// SeriesIDs returns a list of decoded series ids.
//
// NOTE: This should be used for testing and diagnostic purposes only.
// It requires loading the entire list of series into memory.
func (e *MeasurementBlockElem) SeriesIDs() []uint32 {
a := make([]uint32, 0, e.series.n)
var prev uint32
for data := e.series.data; len(data) > 0; {
delta, n := binary.Uvarint(data)
data = data[n:]
seriesID := prev + uint32(delta)
a = append(a, seriesID)
prev = seriesID
}
return a
}
// Size returns the size of the element.
func (e *MeasurementBlockElem) Size() int { return e.size }
// UnmarshalBinary unmarshals data into e.
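// An element is laid out as:
//
//	flag (1) | tag block offset (8) | tag block size (8) |
//	uvarint name length | name | uvarint series count |
//	uvarint series data length | series data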
func (e *MeasurementBlockElem) UnmarshalBinary(data []byte) error {
start := len(data)
// Parse flag data.
e.flag, data = data[0], data[1:]
// Parse tag block offset.
e.tagBlock.offset, data = int64(binary.BigEndian.Uint64(data)), data[8:]
e.tagBlock.size, data = int64(binary.BigEndian.Uint64(data)), data[8:]
// Parse name.
sz, n := binary.Uvarint(data)
e.name, data = data[n:n+int(sz)], data[n+int(sz):]
// Parse series data.
v, n := binary.Uvarint(data)
e.series.n, data = uint32(v), data[n:]
sz, n = binary.Uvarint(data)
data = data[n:]
e.series.data, data = data[:sz], data[sz:]
// Save length of elem.
e.size = start - len(data)
return nil
}
// MeasurementBlockWriter writes a measurement block.
type MeasurementBlockWriter struct {
buf bytes.Buffer
mms map[string]measurement
// Measurement sketch and tombstoned measurement sketch.
sketch, tSketch estimator.Sketch
}
// NewMeasurementBlockWriter returns a new MeasurementBlockWriter.
func NewMeasurementBlockWriter() *MeasurementBlockWriter {
return &MeasurementBlockWriter{
mms: make(map[string]measurement),
sketch: hll.NewDefaultPlus(),
tSketch: hll.NewDefaultPlus(),
}
}
// Add adds a measurement with series and tag set offset/size.
func (mw *MeasurementBlockWriter) Add(name []byte, deleted bool, offset, size int64, seriesIDs []uint32) {
mm := mw.mms[string(name)]
mm.deleted = deleted
mm.tagBlock.offset = offset
mm.tagBlock.size = size
mm.seriesIDs = seriesIDs
mw.mms[string(name)] = mm
if deleted {
mw.tSketch.Add(name)
} else {
mw.sketch.Add(name)
}
}
// WriteTo encodes the measurements to w.
func (mw *MeasurementBlockWriter) WriteTo(w io.Writer) (n int64, err error) {
var t MeasurementBlockTrailer
// The sketches must be set before calling WriteTo.
if mw.sketch == nil {
return 0, errors.New("measurement sketch not set")
} else if mw.tSketch == nil {
return 0, errors.New("measurement tombstone sketch not set")
}
// Sort names.
names := make([]string, 0, len(mw.mms))
for name := range mw.mms {
names = append(names, name)
}
sort.Strings(names)
// Begin data section.
t.Data.Offset = n
// Write padding byte so no offsets are zero.
if err := writeUint8To(w, 0, &n); err != nil {
return n, err
}
// Encode key list.
for _, name := range names {
// Retrieve measurement and save offset.
mm := mw.mms[name]
mm.offset = n
mw.mms[name] = mm
// Write measurement
if err := mw.writeMeasurementTo(w, []byte(name), &mm, &n); err != nil {
return n, err
}
}
t.Data.Size = n - t.Data.Offset
// Build key hash map
m := rhh.NewHashMap(rhh.Options{
Capacity: int64(len(names)),
LoadFactor: LoadFactor,
})
for name := range mw.mms {
mm := mw.mms[name]
m.Put([]byte(name), &mm)
}
t.HashIndex.Offset = n
// Encode hash map length.
if err := writeUint64To(w, uint64(m.Cap()), &n); err != nil {
return n, err
}
// Encode hash map offset entries.
for i := int64(0); i < m.Cap(); i++ {
_, v := m.Elem(i)
var offset int64
if mm, ok := v.(*measurement); ok {
offset = mm.offset
}
if err := writeUint64To(w, uint64(offset), &n); err != nil {
return n, err
}
}
t.HashIndex.Size = n - t.HashIndex.Offset
// Write the sketches out.
t.Sketch.Offset = n
if err := writeSketchTo(w, mw.sketch, &n); err != nil {
return n, err
}
t.Sketch.Size = n - t.Sketch.Offset
t.TSketch.Offset = n
if err := writeSketchTo(w, mw.tSketch, &n); err != nil {
return n, err
}
t.TSketch.Size = n - t.TSketch.Offset
// Write trailer.
nn, err := t.WriteTo(w)
n += nn
if err != nil {
return n, err
}
return n, nil
}
// writeMeasurementTo encodes a single measurement entry into w.
func (mw *MeasurementBlockWriter) writeMeasurementTo(w io.Writer, name []byte, mm *measurement, n *int64) error {
// Write flag & tag block offset.
if err := writeUint8To(w, mm.flag(), n); err != nil {
return err
}
if err := writeUint64To(w, uint64(mm.tagBlock.offset), n); err != nil {
return err
} else if err := writeUint64To(w, uint64(mm.tagBlock.size), n); err != nil {
return err
}
// Write measurement name.
if err := writeUvarintTo(w, uint64(len(name)), n); err != nil {
return err
}
if err := writeTo(w, name, n); err != nil {
return err
}
// Write series data to buffer.
mw.buf.Reset()
var prev uint32
for _, seriesID := range mm.seriesIDs {
delta := seriesID - prev
var buf [binary.MaxVarintLen32]byte
i := binary.PutUvarint(buf[:], uint64(delta))
if _, err := mw.buf.Write(buf[:i]); err != nil {
return err
}
prev = seriesID
}
// Write series count.
if err := writeUvarintTo(w, uint64(len(mm.seriesIDs)), n); err != nil {
return err
}
// Write data size & buffer.
if err := writeUvarintTo(w, uint64(mw.buf.Len()), n); err != nil {
return err
}
nn, err := mw.buf.WriteTo(w)
if *n += nn; err != nil {
return err
}
return nil
}
// writeSketchTo writes an estimator.Sketch into w, updating the number of bytes
// written via n.
func writeSketchTo(w io.Writer, s estimator.Sketch, n *int64) error {
// TODO(edd): implement io.WriterTo on sketches.
data, err := s.MarshalBinary()
if err != nil {
return err
}
nn, err := w.Write(data)
*n += int64(nn)
return err
}
type measurement struct {
deleted bool
tagBlock struct {
offset int64
size int64
}
seriesIDs []uint32
offset int64
}
func (mm measurement) flag() byte {
var flag byte
if mm.deleted {
flag |= MeasurementTombstoneFlag
}
return flag
}

View File

@@ -0,0 +1,181 @@
package tsi1_test
import (
"bytes"
"encoding/binary"
"fmt"
"reflect"
"testing"
"github.com/influxdata/influxdb/tsdb/index/tsi1"
)
func TestReadMeasurementBlockTrailer(t *testing.T) {
// Build a trailer
var (
data = make([]byte, tsi1.MeasurementTrailerSize)
blockversion = uint16(1)
blockOffset, blockSize = uint64(1), uint64(2500)
hashIdxOffset, hashIdxSize = uint64(2501), uint64(1000)
sketchOffset, sketchSize = uint64(3501), uint64(250)
tsketchOffset, tsketchSize = uint64(3751), uint64(250)
)
binary.BigEndian.PutUint64(data[0:], blockOffset)
binary.BigEndian.PutUint64(data[8:], blockSize)
binary.BigEndian.PutUint64(data[16:], hashIdxOffset)
binary.BigEndian.PutUint64(data[24:], hashIdxSize)
binary.BigEndian.PutUint64(data[32:], sketchOffset)
binary.BigEndian.PutUint64(data[40:], sketchSize)
binary.BigEndian.PutUint64(data[48:], tsketchOffset)
binary.BigEndian.PutUint64(data[56:], tsketchSize)
binary.BigEndian.PutUint16(data[64:], blockversion)
trailer, err := tsi1.ReadMeasurementBlockTrailer(data)
if err != nil {
t.Logf("trailer is: %#v\n", trailer)
t.Fatal(err)
}
ok := true &&
trailer.Version == int(blockversion) &&
trailer.Data.Offset == int64(blockOffset) &&
trailer.Data.Size == int64(blockSize) &&
trailer.HashIndex.Offset == int64(hashIdxOffset) &&
trailer.HashIndex.Size == int64(hashIdxSize) &&
trailer.Sketch.Offset == int64(sketchOffset) &&
trailer.Sketch.Size == int64(sketchSize) &&
trailer.TSketch.Offset == int64(tsketchOffset) &&
trailer.TSketch.Size == int64(tsketchSize)
if !ok {
t.Fatalf("got %v\nwhich does not match expected", trailer)
}
}
func TestMeasurementBlockTrailer_WriteTo(t *testing.T) {
var trailer = tsi1.MeasurementBlockTrailer{
Version: 1,
Data: struct {
Offset int64
Size int64
}{Offset: 1, Size: 2},
HashIndex: struct {
Offset int64
Size int64
}{Offset: 3, Size: 4},
Sketch: struct {
Offset int64
Size int64
}{Offset: 5, Size: 6},
TSketch: struct {
Offset int64
Size int64
}{Offset: 7, Size: 8},
}
var buf bytes.Buffer
n, err := trailer.WriteTo(&buf)
if got, exp := n, int64(tsi1.MeasurementTrailerSize); got != exp {
t.Fatalf("got %v, exp %v", got, exp)
}
if got := err; got != nil {
t.Fatalf("got %v, exp %v", got, nil)
}
// Verify trailer written correctly.
exp := "" +
"0000000000000001" + // data offset
"0000000000000002" + // data size
"0000000000000003" + // hash index offset
"0000000000000004" + // hash index size
"0000000000000005" + // sketch offset
"0000000000000006" + // sketch size
"0000000000000007" + // tsketch offset
"0000000000000008" + // tsketch size
"0001" // version
if got, exp := fmt.Sprintf("%x", buf.String()), exp; got != exp {
t.Fatalf("got %v, exp %v", got, exp)
}
}
// Ensure measurement blocks can be written and opened.
func TestMeasurementBlockWriter(t *testing.T) {
ms := Measurements{
NewMeasurement([]byte("foo"), false, 100, 10, []uint32{1, 3, 4}),
NewMeasurement([]byte("bar"), false, 200, 20, []uint32{2}),
NewMeasurement([]byte("baz"), false, 300, 30, []uint32{5, 6}),
}
// Write the measurements to writer.
mw := tsi1.NewMeasurementBlockWriter()
for _, m := range ms {
mw.Add(m.Name, m.Deleted, m.Offset, m.Size, m.ids)
}
// Encode into buffer.
var buf bytes.Buffer
if n, err := mw.WriteTo(&buf); err != nil {
t.Fatal(err)
} else if n == 0 {
t.Fatal("expected bytes written")
}
// Unmarshal into a block.
var blk tsi1.MeasurementBlock
if err := blk.UnmarshalBinary(buf.Bytes()); err != nil {
t.Fatal(err)
}
// Verify data in block.
if e, ok := blk.Elem([]byte("foo")); !ok {
t.Fatal("expected element")
} else if e.TagBlockOffset() != 100 || e.TagBlockSize() != 10 {
t.Fatalf("unexpected offset/size: %v/%v", e.TagBlockOffset(), e.TagBlockSize())
} else if !reflect.DeepEqual(e.SeriesIDs(), []uint32{1, 3, 4}) {
t.Fatalf("unexpected series data: %#v", e.SeriesIDs())
}
if e, ok := blk.Elem([]byte("bar")); !ok {
t.Fatal("expected element")
} else if e.TagBlockOffset() != 200 || e.TagBlockSize() != 20 {
t.Fatalf("unexpected offset/size: %v/%v", e.TagBlockOffset(), e.TagBlockSize())
} else if !reflect.DeepEqual(e.SeriesIDs(), []uint32{2}) {
t.Fatalf("unexpected series data: %#v", e.SeriesIDs())
}
if e, ok := blk.Elem([]byte("baz")); !ok {
t.Fatal("expected element")
} else if e.TagBlockOffset() != 300 || e.TagBlockSize() != 30 {
t.Fatalf("unexpected offset/size: %v/%v", e.TagBlockOffset(), e.TagBlockSize())
} else if !reflect.DeepEqual(e.SeriesIDs(), []uint32{5, 6}) {
t.Fatalf("unexpected series data: %#v", e.SeriesIDs())
}
// Verify non-existent measurement doesn't exist.
if _, ok := blk.Elem([]byte("BAD_MEASUREMENT")); ok {
t.Fatal("expected no element")
}
}
type Measurements []Measurement
type Measurement struct {
Name []byte
Deleted bool
Offset int64
Size int64
ids []uint32
}
func NewMeasurement(name []byte, deleted bool, offset, size int64, ids []uint32) Measurement {
return Measurement{
Name: name,
Deleted: deleted,
Offset: offset,
Size: size,
ids: ids,
}
}

View File

@@ -0,0 +1,989 @@
package tsi1
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"os"
"sort"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/pkg/bloom"
"github.com/influxdata/influxdb/pkg/estimator"
"github.com/influxdata/influxdb/pkg/estimator/hll"
"github.com/influxdata/influxdb/pkg/mmap"
"github.com/influxdata/influxdb/pkg/rhh"
)
// ErrSeriesOverflow is returned when too many series are added to a series writer.
var ErrSeriesOverflow = errors.New("series overflow")
// Series list field size constants.
const (
// Series list trailer field sizes.
SeriesBlockTrailerSize = 0 +
4 + 4 + // series data offset/size
4 + 4 + 4 + // series index offset/size/capacity
8 + 4 + 4 + // bloom filter false positive rate, offset/size
4 + 4 + // series sketch offset/size
4 + 4 + // tombstone series sketch offset/size
4 + 4 + // series count and tombstone count
0
// Other field sizes
SeriesCountSize = 4
SeriesIDSize = 4
)
// Series flag constants.
const (
// Marks the series as having been deleted.
SeriesTombstoneFlag = 0x01
// Marks the following bytes as a hash index.
// These bytes should be skipped by an iterator.
SeriesHashIndexFlag = 0x02
)
// MaxSeriesBlockHashSize is the maximum number of series in a single hash index partition.
const MaxSeriesBlockHashSize = (65536 * LoadFactor) / 100
// SeriesBlock represents the section of the index that holds series data.
type SeriesBlock struct {
data []byte
// Series data & index/capacity.
seriesData []byte
seriesIndexes []seriesBlockIndex
// Exact series counts for this block.
seriesN int32
tombstoneN int32
// Bloom filter used for fast series existence check.
filter *bloom.Filter
// Series block sketch and tombstone sketch for cardinality estimation.
// While we have exact counts for the block, these sketches allow us to
// estimate cardinality across multiple blocks (which might contain
// duplicate series).
sketch, tsketch estimator.Sketch
}
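// exampleMergedCardinality is an illustrative sketch of why each block
// carries sketches: assuming estimator.Sketch exposes Merge and Count (as
// the HLL+ implementation used in this package does), sketches from
// multiple blocks can be merged to estimate total cardinality even when
// the blocks contain duplicate series.
func exampleMergedCardinality(a, b estimator.Sketch) (uint64, error) {
if err := a.Merge(b); err != nil { // union of the two sketches
return 0, err
}
return a.Count(), nil // estimated distinct series across both blocks
}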
// HasSeries returns flags indicating if the series exists and if it is tombstoned.
func (blk *SeriesBlock) HasSeries(name []byte, tags models.Tags, buf []byte) (exists, tombstoned bool) {
offset, tombstoned := blk.Offset(name, tags, buf)
return offset != 0, tombstoned
}
// Series returns a series element.
func (blk *SeriesBlock) Series(name []byte, tags models.Tags) SeriesElem {
offset, _ := blk.Offset(name, tags, nil)
if offset == 0 {
return nil
}
var e SeriesBlockElem
e.UnmarshalBinary(blk.data[offset:])
return &e
}
// Offset returns the byte offset of the series within the block.
func (blk *SeriesBlock) Offset(name []byte, tags models.Tags, buf []byte) (offset uint32, tombstoned bool) {
// Exit if no series indexes exist.
if len(blk.seriesIndexes) == 0 {
return 0, false
}
// Compute series key.
buf = AppendSeriesKey(buf[:0], name, tags)
bufN := uint32(len(buf))
// Quickly check the bloom filter.
// If the key doesn't exist then we know for sure that it doesn't exist.
// If it does exist then we need to do a hash index check to verify. False
// positives are possible with a bloom filter.
if !blk.filter.Contains(buf) {
return 0, false
}
// Find the correct partition.
// Use previous index unless an exact match on the min value.
i := sort.Search(len(blk.seriesIndexes), func(i int) bool {
return CompareSeriesKeys(blk.seriesIndexes[i].min, buf) != -1
})
if i >= len(blk.seriesIndexes) || !bytes.Equal(blk.seriesIndexes[i].min, buf) {
i--
}
seriesIndex := blk.seriesIndexes[i]
// Search within partition.
n := int64(seriesIndex.capacity)
hash := rhh.HashKey(buf)
pos := hash % n
// Track current distance
var d int64
for {
// Find offset of series.
offset := binary.BigEndian.Uint32(seriesIndex.data[pos*SeriesIDSize:])
if offset == 0 {
return 0, false
}
// Evaluate encoded value matches expected.
key := ReadSeriesKey(blk.data[offset+1 : offset+1+bufN])
if bytes.Equal(buf, key) {
return offset, (blk.data[offset] & SeriesTombstoneFlag) != 0
}
// Check if we've exceeded the probe distance.
max := rhh.Dist(rhh.HashKey(key), pos, n)
if d > max {
return 0, false
}
// Move position forward.
pos = (pos + 1) % n
d++
if d > n {
return 0, false
}
}
}
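// exampleProbeTerminated is an illustrative sketch of the Robin Hood
// termination rule used in Offset above: the hash map maintains the
// invariant that a resident entry is never closer to its home slot than a
// key still probing past it, so once our probe distance d exceeds the
// resident key's own distance, the sought key cannot appear further along.
func exampleProbeTerminated(residentKey []byte, pos, capacity, d int64) bool {
return d > rhh.Dist(rhh.HashKey(residentKey), pos, capacity)
}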
// SeriesCount returns the number of series.
func (blk *SeriesBlock) SeriesCount() uint32 {
return uint32(blk.seriesN + blk.tombstoneN)
}
// SeriesIterator returns an iterator over all the series.
func (blk *SeriesBlock) SeriesIterator() SeriesIterator {
return &seriesBlockIterator{
n: blk.SeriesCount(),
offset: 1,
sblk: blk,
}
}
// UnmarshalBinary unpacks data into the series list.
//
// If data is an mmap then it should stay open until the series list is no
// longer used because data access is performed directly from the byte slice.
func (blk *SeriesBlock) UnmarshalBinary(data []byte) error {
t := ReadSeriesBlockTrailer(data)
// Save entire block.
blk.data = data
// Slice series data.
blk.seriesData = data[t.Series.Data.Offset:]
blk.seriesData = blk.seriesData[:t.Series.Data.Size]
// Read in all index partitions.
buf := data[t.Series.Index.Offset:]
buf = buf[:t.Series.Index.Size]
blk.seriesIndexes = make([]seriesBlockIndex, t.Series.Index.N)
for i := range blk.seriesIndexes {
idx := &blk.seriesIndexes[i]
// Read data block.
var offset, size uint32
offset, buf = binary.BigEndian.Uint32(buf[:4]), buf[4:]
size, buf = binary.BigEndian.Uint32(buf[:4]), buf[4:]
idx.data = blk.data[offset : offset+size]
// Read block capacity.
idx.capacity, buf = int32(binary.BigEndian.Uint32(buf[:4])), buf[4:]
// Read min key.
var n uint32
n, buf = binary.BigEndian.Uint32(buf[:4]), buf[4:]
idx.min, buf = buf[:n], buf[n:]
}
if len(buf) != 0 {
return fmt.Errorf("data remaining in index list buffer: %d", len(buf))
}
// Initialize bloom filter.
filter, err := bloom.NewFilterBuffer(data[t.Bloom.Offset:][:t.Bloom.Size], t.Bloom.K)
if err != nil {
return err
}
blk.filter = filter
// Initialise sketches. We're currently using HLL+.
var s, ts = hll.NewDefaultPlus(), hll.NewDefaultPlus()
if err := s.UnmarshalBinary(data[t.Sketch.Offset:][:t.Sketch.Size]); err != nil {
return err
}
blk.sketch = s
if err := ts.UnmarshalBinary(data[t.TSketch.Offset:][:t.TSketch.Size]); err != nil {
return err
}
blk.tsketch = ts
// Set the series and tombstone counts
blk.seriesN, blk.tombstoneN = t.SeriesN, t.TombstoneN
return nil
}
// seriesBlockIndex represents a partitioned series block index.
type seriesBlockIndex struct {
data []byte
min []byte
capacity int32
}
// seriesBlockIterator is an iterator over the series ids in a series list.
type seriesBlockIterator struct {
i, n uint32
offset uint32
sblk *SeriesBlock
e SeriesBlockElem // buffer
}
// Next returns the next series element.
func (itr *seriesBlockIterator) Next() SeriesElem {
for {
// Exit if at the end.
if itr.i == itr.n {
return nil
}
// If the current element is a hash index partition then skip it.
if flag := itr.sblk.data[itr.offset]; flag&SeriesHashIndexFlag != 0 {
// Skip flag
itr.offset++
// Read index capacity.
n := binary.BigEndian.Uint32(itr.sblk.data[itr.offset:])
itr.offset += 4
// Skip over index.
itr.offset += n * SeriesIDSize
continue
}
// Read next element.
itr.e.UnmarshalBinary(itr.sblk.data[itr.offset:])
// Move iterator and offset forward.
itr.i++
itr.offset += uint32(itr.e.size)
return &itr.e
}
}
// seriesDecodeIterator decodes a series id iterator into unmarshaled elements.
type seriesDecodeIterator struct {
itr seriesIDIterator
sblk *SeriesBlock
e SeriesBlockElem // buffer
}
// newSeriesDecodeIterator returns a new instance of seriesDecodeIterator.
func newSeriesDecodeIterator(sblk *SeriesBlock, itr seriesIDIterator) *seriesDecodeIterator {
return &seriesDecodeIterator{sblk: sblk, itr: itr}
}
// Next returns the next series element.
func (itr *seriesDecodeIterator) Next() SeriesElem {
// Read next series id.
id := itr.itr.next()
if id == 0 {
return nil
}
// Read next element.
itr.e.UnmarshalBinary(itr.sblk.data[id:])
return &itr.e
}
// SeriesBlockElem represents a series element in the series list.
type SeriesBlockElem struct {
flag byte
name []byte
tags models.Tags
size int
}
// Deleted returns true if the tombstone flag is set.
func (e *SeriesBlockElem) Deleted() bool { return (e.flag & SeriesTombstoneFlag) != 0 }
// Name returns the measurement name.
func (e *SeriesBlockElem) Name() []byte { return e.name }
// Tags returns the tag set.
func (e *SeriesBlockElem) Tags() models.Tags { return e.tags }
// Expr always returns a nil expression.
// This is only used by higher level query planning.
func (e *SeriesBlockElem) Expr() influxql.Expr { return nil }
// UnmarshalBinary unmarshals data into e.
func (e *SeriesBlockElem) UnmarshalBinary(data []byte) error {
start := len(data)
// Parse flag data.
e.flag, data = data[0], data[1:]
// Parse total size.
_, szN := binary.Uvarint(data)
data = data[szN:]
// Parse name.
n, data := binary.BigEndian.Uint16(data[:2]), data[2:]
e.name, data = data[:n], data[n:]
// Parse tags.
e.tags = e.tags[:0]
tagN, szN := binary.Uvarint(data)
data = data[szN:]
for i := uint64(0); i < tagN; i++ {
var tag models.Tag
n, data = binary.BigEndian.Uint16(data[:2]), data[2:]
tag.Key, data = data[:n], data[n:]
n, data = binary.BigEndian.Uint16(data[:2]), data[2:]
tag.Value, data = data[:n], data[n:]
e.tags = append(e.tags, tag)
}
// Save length of elem.
e.size = start - len(data)
return nil
}
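// For reference, the element parsed above is laid out on disk as:
//
// flag (1 byte) | key size (uvarint) | name len (uint16) | name |
// tag count (uvarint) | per tag: key len (uint16), key, value len (uint16), value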
// AppendSeriesElem serializes flag/name/tags to dst and returns the new buffer.
func AppendSeriesElem(dst []byte, flag byte, name []byte, tags models.Tags) []byte {
dst = append(dst, flag)
return AppendSeriesKey(dst, name, tags)
}
// AppendSeriesKey serializes name and tags to a byte slice.
// The total length is prepended as a uvarint.
func AppendSeriesKey(dst []byte, name []byte, tags models.Tags) []byte {
buf := make([]byte, binary.MaxVarintLen32)
origLen := len(dst)
// The tag count is variable encoded, so we need to know ahead of time what
// the size of the tag count value will be.
tcBuf := make([]byte, binary.MaxVarintLen32)
tcSz := binary.PutUvarint(tcBuf, uint64(len(tags)))
// Size of name/tags. Does not include total length.
size := 0 + //
2 + // size of measurement
len(name) + // measurement
tcSz + // size of number of tags
(4 * len(tags)) + // 2-byte length prefixes for each tag key and value
tags.Size() // size of tag keys/values
// Variable encode length.
totalSz := binary.PutUvarint(buf, uint64(size))
// If caller doesn't provide a buffer then pre-allocate an exact one.
if dst == nil {
dst = make([]byte, 0, size+totalSz)
}
// Append total length.
dst = append(dst, buf[:totalSz]...)
// Append name.
binary.BigEndian.PutUint16(buf, uint16(len(name)))
dst = append(dst, buf[:2]...)
dst = append(dst, name...)
// Append tag count.
dst = append(dst, tcBuf[:tcSz]...)
// Append tags.
for _, tag := range tags {
binary.BigEndian.PutUint16(buf, uint16(len(tag.Key)))
dst = append(dst, buf[:2]...)
dst = append(dst, tag.Key...)
binary.BigEndian.PutUint16(buf, uint16(len(tag.Value)))
dst = append(dst, buf[:2]...)
dst = append(dst, tag.Value...)
}
// Verify that the total length equals the encoded byte count.
if got, exp := len(dst)-origLen, size+totalSz; got != exp {
panic(fmt.Sprintf("series key encoding does not match calculated total length: actual=%d, exp=%d, key=%x", got, exp, dst))
}
return dst
}
// ReadSeriesKey returns the series key from the beginning of the buffer.
func ReadSeriesKey(data []byte) []byte {
sz, n := binary.Uvarint(data)
return data[:int(sz)+n]
}
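// exampleSeriesKeyRoundTrip is an illustrative sketch: because the total
// key length is prepended as a uvarint, a key built by AppendSeriesKey can
// be sliced back out of a larger buffer with ReadSeriesKey.
func exampleSeriesKeyRoundTrip(name []byte, tags models.Tags) bool {
key := AppendSeriesKey(nil, name, tags)
buf := append(key, 0xFF, 0xFF) // arbitrary trailing bytes
return bytes.Equal(ReadSeriesKey(buf), key)
}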
func CompareSeriesKeys(a, b []byte) int {
// Handle 'nil' keys.
if len(a) == 0 && len(b) == 0 {
return 0
} else if len(a) == 0 {
return -1
} else if len(b) == 0 {
return 1
}
// Read total size.
_, i := binary.Uvarint(a)
a = a[i:]
_, i = binary.Uvarint(b)
b = b[i:]
// Read names.
var n uint16
n, a = binary.BigEndian.Uint16(a), a[2:]
name0, a := a[:n], a[n:]
n, b = binary.BigEndian.Uint16(b), b[2:]
name1, b := b[:n], b[n:]
// Compare names, return if not equal.
if cmp := bytes.Compare(name0, name1); cmp != 0 {
return cmp
}
// Read tag counts.
tagN0, i := binary.Uvarint(a)
a = a[i:]
tagN1, i := binary.Uvarint(b)
b = b[i:]
// Compare each tag in order.
for i := uint64(0); ; i++ {
// Check for EOF.
if i == tagN0 && i == tagN1 {
return 0
} else if i == tagN0 {
return -1
} else if i == tagN1 {
return 1
}
// Read keys.
var key0, key1 []byte
n, a = binary.BigEndian.Uint16(a), a[2:]
key0, a = a[:n], a[n:]
n, b = binary.BigEndian.Uint16(b), b[2:]
key1, b = b[:n], b[n:]
// Compare keys.
if cmp := bytes.Compare(key0, key1); cmp != 0 {
return cmp
}
// Read values.
var value0, value1 []byte
n, a = binary.BigEndian.Uint16(a), a[2:]
value0, a = a[:n], a[n:]
n, b = binary.BigEndian.Uint16(b), b[2:]
value1, b = b[:n], b[n:]
// Compare values.
if cmp := bytes.Compare(value0, value1); cmp != 0 {
return cmp
}
}
}
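// exampleCompareSeriesKeys is an illustrative sketch: keys produced by
// AppendSeriesKey order first by measurement name and then tag-by-tag, so
// two "cpu" series differing only in a tag value compare on that value.
func exampleCompareSeriesKeys() int {
a := AppendSeriesKey(nil, []byte("cpu"), models.NewTags(map[string]string{"region": "east"}))
b := AppendSeriesKey(nil, []byte("cpu"), models.NewTags(map[string]string{"region": "west"}))
return CompareSeriesKeys(a, b) // -1, since "east" sorts before "west"
}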
type seriesKeys [][]byte
func (a seriesKeys) Len() int { return len(a) }
func (a seriesKeys) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a seriesKeys) Less(i, j int) bool {
return CompareSeriesKeys(a[i], a[j]) == -1
}
// SeriesBlockEncoder encodes series to a SeriesBlock in an underlying writer.
type SeriesBlockEncoder struct {
w io.Writer
// Double buffer for writing series.
// First elem is current buffer, second is previous buffer.
buf [2][]byte
// Track bytes written, sections, & offsets.
n int64
trailer SeriesBlockTrailer
offsets *rhh.HashMap
indexMin []byte
indexes []seriesBlockIndexEncodeInfo
// Bloom filter to check for series existence.
filter *bloom.Filter
// Series sketch and tombstoned series sketch. These must be
// set before calling WriteTo.
sketch, tSketch estimator.Sketch
}
// NewSeriesBlockEncoder returns a new instance of SeriesBlockEncoder.
func NewSeriesBlockEncoder(w io.Writer, n uint32, m, k uint64) *SeriesBlockEncoder {
return &SeriesBlockEncoder{
w: w,
offsets: rhh.NewHashMap(rhh.Options{
Capacity: MaxSeriesBlockHashSize,
LoadFactor: LoadFactor,
}),
filter: bloom.NewFilter(m, k),
sketch: hll.NewDefaultPlus(),
tSketch: hll.NewDefaultPlus(),
}
}
// N returns the number of bytes written.
func (enc *SeriesBlockEncoder) N() int64 { return enc.n }
// Encode writes a series to the underlying writer.
// The series must be lexicographically sorted after the previously encoded series.
func (enc *SeriesBlockEncoder) Encode(name []byte, tags models.Tags, deleted bool) error {
// An initial empty byte must be written.
if err := enc.ensureHeaderWritten(); err != nil {
return err
}
// Generate the series element.
buf := AppendSeriesElem(enc.buf[0][:0], encodeSerieFlag(deleted), name, tags)
// Verify series is after previous series.
if enc.buf[1] != nil {
// Skip the first byte since it is the flag. Remaining bytes are key.
key0, key1 := buf[1:], enc.buf[1][1:]
if cmp := CompareSeriesKeys(key0, key1); cmp == -1 {
return fmt.Errorf("series out of order: prev=%q, new=%q", enc.buf[1], buf)
} else if cmp == 0 {
return fmt.Errorf("series already encoded: %s", buf)
}
}
// Flush a hash index, if necessary.
if err := enc.checkFlushIndex(buf[1:]); err != nil {
return err
}
// Swap double buffer.
enc.buf[0], enc.buf[1] = enc.buf[1], buf
// Write encoded series to writer.
offset := enc.n
if err := writeTo(enc.w, buf, &enc.n); err != nil {
return err
}
// Save offset to generate index later.
// Key is copied by the RHH map.
enc.offsets.Put(buf[1:], uint32(offset))
// Update bloom filter.
enc.filter.Insert(buf[1:])
// Update sketches & trailer.
if deleted {
enc.trailer.TombstoneN++
enc.tSketch.Add(buf)
} else {
enc.trailer.SeriesN++
enc.sketch.Add(buf)
}
return nil
}
// Close writes the index and trailer.
// This should be called at the end once all series have been encoded.
func (enc *SeriesBlockEncoder) Close() error {
if err := enc.ensureHeaderWritten(); err != nil {
return err
}
// Flush outstanding hash index.
if err := enc.flushIndex(); err != nil {
return err
}
// Write dictionary-encoded series list.
enc.trailer.Series.Data.Offset = 1
enc.trailer.Series.Data.Size = int32(enc.n) - enc.trailer.Series.Data.Offset
// Write dictionary-encoded series hash index.
enc.trailer.Series.Index.Offset = int32(enc.n)
if err := enc.writeIndexEntries(); err != nil {
return err
}
enc.trailer.Series.Index.Size = int32(enc.n) - enc.trailer.Series.Index.Offset
// Flush bloom filter.
enc.trailer.Bloom.K = enc.filter.K()
enc.trailer.Bloom.Offset = int32(enc.n)
if err := writeTo(enc.w, enc.filter.Bytes(), &enc.n); err != nil {
return err
}
enc.trailer.Bloom.Size = int32(enc.n) - enc.trailer.Bloom.Offset
// Write the sketches out.
enc.trailer.Sketch.Offset = int32(enc.n)
if err := writeSketchTo(enc.w, enc.sketch, &enc.n); err != nil {
return err
}
enc.trailer.Sketch.Size = int32(enc.n) - enc.trailer.Sketch.Offset
enc.trailer.TSketch.Offset = int32(enc.n)
if err := writeSketchTo(enc.w, enc.tSketch, &enc.n); err != nil {
return err
}
enc.trailer.TSketch.Size = int32(enc.n) - enc.trailer.TSketch.Offset
// Write trailer.
nn, err := enc.trailer.WriteTo(enc.w)
enc.n += nn
if err != nil {
return err
}
return nil
}
// writeIndexEntries writes a list of series hash index entries.
func (enc *SeriesBlockEncoder) writeIndexEntries() error {
enc.trailer.Series.Index.N = int32(len(enc.indexes))
for _, idx := range enc.indexes {
// Write offset/size.
if err := writeUint32To(enc.w, uint32(idx.offset), &enc.n); err != nil {
return err
} else if err := writeUint32To(enc.w, uint32(idx.size), &enc.n); err != nil {
return err
}
// Write capacity.
if err := writeUint32To(enc.w, uint32(idx.capacity), &enc.n); err != nil {
return err
}
// Write min key.
if err := writeUint32To(enc.w, uint32(len(idx.min)), &enc.n); err != nil {
return err
} else if err := writeTo(enc.w, idx.min, &enc.n); err != nil {
return err
}
}
return nil
}
// ensureHeaderWritten writes a single empty byte at the front of the file
// so that series offsets will always be non-zero.
func (enc *SeriesBlockEncoder) ensureHeaderWritten() error {
if enc.n > 0 {
return nil
}
if _, err := enc.w.Write([]byte{0}); err != nil {
return err
}
enc.n++
return nil
}
// checkFlushIndex flushes a hash index segment if the index is too large.
// The min argument specifies the lowest series key in the next index, if one is created.
func (enc *SeriesBlockEncoder) checkFlushIndex(min []byte) error {
// Ignore if there is still room in the index.
if enc.offsets.Len() < MaxSeriesBlockHashSize {
return nil
}
// Flush index values.
if err := enc.flushIndex(); err != nil {
return err
}
// Reset index and save minimum series key.
enc.offsets.Reset()
enc.indexMin = make([]byte, len(min))
copy(enc.indexMin, min)
return nil
}
// flushIndex flushes the hash index segment.
func (enc *SeriesBlockEncoder) flushIndex() error {
if enc.offsets.Len() == 0 {
return nil
}
// Write index segment flag.
if err := writeUint8To(enc.w, SeriesHashIndexFlag, &enc.n); err != nil {
return err
}
// Write index capacity.
// This is used for skipping over when iterating sequentially.
if err := writeUint32To(enc.w, uint32(enc.offsets.Cap()), &enc.n); err != nil {
return err
}
// Determine size.
var sz int64 = enc.offsets.Cap() * 4
// Save current position to ensure size is correct by the end.
offset := enc.n
// Encode hash map offset entries.
for i := int64(0); i < enc.offsets.Cap(); i++ {
_, v := enc.offsets.Elem(i)
seriesOffset, _ := v.(uint32)
if err := writeUint32To(enc.w, uint32(seriesOffset), &enc.n); err != nil {
return err
}
}
// Determine total size.
size := enc.n - offset
// Verify actual size equals calculated size.
if size != sz {
return fmt.Errorf("series hash index size mismatch: %d <> %d", size, sz)
}
// Add to index entries.
enc.indexes = append(enc.indexes, seriesBlockIndexEncodeInfo{
offset: uint32(offset),
size: uint32(size),
capacity: uint32(enc.offsets.Cap()),
min: enc.indexMin,
})
// Clear next min.
enc.indexMin = nil
return nil
}
// seriesBlockIndexEncodeInfo stores offset information for seriesBlockIndex structures.
type seriesBlockIndexEncodeInfo struct {
offset uint32
size uint32
capacity uint32
min []byte
}
// ReadSeriesBlockTrailer returns the series list trailer from data.
func ReadSeriesBlockTrailer(data []byte) SeriesBlockTrailer {
var t SeriesBlockTrailer
// Slice trailer data.
buf := data[len(data)-SeriesBlockTrailerSize:]
// Read series data info.
t.Series.Data.Offset, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
t.Series.Data.Size, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
// Read series hash index info.
t.Series.Index.Offset, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
t.Series.Index.Size, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
t.Series.Index.N, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
// Read bloom filter info.
t.Bloom.K, buf = binary.BigEndian.Uint64(buf[0:8]), buf[8:]
t.Bloom.Offset, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
t.Bloom.Size, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
// Read series sketch info.
t.Sketch.Offset, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
t.Sketch.Size, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
// Read tombstone series sketch info.
t.TSketch.Offset, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
t.TSketch.Size, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
// Read series & tombstone count.
t.SeriesN, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
t.TombstoneN, buf = int32(binary.BigEndian.Uint32(buf[0:4])), buf[4:]
return t
}
// SeriesBlockTrailer represents meta data written to the end of the series list.
type SeriesBlockTrailer struct {
Series struct {
Data struct {
Offset int32
Size int32
}
Index struct {
Offset int32
Size int32
N int32
}
}
// Bloom filter info.
Bloom struct {
K uint64
Offset int32
Size int32
}
// Offset and size of cardinality sketch for measurements.
Sketch struct {
Offset int32
Size int32
}
// Offset and size of cardinality sketch for tombstoned measurements.
TSketch struct {
Offset int32
Size int32
}
SeriesN int32
TombstoneN int32
}
func (t SeriesBlockTrailer) WriteTo(w io.Writer) (n int64, err error) {
if err := writeUint32To(w, uint32(t.Series.Data.Offset), &n); err != nil {
return n, err
} else if err := writeUint32To(w, uint32(t.Series.Data.Size), &n); err != nil {
return n, err
}
if err := writeUint32To(w, uint32(t.Series.Index.Offset), &n); err != nil {
return n, err
} else if err := writeUint32To(w, uint32(t.Series.Index.Size), &n); err != nil {
return n, err
} else if err := writeUint32To(w, uint32(t.Series.Index.N), &n); err != nil {
return n, err
}
// Write bloom filter info.
if err := writeUint64To(w, t.Bloom.K, &n); err != nil {
return n, err
} else if err := writeUint32To(w, uint32(t.Bloom.Offset), &n); err != nil {
return n, err
} else if err := writeUint32To(w, uint32(t.Bloom.Size), &n); err != nil {
return n, err
}
// Write measurement sketch info.
if err := writeUint32To(w, uint32(t.Sketch.Offset), &n); err != nil {
return n, err
} else if err := writeUint32To(w, uint32(t.Sketch.Size), &n); err != nil {
return n, err
}
// Write tombstone measurement sketch info.
if err := writeUint32To(w, uint32(t.TSketch.Offset), &n); err != nil {
return n, err
} else if err := writeUint32To(w, uint32(t.TSketch.Size), &n); err != nil {
return n, err
}
// Write series and tombstone count.
if err := writeUint32To(w, uint32(t.SeriesN), &n); err != nil {
return n, err
} else if err := writeUint32To(w, uint32(t.TombstoneN), &n); err != nil {
return n, err
}
return n, nil
}
type serie struct {
name []byte
tags models.Tags
deleted bool
offset uint32
}
func (s *serie) flag() uint8 { return encodeSerieFlag(s.deleted) }
func encodeSerieFlag(deleted bool) byte {
var flag byte
if deleted {
flag |= SeriesTombstoneFlag
}
return flag
}
type series []serie
func (a series) Len() int { return len(a) }
func (a series) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a series) Less(i, j int) bool {
if cmp := bytes.Compare(a[i].name, a[j].name); cmp != 0 {
return cmp == -1
}
return models.CompareTags(a[i].tags, a[j].tags) == -1
}
// mapIndexFileSeriesBlock maps a writer to a series block.
// Returns the series block and the mmap byte slice (if mmap is used).
// The memory-mapped slice MUST be unmapped by the caller.
func mapIndexFileSeriesBlock(w io.Writer) (*SeriesBlock, []byte, error) {
switch w := w.(type) {
case *bytes.Buffer:
return mapIndexFileSeriesBlockBuffer(w)
case *os.File:
return mapIndexFileSeriesBlockFile(w)
default:
return nil, nil, fmt.Errorf("invalid tsi1 writer type: %T", w)
}
}
// mapIndexFileSeriesBlockBuffer maps a buffer to a series block.
func mapIndexFileSeriesBlockBuffer(buf *bytes.Buffer) (*SeriesBlock, []byte, error) {
data := buf.Bytes()
data = data[len(FileSignature):] // Skip file signature.
var sblk SeriesBlock
if err := sblk.UnmarshalBinary(data); err != nil {
return nil, nil, err
}
return &sblk, nil, nil
}
// mapIndexFileSeriesBlockFile memory-maps a file to a series block.
func mapIndexFileSeriesBlockFile(f *os.File) (*SeriesBlock, []byte, error) {
// Open a read-only memory map of the existing data.
data, err := mmap.Map(f.Name())
if err != nil {
return nil, nil, err
}
sblkData := data[len(FileSignature):] // Skip file signature.
// Unmarshal block on top of mmap.
var sblk SeriesBlock
if err := sblk.UnmarshalBinary(sblkData); err != nil {
mmap.Unmap(data)
return nil, nil, err
}
return &sblk, data, nil
}

View File

@@ -0,0 +1,94 @@
package tsi1_test
import (
"bytes"
"fmt"
"testing"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/tsdb/index/tsi1"
)
// Ensure series block can be unmarshaled.
func TestSeriesBlock_UnmarshalBinary(t *testing.T) {
if _, err := CreateSeriesBlock([]Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
{Name: []byte("mem"), Tags: models.NewTags(map[string]string{"region": "east"})},
}); err != nil {
t.Fatal(err)
}
}
// Ensure series block contains the correct set of series.
func TestSeriesBlock_Series(t *testing.T) {
series := []Series{
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "east"})},
{Name: []byte("cpu"), Tags: models.NewTags(map[string]string{"region": "west"})},
{Name: []byte("mem"), Tags: models.NewTags(map[string]string{"region": "east"})},
}
l := MustCreateSeriesBlock(series)
// Verify total number of series is correct.
if n := l.SeriesCount(); n != 3 {
t.Fatalf("unexpected series count: %d", n)
}
// Verify all series exist.
for i, s := range series {
if e := l.Series(s.Name, s.Tags); e == nil {
t.Fatalf("series does not exist: i=%d", i)
} else if !bytes.Equal(e.Name(), s.Name) || models.CompareTags(e.Tags(), s.Tags) != 0 {
t.Fatalf("series element does not match: i=%d, %s (%s) != %s (%s)", i, e.Name(), e.Tags().String(), s.Name, s.Tags.String())
} else if e.Deleted() {
t.Fatalf("series deleted: i=%d", i)
}
}
// Verify non-existent series doesn't exist.
if e := l.Series([]byte("foo"), models.NewTags(map[string]string{"region": "north"})); e != nil {
t.Fatalf("series should not exist: %#v", e)
}
}
// CreateSeriesBlock returns an in-memory SeriesBlock with a list of series.
func CreateSeriesBlock(a []Series) (*tsi1.SeriesBlock, error) {
var buf bytes.Buffer
// Create writer and sketches. Add series.
enc := tsi1.NewSeriesBlockEncoder(&buf, uint32(len(a)), M, K)
for i, s := range a {
if err := enc.Encode(s.Name, s.Tags, s.Deleted); err != nil {
return nil, fmt.Errorf("SeriesBlockWriter.Add(): i=%d, err=%s", i, err)
}
}
// Close and flush.
if err := enc.Close(); err != nil {
return nil, fmt.Errorf("SeriesBlockWriter.WriteTo(): %s", err)
}
// Unpack bytes into series block.
var blk tsi1.SeriesBlock
if err := blk.UnmarshalBinary(buf.Bytes()); err != nil {
return nil, fmt.Errorf("SeriesBlock.UnmarshalBinary(): %s", err)
}
return &blk, nil
}
// MustCreateSeriesBlock calls CreateSeriesBlock(). Panic on error.
func MustCreateSeriesBlock(a []Series) *tsi1.SeriesBlock {
l, err := CreateSeriesBlock(a)
if err != nil {
panic(err)
}
return l
}
// Series represents name/tagset pairs that are used in testing.
type Series struct {
Name []byte
Tags models.Tags
Deleted bool
}

View File

@@ -0,0 +1,752 @@
package tsi1
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"github.com/influxdata/influxdb/pkg/rhh"
)
// TagBlockVersion is the version of the tag block.
const TagBlockVersion = 1
// Tag key flag constants.
const (
TagKeyTombstoneFlag = 0x01
)
// Tag value flag constants.
const (
TagValueTombstoneFlag = 0x01
)
// TagBlock variable size constants.
const (
// TagBlock key block fields.
TagKeyNSize = 8
TagKeyOffsetSize = 8
// TagBlock value block fields.
TagValueNSize = 8
TagValueOffsetSize = 8
)
// TagBlock errors.
var (
ErrUnsupportedTagBlockVersion = errors.New("unsupported tag block version")
ErrTagBlockSizeMismatch = errors.New("tag block size mismatch")
)
// TagBlock represents tag key/value block for a single measurement.
type TagBlock struct {
data []byte
valueData []byte
keyData []byte
hashData []byte
version int // tag block version
}
// Version returns the encoding version parsed from the data.
// Only valid after UnmarshalBinary() has been successfully invoked.
func (blk *TagBlock) Version() int { return blk.version }
// UnmarshalBinary unpacks data into the tag block. The data is not copied, so it
// must be retained and left unchanged after being passed into this function.
func (blk *TagBlock) UnmarshalBinary(data []byte) error {
// Read trailer.
t, err := ReadTagBlockTrailer(data)
if err != nil {
return err
}
// Verify data size is correct.
if int64(len(data)) != t.Size {
return ErrTagBlockSizeMismatch
}
// Save data section.
blk.valueData = data[t.ValueData.Offset:]
blk.valueData = blk.valueData[:t.ValueData.Size]
// Save key data section.
blk.keyData = data[t.KeyData.Offset:]
blk.keyData = blk.keyData[:t.KeyData.Size]
// Save hash index block.
blk.hashData = data[t.HashIndex.Offset:]
blk.hashData = blk.hashData[:t.HashIndex.Size]
// Save entire block.
blk.data = data
return nil
}
// TagKeyElem returns an element for a tag key.
// Returns nil if the key is not found.
func (blk *TagBlock) TagKeyElem(key []byte) TagKeyElem {
keyN := int64(binary.BigEndian.Uint64(blk.hashData[:TagKeyNSize]))
hash := rhh.HashKey(key)
pos := hash % keyN
// Track current distance
var d int64
for {
// Find offset of tag key.
offset := binary.BigEndian.Uint64(blk.hashData[TagKeyNSize+(pos*TagKeyOffsetSize):])
if offset == 0 {
return nil
}
// Parse into element.
var e TagBlockKeyElem
e.unmarshal(blk.data[offset:], blk.data)
// Return if keys match.
if bytes.Equal(e.key, key) {
return &e
}
// Check if we've exceeded the probe distance.
if d > rhh.Dist(rhh.HashKey(e.key), pos, keyN) {
return nil
}
// Move position forward.
pos = (pos + 1) % keyN
d++
if d > keyN {
return nil
}
}
}
// TagValueElem returns an element for a tag value.
func (blk *TagBlock) TagValueElem(key, value []byte) TagValueElem {
// Find key element, exit if not found.
kelem, _ := blk.TagKeyElem(key).(*TagBlockKeyElem)
if kelem == nil {
return nil
}
// Slice hash index data.
hashData := kelem.hashIndex.buf
valueN := int64(binary.BigEndian.Uint64(hashData[:TagValueNSize]))
hash := rhh.HashKey(value)
pos := hash % valueN
// Track current distance
var d int64
for {
// Find offset of tag value.
offset := binary.BigEndian.Uint64(hashData[TagValueNSize+(pos*TagValueOffsetSize):])
if offset == 0 {
return nil
}
// Parse into element.
var e TagBlockValueElem
e.unmarshal(blk.data[offset:])
// Return if values match.
if bytes.Equal(e.value, value) {
return &e
}
// Check if we've exceeded the probe distance.
max := rhh.Dist(rhh.HashKey(e.value), pos, valueN)
if d > max {
return nil
}
// Move position forward.
pos = (pos + 1) % valueN
d++
if d > valueN {
return nil
}
}
}
// TagKeyIterator returns an iterator over all the keys in the block.
func (blk *TagBlock) TagKeyIterator() TagKeyIterator {
return &tagBlockKeyIterator{
blk: blk,
keyData: blk.keyData,
}
}
// tagBlockKeyIterator represents an iterator over all keys in a TagBlock.
type tagBlockKeyIterator struct {
blk *TagBlock
keyData []byte
e TagBlockKeyElem
}
// Next returns the next element in the iterator.
func (itr *tagBlockKeyIterator) Next() TagKeyElem {
// Exit when there is no data left.
if len(itr.keyData) == 0 {
return nil
}
// Unmarshal next element & move data forward.
itr.e.unmarshal(itr.keyData, itr.blk.data)
itr.keyData = itr.keyData[itr.e.size:]
assert(len(itr.e.Key()) > 0, "invalid zero-length tag key")
return &itr.e
}
// tagBlockValueIterator represents an iterator over all values for a tag key.
type tagBlockValueIterator struct {
data []byte
e TagBlockValueElem
}
// Next returns the next element in the iterator.
func (itr *tagBlockValueIterator) Next() TagValueElem {
// Exit when there is no data left.
if len(itr.data) == 0 {
return nil
}
// Unmarshal next element & move data forward.
itr.e.unmarshal(itr.data)
itr.data = itr.data[itr.e.size:]
assert(len(itr.e.Value()) > 0, "invalid zero-length tag value")
return &itr.e
}
// TagBlockKeyElem represents a tag key element in a TagBlock.
type TagBlockKeyElem struct {
flag byte
key []byte
// Value data
data struct {
offset uint64
size uint64
buf []byte
}
// Value hash index data
hashIndex struct {
offset uint64
size uint64
buf []byte
}
size int
// Reusable iterator.
itr tagBlockValueIterator
}
// Deleted returns true if the key has been tombstoned.
func (e *TagBlockKeyElem) Deleted() bool { return (e.flag & TagKeyTombstoneFlag) != 0 }
// Key returns the key name of the element.
func (e *TagBlockKeyElem) Key() []byte { return e.key }
// TagValueIterator returns an iterator over the key's values.
func (e *TagBlockKeyElem) TagValueIterator() TagValueIterator {
return &tagBlockValueIterator{data: e.data.buf}
}
// unmarshal unmarshals buf into e.
// The data argument represents the entire block data.
func (e *TagBlockKeyElem) unmarshal(buf, data []byte) {
start := len(buf)
// Parse flag data.
e.flag, buf = buf[0], buf[1:]
// Parse data offset/size.
e.data.offset, buf = binary.BigEndian.Uint64(buf), buf[8:]
e.data.size, buf = binary.BigEndian.Uint64(buf), buf[8:]
// Slice data.
e.data.buf = data[e.data.offset:]
e.data.buf = e.data.buf[:e.data.size]
// Parse hash index offset/size.
e.hashIndex.offset, buf = binary.BigEndian.Uint64(buf), buf[8:]
e.hashIndex.size, buf = binary.BigEndian.Uint64(buf), buf[8:]
// Slice hash index data.
e.hashIndex.buf = data[e.hashIndex.offset:]
e.hashIndex.buf = e.hashIndex.buf[:e.hashIndex.size]
// Parse key.
n, sz := binary.Uvarint(buf)
e.key, buf = buf[sz:sz+int(n)], buf[int(n)+sz:]
// Save length of elem.
e.size = start - len(buf)
}
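// For reference, the key element parsed above is laid out on disk as:
//
// flag (1 byte) | value data offset/size (8+8 bytes) |
// hash index offset/size (8+8 bytes) | key len (uvarint) | key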
// TagBlockValueElem represents a tag value element.
type TagBlockValueElem struct {
flag byte
value []byte
series struct {
n uint32 // Series count
data []byte // Raw series data
}
size int
}
// Deleted returns true if the element has been tombstoned.
func (e *TagBlockValueElem) Deleted() bool { return (e.flag & TagValueTombstoneFlag) != 0 }
// Value returns the value for the element.
func (e *TagBlockValueElem) Value() []byte { return e.value }
// SeriesN returns the series count.
func (e *TagBlockValueElem) SeriesN() uint32 { return e.series.n }
// SeriesData returns the raw series data.
func (e *TagBlockValueElem) SeriesData() []byte { return e.series.data }
// SeriesID returns series ID at an index.
func (e *TagBlockValueElem) SeriesID(i int) uint32 {
return binary.BigEndian.Uint32(e.series.data[i*SeriesIDSize:])
}
// SeriesIDs returns a list of decoded series ids.
func (e *TagBlockValueElem) SeriesIDs() []uint32 {
a := make([]uint32, 0, e.series.n)
var prev uint32
for data := e.series.data; len(data) > 0; {
delta, n := binary.Uvarint(data)
data = data[n:]
seriesID := prev + uint32(delta)
a = append(a, seriesID)
prev = seriesID
}
return a
}
// Size returns the size of the element.
func (e *TagBlockValueElem) Size() int { return e.size }
// unmarshal unmarshals buf into e.
func (e *TagBlockValueElem) unmarshal(buf []byte) {
start := len(buf)
// Parse flag data.
e.flag, buf = buf[0], buf[1:]
// Parse value.
sz, n := binary.Uvarint(buf)
e.value, buf = buf[n:n+int(sz)], buf[n+int(sz):]
// Parse series count.
v, n := binary.Uvarint(buf)
e.series.n = uint32(v)
buf = buf[n:]
// Parse data block size.
sz, n = binary.Uvarint(buf)
buf = buf[n:]
// Save reference to series data.
e.series.data = buf[:sz]
buf = buf[sz:]
// Save length of elem.
e.size = start - len(buf)
}
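// For reference, the value element parsed above is laid out on disk as:
//
// flag (1 byte) | value len (uvarint) | value |
// series count (uvarint) | series data size (uvarint) | series data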
// TagBlockTrailerSize is the total size of the on-disk trailer.
const TagBlockTrailerSize = 0 +
8 + 8 + // value data offset/size
8 + 8 + // key data offset/size
8 + 8 + // hash index offset/size
8 + // size
2 // version
// TagBlockTrailer represents meta data at the end of a TagBlock.
type TagBlockTrailer struct {
Version int // Encoding version
Size int64 // Total size w/ trailer
// Offset & size of value data section.
ValueData struct {
Offset int64
Size int64
}
// Offset & size of key data section.
KeyData struct {
Offset int64
Size int64
}
// Offset & size of hash map section.
HashIndex struct {
Offset int64
Size int64
}
}
// WriteTo writes the trailer to w.
func (t *TagBlockTrailer) WriteTo(w io.Writer) (n int64, err error) {
// Write data info.
if err := writeUint64To(w, uint64(t.ValueData.Offset), &n); err != nil {
return n, err
} else if err := writeUint64To(w, uint64(t.ValueData.Size), &n); err != nil {
return n, err
}
// Write key data info.
if err := writeUint64To(w, uint64(t.KeyData.Offset), &n); err != nil {
return n, err
} else if err := writeUint64To(w, uint64(t.KeyData.Size), &n); err != nil {
return n, err
}
// Write hash index info.
if err := writeUint64To(w, uint64(t.HashIndex.Offset), &n); err != nil {
return n, err
} else if err := writeUint64To(w, uint64(t.HashIndex.Size), &n); err != nil {
return n, err
}
// Write total size & encoding version.
if err := writeUint64To(w, uint64(t.Size), &n); err != nil {
return n, err
} else if err := writeUint16To(w, TagBlockVersion, &n); err != nil {
return n, err
}
return n, nil
}
// ReadTagBlockTrailer returns the tag block trailer from data.
func ReadTagBlockTrailer(data []byte) (TagBlockTrailer, error) {
var t TagBlockTrailer
// Read version.
t.Version = int(binary.BigEndian.Uint16(data[len(data)-2:]))
if t.Version != TagBlockVersion {
return t, ErrUnsupportedTagBlockVersion
}
// Slice trailer data.
buf := data[len(data)-TagBlockTrailerSize:]
// Read data section info.
t.ValueData.Offset, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
t.ValueData.Size, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
// Read key section info.
t.KeyData.Offset, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
t.KeyData.Size, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
// Read hash section info.
t.HashIndex.Offset, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
t.HashIndex.Size, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
// Read total size.
t.Size, buf = int64(binary.BigEndian.Uint64(buf[0:8])), buf[8:]
return t, nil
}
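// The fields read above account for the full trailer: six 8-byte
// offset/size values, an 8-byte total size, and a 2-byte version,
// matching TagBlockTrailerSize (8*7 + 2 = 58 bytes).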
// TagBlockEncoder encodes tags to a TagBlock section.
type TagBlockEncoder struct {
w io.Writer
buf bytes.Buffer
// Track value offsets.
offsets *rhh.HashMap
// Track bytes written, sections.
n int64
trailer TagBlockTrailer
// Track tag keys.
keys []tagKeyEncodeEntry
}
// NewTagBlockEncoder returns a new TagBlockEncoder.
func NewTagBlockEncoder(w io.Writer) *TagBlockEncoder {
return &TagBlockEncoder{
w: w,
offsets: rhh.NewHashMap(rhh.Options{LoadFactor: LoadFactor}),
trailer: TagBlockTrailer{
Version: TagBlockVersion,
},
}
}
// N returns the number of bytes written.
func (enc *TagBlockEncoder) N() int64 { return enc.n }
// EncodeKey writes a tag key to the underlying writer.
func (enc *TagBlockEncoder) EncodeKey(key []byte, deleted bool) error {
// An initial empty byte must be written.
if err := enc.ensureHeaderWritten(); err != nil {
return err
}
// Verify key is lexicographically after previous key.
if len(enc.keys) > 0 {
prev := enc.keys[len(enc.keys)-1].key
if cmp := bytes.Compare(prev, key); cmp == 1 {
return fmt.Errorf("tag key out of order: prev=%s, new=%s", prev, key)
} else if cmp == 0 {
return fmt.Errorf("tag key already encoded: %s", key)
}
}
// Flush values section for key.
if err := enc.flushValueHashIndex(); err != nil {
return err
}
// Append key on to the end of the key list.
entry := tagKeyEncodeEntry{
key: key,
deleted: deleted,
}
entry.data.offset = enc.n
enc.keys = append(enc.keys, entry)
return nil
}
// EncodeValue writes a tag value to the underlying writer.
// The tag key must be lexicographically sorted after the previously encoded tag key.
func (enc *TagBlockEncoder) EncodeValue(value []byte, deleted bool, seriesIDs []uint32) error {
if len(enc.keys) == 0 {
return fmt.Errorf("tag key must be encoded before encoding values")
} else if len(value) == 0 {
return fmt.Errorf("zero length tag value not allowed")
}
// Save offset to hash map.
enc.offsets.Put(value, enc.n)
// Write flag.
if err := writeUint8To(enc.w, encodeTagValueFlag(deleted), &enc.n); err != nil {
return err
}
// Write value.
if err := writeUvarintTo(enc.w, uint64(len(value)), &enc.n); err != nil {
return err
} else if err := writeTo(enc.w, value, &enc.n); err != nil {
return err
}
// Build series data in buffer.
enc.buf.Reset()
var prev uint32
for _, seriesID := range seriesIDs {
delta := seriesID - prev
var buf [binary.MaxVarintLen32]byte
i := binary.PutUvarint(buf[:], uint64(delta))
if _, err := enc.buf.Write(buf[:i]); err != nil {
return err
}
prev = seriesID
}
// Write series count.
if err := writeUvarintTo(enc.w, uint64(len(seriesIDs)), &enc.n); err != nil {
return err
}
// Write data size & buffer.
if err := writeUvarintTo(enc.w, uint64(enc.buf.Len()), &enc.n); err != nil {
return err
}
nn, err := enc.buf.WriteTo(enc.w)
if enc.n += nn; err != nil {
return err
}
return nil
}
// Close flushes the trailer of the encoder to the writer.
func (enc *TagBlockEncoder) Close() error {
// Flush last value set.
if err := enc.ensureHeaderWritten(); err != nil {
return err
} else if err := enc.flushValueHashIndex(); err != nil {
return err
}
// Save ending position of entire data block.
enc.trailer.ValueData.Size = enc.n - enc.trailer.ValueData.Offset
// Write key block to point to value blocks.
if err := enc.encodeTagKeyBlock(); err != nil {
return err
}
// Compute total size w/ trailer.
enc.trailer.Size = enc.n + TagBlockTrailerSize
// Write trailer.
nn, err := enc.trailer.WriteTo(enc.w)
enc.n += nn
if err != nil {
return err
}
return nil
}
// ensureHeaderWritten writes a single byte to offset the rest of the block.
func (enc *TagBlockEncoder) ensureHeaderWritten() error {
if enc.n > 0 {
return nil
} else if _, err := enc.w.Write([]byte{0}); err != nil {
return err
}
enc.n++
enc.trailer.ValueData.Offset = enc.n
return nil
}
// flushValueHashIndex builds and writes the hash map at the end of a value set.
func (enc *TagBlockEncoder) flushValueHashIndex() error {
// Ignore if no keys have been written.
if len(enc.keys) == 0 {
return nil
}
key := &enc.keys[len(enc.keys)-1]
// Save size of data section.
key.data.size = enc.n - key.data.offset
// Encode hash map length.
key.hashIndex.offset = enc.n
if err := writeUint64To(enc.w, uint64(enc.offsets.Cap()), &enc.n); err != nil {
return err
}
// Encode hash map offset entries.
for i := int64(0); i < enc.offsets.Cap(); i++ {
_, v := enc.offsets.Elem(i)
offset, _ := v.(int64)
if err := writeUint64To(enc.w, uint64(offset), &enc.n); err != nil {
return err
}
}
key.hashIndex.size = enc.n - key.hashIndex.offset
// Clear offsets.
enc.offsets = rhh.NewHashMap(rhh.Options{LoadFactor: LoadFactor})
return nil
}
// encodeTagKeyBlock encodes the keys section to the writer.
func (enc *TagBlockEncoder) encodeTagKeyBlock() error {
offsets := rhh.NewHashMap(rhh.Options{Capacity: int64(len(enc.keys)), LoadFactor: LoadFactor})
// Encode key list in sorted order.
enc.trailer.KeyData.Offset = enc.n
for i := range enc.keys {
entry := &enc.keys[i]
// Save current offset so we can use it in the hash index.
offsets.Put(entry.key, enc.n)
if err := writeUint8To(enc.w, encodeTagKeyFlag(entry.deleted), &enc.n); err != nil {
return err
}
// Write value data offset & size.
if err := writeUint64To(enc.w, uint64(entry.data.offset), &enc.n); err != nil {
return err
} else if err := writeUint64To(enc.w, uint64(entry.data.size), &enc.n); err != nil {
return err
}
// Write value hash index offset & size.
if err := writeUint64To(enc.w, uint64(entry.hashIndex.offset), &enc.n); err != nil {
return err
} else if err := writeUint64To(enc.w, uint64(entry.hashIndex.size), &enc.n); err != nil {
return err
}
// Write key length and data.
if err := writeUvarintTo(enc.w, uint64(len(entry.key)), &enc.n); err != nil {
return err
} else if err := writeTo(enc.w, entry.key, &enc.n); err != nil {
return err
}
}
enc.trailer.KeyData.Size = enc.n - enc.trailer.KeyData.Offset
// Encode hash map length.
enc.trailer.HashIndex.Offset = enc.n
if err := writeUint64To(enc.w, uint64(offsets.Cap()), &enc.n); err != nil {
return err
}
// Encode hash map offset entries.
for i := int64(0); i < offsets.Cap(); i++ {
_, v := offsets.Elem(i)
offset, _ := v.(int64)
if err := writeUint64To(enc.w, uint64(offset), &enc.n); err != nil {
return err
}
}
enc.trailer.HashIndex.Size = enc.n - enc.trailer.HashIndex.Offset
return nil
}
type tagKeyEncodeEntry struct {
key []byte
deleted bool
data struct {
offset int64
size int64
}
hashIndex struct {
offset int64
size int64
}
}
func encodeTagKeyFlag(deleted bool) byte {
var flag byte
if deleted {
flag |= TagKeyTombstoneFlag
}
return flag
}
func encodeTagValueFlag(deleted bool) byte {
var flag byte
if deleted {
flag |= TagValueTombstoneFlag
}
return flag
}

View File

@@ -0,0 +1,139 @@
package tsi1_test
import (
"bytes"
"fmt"
"reflect"
"testing"
"github.com/influxdata/influxdb/tsdb/index/tsi1"
)
// Ensure tag blocks can be written and opened.
func TestTagBlockWriter(t *testing.T) {
// Write 3 series to writer.
var buf bytes.Buffer
enc := tsi1.NewTagBlockEncoder(&buf)
if err := enc.EncodeKey([]byte("host"), false); err != nil {
t.Fatal(err)
} else if err := enc.EncodeValue([]byte("server0"), false, []uint32{1}); err != nil {
t.Fatal(err)
} else if err := enc.EncodeValue([]byte("server1"), false, []uint32{2}); err != nil {
t.Fatal(err)
} else if err := enc.EncodeValue([]byte("server2"), false, []uint32{3}); err != nil {
t.Fatal(err)
}
if err := enc.EncodeKey([]byte("region"), false); err != nil {
t.Fatal(err)
} else if err := enc.EncodeValue([]byte("us-east"), false, []uint32{1, 2}); err != nil {
t.Fatal(err)
} else if err := enc.EncodeValue([]byte("us-west"), false, []uint32{3}); err != nil {
t.Fatal(err)
}
// Flush encoder.
if err := enc.Close(); err != nil {
t.Fatal(err)
} else if int(enc.N()) != buf.Len() {
t.Fatalf("bytes written mismatch: %d, expected %d", enc.N(), buf.Len())
}
// Unmarshal into a block.
var blk tsi1.TagBlock
if err := blk.UnmarshalBinary(buf.Bytes()); err != nil {
t.Fatal(err)
}
// Verify data.
if e := blk.TagValueElem([]byte("region"), []byte("us-east")); e == nil {
t.Fatal("expected element")
} else if a := e.(*tsi1.TagBlockValueElem).SeriesIDs(); !reflect.DeepEqual(a, []uint32{1, 2}) {
t.Fatalf("unexpected series ids: %#v", a)
}
if e := blk.TagValueElem([]byte("region"), []byte("us-west")); e == nil {
t.Fatal("expected element")
} else if a := e.(*tsi1.TagBlockValueElem).SeriesIDs(); !reflect.DeepEqual(a, []uint32{3}) {
t.Fatalf("unexpected series ids: %#v", a)
}
if e := blk.TagValueElem([]byte("host"), []byte("server0")); e == nil {
t.Fatal("expected element")
} else if a := e.(*tsi1.TagBlockValueElem).SeriesIDs(); !reflect.DeepEqual(a, []uint32{1}) {
t.Fatalf("unexpected series ids: %#v", a)
}
if e := blk.TagValueElem([]byte("host"), []byte("server1")); e == nil {
t.Fatal("expected element")
} else if a := e.(*tsi1.TagBlockValueElem).SeriesIDs(); !reflect.DeepEqual(a, []uint32{2}) {
t.Fatalf("unexpected series ids: %#v", a)
}
if e := blk.TagValueElem([]byte("host"), []byte("server2")); e == nil {
t.Fatal("expected element")
} else if a := e.(*tsi1.TagBlockValueElem).SeriesIDs(); !reflect.DeepEqual(a, []uint32{3}) {
t.Fatalf("unexpected series ids: %#v", a)
}
}
var benchmarkTagBlock10x1000 *tsi1.TagBlock
var benchmarkTagBlock100x1000 *tsi1.TagBlock
var benchmarkTagBlock1000x1000 *tsi1.TagBlock
var benchmarkTagBlock1x1000000 *tsi1.TagBlock
func BenchmarkTagBlock_SeriesN_10_1000(b *testing.B) {
benchmarkTagBlock_SeriesN(b, 10, 1000, &benchmarkTagBlock10x1000)
}
func BenchmarkTagBlock_SeriesN_100_1000(b *testing.B) {
benchmarkTagBlock_SeriesN(b, 100, 1000, &benchmarkTagBlock100x1000)
}
func BenchmarkTagBlock_SeriesN_1000_1000(b *testing.B) {
benchmarkTagBlock_SeriesN(b, 1000, 1000, &benchmarkTagBlock1000x1000)
}
func BenchmarkTagBlock_SeriesN_1_1000000(b *testing.B) {
benchmarkTagBlock_SeriesN(b, 1, 1000000, &benchmarkTagBlock1x1000000)
}
func benchmarkTagBlock_SeriesN(b *testing.B, tagN, valueN int, blk **tsi1.TagBlock) {
if (*blk) == nil {
var buf bytes.Buffer
enc := tsi1.NewTagBlockEncoder(&buf)
// Write block.
for i := 0; i < tagN; i++ {
if err := enc.EncodeKey([]byte(fmt.Sprintf("%08d", i)), false); err != nil {
b.Fatal(err)
}
for j := 0; j < valueN; j++ {
if err := enc.EncodeValue([]byte(fmt.Sprintf("%08d", j)), false, []uint32{1}); err != nil {
b.Fatal(err)
}
}
}
// Flush encoder.
if err := enc.Close(); err != nil {
b.Fatal(err)
}
b.Log("size", buf.Len())
// Unmarshal into a block.
*blk = &tsi1.TagBlock{}
if err := (*blk).UnmarshalBinary(buf.Bytes()); err != nil {
b.Fatal(err)
}
}
// Benchmark lookups.
b.ReportAllocs()
b.ResetTimer()
key, value := []byte("0"), []byte("0")
for i := 0; i < b.N; i++ {
if e := (*blk).TagValueElem(key, value); e == nil {
b.Fatal("expected element")
} else if n := e.(*tsi1.TagBlockValueElem).SeriesN(); n != 1 {
b.Fatalf("unexpected series count: %d", n)
}
}
}

View File

@@ -0,0 +1,818 @@
package tsi1
import (
"bytes"
"encoding/binary"
"encoding/hex"
"fmt"
"io"
"os"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
)
// LoadFactor is the fill percent for RHH indexes.
const LoadFactor = 80
// MeasurementElem represents a generic measurement element.
type MeasurementElem interface {
Name() []byte
Deleted() bool
}
// MeasurementElems represents a list of MeasurementElem.
type MeasurementElems []MeasurementElem
func (a MeasurementElems) Len() int { return len(a) }
func (a MeasurementElems) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a MeasurementElems) Less(i, j int) bool { return bytes.Compare(a[i].Name(), a[j].Name()) == -1 }
// MeasurementIterator represents an iterator over a list of measurements.
type MeasurementIterator interface {
Next() MeasurementElem
}
// MergeMeasurementIterators returns an iterator that merges a set of iterators.
// Iterators that are first in the list take precedence, and a deletion by
// those early iterators will invalidate elements from later iterators.
func MergeMeasurementIterators(itrs ...MeasurementIterator) MeasurementIterator {
if len(itrs) == 0 {
return nil
}
return &measurementMergeIterator{
e: make(measurementMergeElem, 0, len(itrs)),
buf: make([]MeasurementElem, len(itrs)),
itrs: itrs,
}
}
type measurementMergeIterator struct {
e measurementMergeElem
buf []MeasurementElem
itrs []MeasurementIterator
}
// Next returns the element with the next lowest name across the iterators.
//
// If multiple iterators contain the same name then the first is returned
// and the remaining ones are skipped.
func (itr *measurementMergeIterator) Next() MeasurementElem {
// Find next lowest name amongst the buffers.
var name []byte
for i, buf := range itr.buf {
// Fill buffer if empty.
if buf == nil {
if buf = itr.itrs[i].Next(); buf != nil {
itr.buf[i] = buf
} else {
continue
}
}
// Find next lowest name.
if name == nil || bytes.Compare(itr.buf[i].Name(), name) == -1 {
name = itr.buf[i].Name()
}
}
// Return nil if no elements remaining.
if name == nil {
return nil
}
// Merge all elements together and clear buffers.
itr.e = itr.e[:0]
for i, buf := range itr.buf {
if buf == nil || !bytes.Equal(buf.Name(), name) {
continue
}
itr.e = append(itr.e, buf)
itr.buf[i] = nil
}
return itr.e
}
// measurementMergeElem represents a merged measurement element.
type measurementMergeElem []MeasurementElem
// Name returns the name of the first element.
func (p measurementMergeElem) Name() []byte {
if len(p) == 0 {
return nil
}
return p[0].Name()
}
// Deleted returns the deleted flag of the first element.
func (p measurementMergeElem) Deleted() bool {
if len(p) == 0 {
return false
}
return p[0].Deleted()
}
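// The following is an illustrative sketch of a minimal in-memory
// MeasurementIterator; feeding two of these to MergeMeasurementIterators
// shows how earlier iterators take precedence for duplicate names.
type exampleMeasurementElem struct {
name []byte
deleted bool
}
func (e exampleMeasurementElem) Name() []byte { return e.name }
func (e exampleMeasurementElem) Deleted() bool { return e.deleted }
type exampleMeasurementIterator struct {
elems []exampleMeasurementElem
}
// Next pops and returns the next buffered element, or nil when exhausted.
func (itr *exampleMeasurementIterator) Next() MeasurementElem {
if len(itr.elems) == 0 {
return nil
}
e := itr.elems[0]
itr.elems = itr.elems[1:]
return e
}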
// filterUndeletedMeasurementIterator returns all measurements which are not deleted.
type filterUndeletedMeasurementIterator struct {
itr MeasurementIterator
}
// FilterUndeletedMeasurementIterator returns an iterator which filters out all deleted measurements.
func FilterUndeletedMeasurementIterator(itr MeasurementIterator) MeasurementIterator {
if itr == nil {
return nil
}
return &filterUndeletedMeasurementIterator{itr: itr}
}
func (itr *filterUndeletedMeasurementIterator) Next() MeasurementElem {
for {
e := itr.itr.Next()
if e == nil {
return nil
} else if e.Deleted() {
continue
}
return e
}
}
// TagKeyElem represents a generic tag key element.
type TagKeyElem interface {
Key() []byte
Deleted() bool
TagValueIterator() TagValueIterator
}
// TagKeyIterator represents an iterator over a list of tag keys.
type TagKeyIterator interface {
Next() TagKeyElem
}
// MergeTagKeyIterators returns an iterator that merges a set of iterators.
// Iterators that are first in the list take precedence and a deletion by those
// early iterators will invalidate elements from later iterators.
func MergeTagKeyIterators(itrs ...TagKeyIterator) TagKeyIterator {
if len(itrs) == 0 {
return nil
}
return &tagKeyMergeIterator{
e: make(tagKeyMergeElem, 0, len(itrs)),
buf: make([]TagKeyElem, len(itrs)),
itrs: itrs,
}
}
type tagKeyMergeIterator struct {
e tagKeyMergeElem
buf []TagKeyElem
itrs []TagKeyIterator
}
// Next returns the element with the next lowest key across the iterators.
//
// If multiple iterators contain the same key then the first is returned
// and the remaining ones are skipped.
func (itr *tagKeyMergeIterator) Next() TagKeyElem {
// Find next lowest key amongst the buffers.
var key []byte
for i, buf := range itr.buf {
// Fill buffer.
if buf == nil {
if buf = itr.itrs[i].Next(); buf != nil {
itr.buf[i] = buf
} else {
continue
}
}
// Find next lowest key.
if key == nil || bytes.Compare(buf.Key(), key) == -1 {
key = buf.Key()
}
}
// Return nil if no elements remaining.
if key == nil {
return nil
}
// Merge elements together & clear buffer.
itr.e = itr.e[:0]
for i, buf := range itr.buf {
if buf == nil || !bytes.Equal(buf.Key(), key) {
continue
}
itr.e = append(itr.e, buf)
itr.buf[i] = nil
}
return itr.e
}
// tagKeyMergeElem represents a merged tag key element.
type tagKeyMergeElem []TagKeyElem
// Key returns the key of the first element.
func (p tagKeyMergeElem) Key() []byte {
if len(p) == 0 {
return nil
}
return p[0].Key()
}
// Deleted returns the deleted flag of the first element.
func (p tagKeyMergeElem) Deleted() bool {
if len(p) == 0 {
return false
}
return p[0].Deleted()
}
// TagValueIterator returns a merge iterator for all elements until a tombstone occurs.
func (p tagKeyMergeElem) TagValueIterator() TagValueIterator {
if len(p) == 0 {
return nil
}
a := make([]TagValueIterator, 0, len(p))
for _, e := range p {
itr := e.TagValueIterator()
a = append(a, itr)
if e.Deleted() {
break
}
}
return MergeTagValueIterators(a...)
}
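// Note (added for illustration; not in the original file): because the loop
// above stops appending iterators after the first deleted element, a
// tombstoned tag key in an earlier iterator hides the tag values held by
// any later iterators for that key.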
// TagValueElem represents a generic tag value element.
type TagValueElem interface {
Value() []byte
Deleted() bool
}
// TagValueIterator represents an iterator over a list of tag values.
type TagValueIterator interface {
Next() TagValueElem
}
// MergeTagValueIterators returns an iterator that merges a set of iterators.
// Iterators that are first in the list take precedence and a deletion by those
// early iterators will invalidate elements from later iterators.
func MergeTagValueIterators(itrs ...TagValueIterator) TagValueIterator {
if len(itrs) == 0 {
return nil
}
return &tagValueMergeIterator{
e: make(tagValueMergeElem, 0, len(itrs)),
buf: make([]TagValueElem, len(itrs)),
itrs: itrs,
}
}
type tagValueMergeIterator struct {
e tagValueMergeElem
buf []TagValueElem
itrs []TagValueIterator
}
// Next returns the element with the next lowest value across the iterators.
//
// If multiple iterators contain the same value then the first is returned
// and the remaining ones are skipped.
func (itr *tagValueMergeIterator) Next() TagValueElem {
// Find next lowest value amongst the buffers.
var value []byte
for i, buf := range itr.buf {
// Fill buffer.
if buf == nil {
if buf = itr.itrs[i].Next(); buf != nil {
itr.buf[i] = buf
} else {
continue
}
}
// Find next lowest value.
if value == nil || bytes.Compare(buf.Value(), value) == -1 {
value = buf.Value()
}
}
// Return nil if no elements remaining.
if value == nil {
return nil
}
// Merge elements and clear buffers.
itr.e = itr.e[:0]
for i, buf := range itr.buf {
if buf == nil || !bytes.Equal(buf.Value(), value) {
continue
}
itr.e = append(itr.e, buf)
itr.buf[i] = nil
}
return itr.e
}
// tagValueMergeElem represents a merged tag value element.
type tagValueMergeElem []TagValueElem
// Value returns the value of the first element.
func (p tagValueMergeElem) Value() []byte {
if len(p) == 0 {
return nil
}
return p[0].Value()
}
// Deleted returns the deleted flag of the first element.
func (p tagValueMergeElem) Deleted() bool {
if len(p) == 0 {
return false
}
return p[0].Deleted()
}
// SeriesElem represents a generic series element.
type SeriesElem interface {
Name() []byte
Tags() models.Tags
Deleted() bool
// InfluxQL expression associated with series during filtering.
Expr() influxql.Expr
}
// SeriesElemKey encodes e as a series key.
func SeriesElemKey(e SeriesElem) []byte {
name, tags := e.Name(), e.Tags()
// TODO: Precompute allocation size.
// FIXME: Handle escaping.
var buf []byte
buf = append(buf, name...)
for _, t := range tags {
buf = append(buf, ',')
buf = append(buf, t.Key...)
buf = append(buf, '=')
buf = append(buf, t.Value...)
}
return buf
}
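// Sketch (added for illustration; not part of the original file):
// SeriesElemKey concatenates the measurement name with comma-separated
// key=value pairs.
func exampleSeriesElemKey() []byte {
	e := &seriesElem{
		name: []byte("cpu"),
		tags: models.NewTags(map[string]string{"host": "serverA", "region": "uswest"}),
	}
	return SeriesElemKey(e) // => "cpu,host=serverA,region=uswest"
}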
// CompareSeriesElem returns -1 if a < b, 1 if a > b, and 0 if equal.
func CompareSeriesElem(a, b SeriesElem) int {
if cmp := bytes.Compare(a.Name(), b.Name()); cmp != 0 {
return cmp
}
return models.CompareTags(a.Tags(), b.Tags())
}
// seriesElem represents an in-memory implementation of SeriesElem.
type seriesElem struct {
name []byte
tags models.Tags
deleted bool
}
func (e *seriesElem) Name() []byte { return e.name }
func (e *seriesElem) Tags() models.Tags { return e.tags }
func (e *seriesElem) Deleted() bool { return e.deleted }
func (e *seriesElem) Expr() influxql.Expr { return nil }
// SeriesIterator represents an iterator over a list of series.
type SeriesIterator interface {
Next() SeriesElem
}
// MergeSeriesIterators returns an iterator that merges a set of iterators.
// Iterators that are first in the list take precedence and a deletion by those
// early iterators will invalidate elements from later iterators.
func MergeSeriesIterators(itrs ...SeriesIterator) SeriesIterator {
if n := len(itrs); n == 0 {
return nil
} else if n == 1 {
return itrs[0]
}
return &seriesMergeIterator{
buf: make([]SeriesElem, len(itrs)),
itrs: itrs,
}
}
// seriesMergeIterator is an iterator that merges multiple iterators together.
type seriesMergeIterator struct {
buf []SeriesElem
itrs []SeriesIterator
}
// Next returns the element with the next lowest name/tags across the iterators.
//
// If multiple iterators contain the same name/tags then the first is returned
// and the remaining ones are skipped.
func (itr *seriesMergeIterator) Next() SeriesElem {
// Find next lowest name/tags amongst the buffers.
var name []byte
var tags models.Tags
for i, buf := range itr.buf {
// Fill buffer.
if buf == nil {
if buf = itr.itrs[i].Next(); buf != nil {
itr.buf[i] = buf
} else {
continue
}
}
// If the name is not set then pick the first non-empty name.
if name == nil {
name, tags = buf.Name(), buf.Tags()
continue
}
// Set name/tags if they are lower than what has been seen.
if cmp := bytes.Compare(buf.Name(), name); cmp == -1 || (cmp == 0 && models.CompareTags(buf.Tags(), tags) == -1) {
name, tags = buf.Name(), buf.Tags()
}
}
// Return nil if no elements remaining.
if name == nil {
return nil
}
// Refill buffer.
var e SeriesElem
for i, buf := range itr.buf {
if buf == nil || !bytes.Equal(buf.Name(), name) || models.CompareTags(buf.Tags(), tags) != 0 {
continue
}
// Copy first matching buffer to the return buffer.
if e == nil {
e = buf
}
// Clear buffer.
itr.buf[i] = nil
}
return e
}
// IntersectSeriesIterators returns an iterator that only returns series which
// occur in both iterators. If both series have associated expressions then
// they are combined together.
func IntersectSeriesIterators(itr0, itr1 SeriesIterator) SeriesIterator {
if itr0 == nil || itr1 == nil {
return nil
}
return &seriesIntersectIterator{itrs: [2]SeriesIterator{itr0, itr1}}
}
// seriesIntersectIterator is an iterator that returns the intersection of two iterators.
type seriesIntersectIterator struct {
e seriesExprElem
buf [2]SeriesElem
itrs [2]SeriesIterator
}
// Next returns the next element which occurs in both iterators.
func (itr *seriesIntersectIterator) Next() (e SeriesElem) {
for {
// Fill buffers.
if itr.buf[0] == nil {
itr.buf[0] = itr.itrs[0].Next()
}
if itr.buf[1] == nil {
itr.buf[1] = itr.itrs[1].Next()
}
// Exit if either buffer is still empty.
if itr.buf[0] == nil || itr.buf[1] == nil {
return nil
}
// Skip if both series are not equal.
if cmp := CompareSeriesElem(itr.buf[0], itr.buf[1]); cmp == -1 {
itr.buf[0] = nil
continue
} else if cmp == 1 {
itr.buf[1] = nil
continue
}
// Merge series together if equal.
itr.e.SeriesElem = itr.buf[0]
// Attach expression.
expr0 := itr.buf[0].Expr()
expr1 := itr.buf[1].Expr()
if expr0 == nil {
itr.e.expr = expr1
} else if expr1 == nil {
itr.e.expr = expr0
} else {
itr.e.expr = influxql.Reduce(&influxql.BinaryExpr{
Op: influxql.AND,
LHS: expr0,
RHS: expr1,
}, nil)
}
itr.buf[0], itr.buf[1] = nil, nil
return &itr.e
}
}
// UnionSeriesIterators returns an iterator that returns series from both
// iterators. If both series have associated expressions then they are
// combined together.
func UnionSeriesIterators(itr0, itr1 SeriesIterator) SeriesIterator {
// Return other iterator if either one is nil.
if itr0 == nil {
return itr1
} else if itr1 == nil {
return itr0
}
return &seriesUnionIterator{itrs: [2]SeriesIterator{itr0, itr1}}
}
// seriesUnionIterator is an iterator that unions two iterators together.
type seriesUnionIterator struct {
e seriesExprElem
buf [2]SeriesElem
itrs [2]SeriesIterator
}
// Next returns the next element which occurs in either iterator.
func (itr *seriesUnionIterator) Next() (e SeriesElem) {
// Fill buffers.
if itr.buf[0] == nil {
itr.buf[0] = itr.itrs[0].Next()
}
if itr.buf[1] == nil {
itr.buf[1] = itr.itrs[1].Next()
}
// Return the other iterator if either one is empty.
if itr.buf[0] == nil {
e, itr.buf[1] = itr.buf[1], nil
return e
} else if itr.buf[1] == nil {
e, itr.buf[0] = itr.buf[0], nil
return e
}
// Return lesser series.
if cmp := CompareSeriesElem(itr.buf[0], itr.buf[1]); cmp == -1 {
e, itr.buf[0] = itr.buf[0], nil
return e
} else if cmp == 1 {
e, itr.buf[1] = itr.buf[1], nil
return e
}
// Attach element.
itr.e.SeriesElem = itr.buf[0]
// Attach expression.
expr0 := itr.buf[0].Expr()
expr1 := itr.buf[1].Expr()
if expr0 != nil && expr1 != nil {
itr.e.expr = influxql.Reduce(&influxql.BinaryExpr{
Op: influxql.OR,
LHS: expr0,
RHS: expr1,
}, nil)
} else {
itr.e.expr = nil
}
itr.buf[0], itr.buf[1] = nil, nil
return &itr.e
}
// DifferenceSeriesIterators returns an iterator that only returns series which
// occur in the first iterator but not in the second iterator.
func DifferenceSeriesIterators(itr0, itr1 SeriesIterator) SeriesIterator {
if itr0 != nil && itr1 == nil {
return itr0
} else if itr0 == nil {
return nil
}
return &seriesDifferenceIterator{itrs: [2]SeriesIterator{itr0, itr1}}
}
// seriesDifferenceIterator is an iterator that returns elements in the first iterator but not in the second.
type seriesDifferenceIterator struct {
buf [2]SeriesElem
itrs [2]SeriesIterator
}
// Next returns the next element which occurs only in the first iterator.
func (itr *seriesDifferenceIterator) Next() (e SeriesElem) {
for {
// Fill buffers.
if itr.buf[0] == nil {
itr.buf[0] = itr.itrs[0].Next()
}
if itr.buf[1] == nil {
itr.buf[1] = itr.itrs[1].Next()
}
// Exit if first buffer is still empty.
if itr.buf[0] == nil {
return nil
} else if itr.buf[1] == nil {
e, itr.buf[0] = itr.buf[0], nil
return e
}
// Return first series if it's less.
// If second series is less then skip it.
// If both series are equal then skip both.
if cmp := CompareSeriesElem(itr.buf[0], itr.buf[1]); cmp == -1 {
e, itr.buf[0] = itr.buf[0], nil
return e
} else if cmp == 1 {
itr.buf[1] = nil
continue
} else {
itr.buf[0], itr.buf[1] = nil, nil
continue
}
}
}
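// The sketch below is added for illustration and is not part of the
// original file; exampleSeriesIterator is a hypothetical slice-backed
// SeriesIterator used to show the three set operations above.
type exampleSeriesIterator struct{ elems []SeriesElem }

func (itr *exampleSeriesIterator) Next() SeriesElem {
	if len(itr.elems) == 0 {
		return nil
	}
	e := itr.elems[0]
	itr.elems = itr.elems[1:]
	return e
}

// exampleSetOps prints "disk", then "cpu disk mem net", then "cpu mem":
// the intersection, union, and difference of {cpu, disk, mem} and
// {disk, net}.
func exampleSetOps() {
	newItr := func(names ...string) SeriesIterator {
		elems := make([]SeriesElem, len(names))
		for i, name := range names {
			elems[i] = &seriesElem{name: []byte(name)}
		}
		return &exampleSeriesIterator{elems: elems}
	}
	itrs := []SeriesIterator{
		IntersectSeriesIterators(newItr("cpu", "disk", "mem"), newItr("disk", "net")),
		UnionSeriesIterators(newItr("cpu", "disk", "mem"), newItr("disk", "net")),
		DifferenceSeriesIterators(newItr("cpu", "disk", "mem"), newItr("disk", "net")),
	}
	for _, itr := range itrs {
		for e := itr.Next(); e != nil; e = itr.Next() {
			fmt.Printf("%s ", e.Name())
		}
		fmt.Println()
	}
}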
// filterUndeletedSeriesIterator returns all series which are not deleted.
type filterUndeletedSeriesIterator struct {
itr SeriesIterator
}
// FilterUndeletedSeriesIterator returns an iterator which filters out deleted series.
func FilterUndeletedSeriesIterator(itr SeriesIterator) SeriesIterator {
if itr == nil {
return nil
}
return &filterUndeletedSeriesIterator{itr: itr}
}
func (itr *filterUndeletedSeriesIterator) Next() SeriesElem {
for {
e := itr.itr.Next()
if e == nil {
return nil
} else if e.Deleted() {
continue
}
return e
}
}
// seriesExprElem holds a series and its associated filter expression.
type seriesExprElem struct {
SeriesElem
expr influxql.Expr
}
// Expr returns the associated expression.
func (e *seriesExprElem) Expr() influxql.Expr { return e.expr }
// seriesExprIterator is an iterator that attaches an associated expression.
type seriesExprIterator struct {
itr SeriesIterator
e seriesExprElem
}
// newSeriesExprIterator returns a new instance of seriesExprIterator.
func newSeriesExprIterator(itr SeriesIterator, expr influxql.Expr) SeriesIterator {
if itr == nil {
return nil
}
return &seriesExprIterator{
itr: itr,
e: seriesExprElem{
expr: expr,
},
}
}
// Next returns the next element in the iterator.
func (itr *seriesExprIterator) Next() SeriesElem {
itr.e.SeriesElem = itr.itr.Next()
if itr.e.SeriesElem == nil {
return nil
}
return &itr.e
}
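// Sketch (added for illustration; not part of the original file):
// newSeriesExprIterator attaches the same filter expression to every
// element of the underlying iterator. The condition string is hypothetical.
func exampleAttachExpr(itr SeriesIterator) SeriesIterator {
	expr, err := influxql.ParseExpr(`region = 'uswest'`)
	if err != nil {
		panic(err)
	}
	return newSeriesExprIterator(itr, expr)
}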
// seriesIDIterator represents an iterator over a list of series ids.
type seriesIDIterator interface {
next() uint32
}
// writeTo writes v into w and updates n.
func writeTo(w io.Writer, v []byte, n *int64) error {
nn, err := w.Write(v)
*n += int64(nn)
return err
}
// writeUint8To writes v into w and updates n.
func writeUint8To(w io.Writer, v uint8, n *int64) error {
nn, err := w.Write([]byte{v})
*n += int64(nn)
return err
}
// writeUint16To writes v into w using big-endian encoding and updates n.
func writeUint16To(w io.Writer, v uint16, n *int64) error {
var buf [2]byte
binary.BigEndian.PutUint16(buf[:], v)
nn, err := w.Write(buf[:])
*n += int64(nn)
return err
}
// writeUint32To writes v into w using big-endian encoding and updates n.
func writeUint32To(w io.Writer, v uint32, n *int64) error {
var buf [4]byte
binary.BigEndian.PutUint32(buf[:], v)
nn, err := w.Write(buf[:])
*n += int64(nn)
return err
}
// writeUint64To writes v into w using big-endian encoding and updates n.
func writeUint64To(w io.Writer, v uint64, n *int64) error {
var buf [8]byte
binary.BigEndian.PutUint64(buf[:], v)
nn, err := w.Write(buf[:])
*n += int64(nn)
return err
}
// writeUvarintTo writes v into w using variable-length encoding and updates n.
func writeUvarintTo(w io.Writer, v uint64, n *int64) error {
var buf [binary.MaxVarintLen64]byte
i := binary.PutUvarint(buf[:], v)
nn, err := w.Write(buf[:i])
*n += int64(nn)
return err
}
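// Sketch (added for illustration; not part of the original file): the
// write*To helpers thread a running byte count through sequential writes.
// The two-field layout below is hypothetical.
func exampleWriteHeader(w io.Writer) (int64, error) {
	var n int64
	if err := writeUint32To(w, 0xFACE, &n); err != nil { // hypothetical magic number
		return n, err
	}
	if err := writeUvarintTo(w, 1024, &n); err != nil { // hypothetical block size
		return n, err
	}
	return n, nil
}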
type uint32Slice []uint32
func (a uint32Slice) Len() int { return len(a) }
func (a uint32Slice) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a uint32Slice) Less(i, j int) bool { return a[i] < a[j] }
type uint64Slice []uint64
func (a uint64Slice) Len() int { return len(a) }
func (a uint64Slice) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a uint64Slice) Less(i, j int) bool { return a[i] < a[j] }
type byteSlices [][]byte
func (a byteSlices) Len() int { return len(a) }
func (a byteSlices) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a byteSlices) Less(i, j int) bool { return bytes.Compare(a[i], a[j]) == -1 }
// copyBytes returns a copy of b.
func copyBytes(b []byte) []byte {
if b == nil {
return nil
}
buf := make([]byte, len(b))
copy(buf, b)
return buf
}
// assert will panic with a given formatted message if the given condition is false.
func assert(condition bool, msg string, v ...interface{}) {
if !condition {
panic(fmt.Sprintf("assert failed: "+msg, v...))
}
}
type byTagKey []*influxql.TagSet
func (t byTagKey) Len() int { return len(t) }
func (t byTagKey) Less(i, j int) bool { return bytes.Compare(t[i].Key, t[j].Key) < 0 }
func (t byTagKey) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
// hexdump is a helper for dumping binary data to stderr.
func hexdump(data []byte) { os.Stderr.Write([]byte(hex.Dump(data))) }

View File

@@ -0,0 +1,308 @@
package tsi1_test
import (
"bytes"
"io/ioutil"
"reflect"
"testing"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/tsdb/index/tsi1"
)
// Ensure iterator can operate over an in-memory list of elements.
func TestMeasurementIterator(t *testing.T) {
elems := []MeasurementElem{
MeasurementElem{name: []byte("cpu"), deleted: true},
MeasurementElem{name: []byte("mem")},
}
itr := MeasurementIterator{Elems: elems}
if e := itr.Next(); !reflect.DeepEqual(&elems[0], e) {
t.Fatalf("unexpected elem(0): %#v", e)
} else if e := itr.Next(); !reflect.DeepEqual(&elems[1], e) {
t.Fatalf("unexpected elem(1): %#v", e)
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil elem: %#v", e)
}
}
// Ensure iterator can merge multiple iterators together.
func TestMergeMeasurementIterators(t *testing.T) {
itr := tsi1.MergeMeasurementIterators(
&MeasurementIterator{Elems: []MeasurementElem{
{name: []byte("aaa")},
{name: []byte("bbb"), deleted: true},
{name: []byte("ccc")},
}},
&MeasurementIterator{},
&MeasurementIterator{Elems: []MeasurementElem{
{name: []byte("bbb")},
{name: []byte("ccc"), deleted: true},
{name: []byte("ddd")},
}},
)
if e := itr.Next(); !bytes.Equal(e.Name(), []byte("aaa")) || e.Deleted() {
t.Fatalf("unexpected elem(0): %s/%v", e.Name(), e.Deleted())
} else if e := itr.Next(); !bytes.Equal(e.Name(), []byte("bbb")) || !e.Deleted() {
t.Fatalf("unexpected elem(1): %s/%v", e.Name(), e.Deleted())
} else if e := itr.Next(); !bytes.Equal(e.Name(), []byte("ccc")) || e.Deleted() {
t.Fatalf("unexpected elem(2): %s/%v", e.Name(), e.Deleted())
} else if e := itr.Next(); !bytes.Equal(e.Name(), []byte("ddd")) || e.Deleted() {
t.Fatalf("unexpected elem(3): %s/%v", e.Name(), e.Deleted())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil elem: %#v", e)
}
}
// Ensure iterator can operate over an in-memory list of tag key elements.
func TestTagKeyIterator(t *testing.T) {
elems := []TagKeyElem{
{key: []byte("aaa"), deleted: true},
{key: []byte("bbb")},
}
itr := TagKeyIterator{Elems: elems}
if e := itr.Next(); !reflect.DeepEqual(&elems[0], e) {
t.Fatalf("unexpected elem(0): %#v", e)
} else if e := itr.Next(); !reflect.DeepEqual(&elems[1], e) {
t.Fatalf("unexpected elem(1): %#v", e)
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil elem: %#v", e)
}
}
// Ensure iterator can merge multiple iterators together.
func TestMergeTagKeyIterators(t *testing.T) {
itr := tsi1.MergeTagKeyIterators(
&TagKeyIterator{Elems: []TagKeyElem{
{key: []byte("aaa")},
{key: []byte("bbb"), deleted: true},
{key: []byte("ccc")},
}},
&TagKeyIterator{},
&TagKeyIterator{Elems: []TagKeyElem{
{key: []byte("bbb")},
{key: []byte("ccc"), deleted: true},
{key: []byte("ddd")},
}},
)
if e := itr.Next(); !bytes.Equal(e.Key(), []byte("aaa")) || e.Deleted() {
t.Fatalf("unexpected elem(0): %s/%v", e.Key(), e.Deleted())
} else if e := itr.Next(); !bytes.Equal(e.Key(), []byte("bbb")) || !e.Deleted() {
t.Fatalf("unexpected elem(1): %s/%v", e.Key(), e.Deleted())
} else if e := itr.Next(); !bytes.Equal(e.Key(), []byte("ccc")) || e.Deleted() {
t.Fatalf("unexpected elem(2): %s/%v", e.Key(), e.Deleted())
} else if e := itr.Next(); !bytes.Equal(e.Key(), []byte("ddd")) || e.Deleted() {
t.Fatalf("unexpected elem(3): %s/%v", e.Key(), e.Deleted())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil elem: %#v", e)
}
}
// Ensure iterator can operate over an in-memory list of tag value elements.
func TestTagValueIterator(t *testing.T) {
elems := []TagValueElem{
{value: []byte("aaa"), deleted: true},
{value: []byte("bbb")},
}
itr := &TagValueIterator{Elems: elems}
if e := itr.Next(); !reflect.DeepEqual(&elems[0], e) {
t.Fatalf("unexpected elem(0): %#v", e)
} else if e := itr.Next(); !reflect.DeepEqual(&elems[1], e) {
t.Fatalf("unexpected elem(1): %#v", e)
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil elem: %#v", e)
}
}
// Ensure iterator can merge multiple iterators together.
func TestMergeTagValueIterators(t *testing.T) {
itr := tsi1.MergeTagValueIterators(
&TagValueIterator{Elems: []TagValueElem{
{value: []byte("aaa")},
{value: []byte("bbb"), deleted: true},
{value: []byte("ccc")},
}},
&TagValueIterator{},
&TagValueIterator{Elems: []TagValueElem{
{value: []byte("bbb")},
{value: []byte("ccc"), deleted: true},
{value: []byte("ddd")},
}},
)
if e := itr.Next(); !bytes.Equal(e.Value(), []byte("aaa")) || e.Deleted() {
t.Fatalf("unexpected elem(0): %s/%v", e.Value(), e.Deleted())
} else if e := itr.Next(); !bytes.Equal(e.Value(), []byte("bbb")) || !e.Deleted() {
t.Fatalf("unexpected elem(1): %s/%v", e.Value(), e.Deleted())
} else if e := itr.Next(); !bytes.Equal(e.Value(), []byte("ccc")) || e.Deleted() {
t.Fatalf("unexpected elem(2): %s/%v", e.Value(), e.Deleted())
} else if e := itr.Next(); !bytes.Equal(e.Value(), []byte("ddd")) || e.Deleted() {
t.Fatalf("unexpected elem(3): %s/%v", e.Value(), e.Deleted())
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil elem: %#v", e)
}
}
// Ensure iterator can operate over an in-memory list of series.
func TestSeriesIterator(t *testing.T) {
elems := []SeriesElem{
{name: []byte("cpu"), tags: models.Tags{{Key: []byte("region"), Value: []byte("us-east")}}, deleted: true},
{name: []byte("mem")},
}
itr := SeriesIterator{Elems: elems}
if e := itr.Next(); !reflect.DeepEqual(&elems[0], e) {
t.Fatalf("unexpected elem(0): %#v", e)
} else if e := itr.Next(); !reflect.DeepEqual(&elems[1], e) {
t.Fatalf("unexpected elem(1): %#v", e)
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil elem: %#v", e)
}
}
// Ensure iterator can merge multiple iterators together.
func TestMergeSeriesIterators(t *testing.T) {
itr := tsi1.MergeSeriesIterators(
&SeriesIterator{Elems: []SeriesElem{
{name: []byte("aaa"), tags: models.Tags{{Key: []byte("region"), Value: []byte("us-east")}}, deleted: true},
{name: []byte("bbb"), deleted: true},
{name: []byte("ccc")},
}},
&SeriesIterator{},
&SeriesIterator{Elems: []SeriesElem{
{name: []byte("aaa"), tags: models.Tags{{Key: []byte("region"), Value: []byte("us-east")}}},
{name: []byte("aaa"), tags: models.Tags{{Key: []byte("region"), Value: []byte("us-west")}}},
{name: []byte("bbb")},
{name: []byte("ccc"), deleted: true},
{name: []byte("ddd")},
}},
)
if e := itr.Next(); !reflect.DeepEqual(e, &SeriesElem{name: []byte("aaa"), tags: models.Tags{{Key: []byte("region"), Value: []byte("us-east")}}, deleted: true}) {
t.Fatalf("unexpected elem(0): %#v", e)
} else if e := itr.Next(); !reflect.DeepEqual(e, &SeriesElem{name: []byte("aaa"), tags: models.Tags{{Key: []byte("region"), Value: []byte("us-west")}}}) {
t.Fatalf("unexpected elem(1): %#v", e)
} else if e := itr.Next(); !reflect.DeepEqual(e, &SeriesElem{name: []byte("bbb"), deleted: true}) {
t.Fatalf("unexpected elem(2): %#v", e)
} else if e := itr.Next(); !reflect.DeepEqual(e, &SeriesElem{name: []byte("ccc")}) {
t.Fatalf("unexpected elem(3): %#v", e)
} else if e := itr.Next(); !reflect.DeepEqual(e, &SeriesElem{name: []byte("ddd")}) {
t.Fatalf("unexpected elem(4): %#v", e)
} else if e := itr.Next(); e != nil {
t.Fatalf("expected nil elem: %#v", e)
}
}
// MeasurementElem represents a test implementation of tsi1.MeasurementElem.
type MeasurementElem struct {
name []byte
deleted bool
}
func (e *MeasurementElem) Name() []byte { return e.name }
func (e *MeasurementElem) Deleted() bool { return e.deleted }
func (e *MeasurementElem) TagKeyIterator() tsi1.TagKeyIterator { return nil }
// MeasurementIterator represents an iterator over a slice of measurements.
type MeasurementIterator struct {
Elems []MeasurementElem
}
// Next returns the next element in the iterator.
func (itr *MeasurementIterator) Next() (e tsi1.MeasurementElem) {
if len(itr.Elems) == 0 {
return nil
}
e, itr.Elems = &itr.Elems[0], itr.Elems[1:]
return e
}
// TagKeyElem represents a test implementation of tsi1.TagKeyElem.
type TagKeyElem struct {
key []byte
deleted bool
}
func (e *TagKeyElem) Key() []byte { return e.key }
func (e *TagKeyElem) Deleted() bool { return e.deleted }
func (e *TagKeyElem) TagValueIterator() tsi1.TagValueIterator { return nil }
// TagKeyIterator represents an iterator over a slice of tag keys.
type TagKeyIterator struct {
Elems []TagKeyElem
}
// Next returns the next element in the iterator.
func (itr *TagKeyIterator) Next() (e tsi1.TagKeyElem) {
if len(itr.Elems) == 0 {
return nil
}
e, itr.Elems = &itr.Elems[0], itr.Elems[1:]
return e
}
// TagValueElem represents a test implementation of tsi1.TagValueElem.
type TagValueElem struct {
value []byte
deleted bool
}
func (e *TagValueElem) Value() []byte { return e.value }
func (e *TagValueElem) Deleted() bool { return e.deleted }
func (e *TagValueElem) SeriesIterator() tsi1.SeriesIterator { return nil }
// TagValueIterator represents an iterator over a slice of tag values.
type TagValueIterator struct {
Elems []TagValueElem
}
// Next returns the next element in the iterator.
func (itr *TagValueIterator) Next() (e tsi1.TagValueElem) {
if len(itr.Elems) == 0 {
return nil
}
e, itr.Elems = &itr.Elems[0], itr.Elems[1:]
return e
}
// SeriesElem represents a test implementation of tsi1.SeriesElem.
type SeriesElem struct {
name []byte
tags models.Tags
deleted bool
expr influxql.Expr
}
func (e *SeriesElem) Name() []byte { return e.name }
func (e *SeriesElem) Tags() models.Tags { return e.tags }
func (e *SeriesElem) Deleted() bool { return e.deleted }
func (e *SeriesElem) Expr() influxql.Expr { return e.expr }
// SeriesIterator represents an iterator over a slice of series elements.
type SeriesIterator struct {
Elems []SeriesElem
}
// Next returns the next element in the iterator.
func (itr *SeriesIterator) Next() (e tsi1.SeriesElem) {
if len(itr.Elems) == 0 {
return nil
}
e, itr.Elems = &itr.Elems[0], itr.Elems[1:]
return e
}
// MustTempDir returns a temporary directory. Panic on error.
func MustTempDir() string {
path, err := ioutil.TempDir("", "tsi-")
if err != nil {
panic(err)
}
return path
}

View File

@@ -0,0 +1,157 @@
// Code generated by protoc-gen-gogo.
// source: internal/meta.proto
// DO NOT EDIT!
/*
Package meta is a generated protocol buffer package.
It is generated from these files:
internal/meta.proto
It has these top-level messages:
Series
Tag
MeasurementFields
Field
*/
package meta
import proto "github.com/gogo/protobuf/proto"
import fmt "fmt"
import math "math"
// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf
// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package
type Series struct {
Key *string `protobuf:"bytes,1,req,name=Key" json:"Key,omitempty"`
Tags []*Tag `protobuf:"bytes,2,rep,name=Tags" json:"Tags,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *Series) Reset() { *m = Series{} }
func (m *Series) String() string { return proto.CompactTextString(m) }
func (*Series) ProtoMessage() {}
func (*Series) Descriptor() ([]byte, []int) { return fileDescriptorMeta, []int{0} }
func (m *Series) GetKey() string {
if m != nil && m.Key != nil {
return *m.Key
}
return ""
}
func (m *Series) GetTags() []*Tag {
if m != nil {
return m.Tags
}
return nil
}
type Tag struct {
Key *string `protobuf:"bytes,1,req,name=Key" json:"Key,omitempty"`
Value *string `protobuf:"bytes,2,req,name=Value" json:"Value,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *Tag) Reset() { *m = Tag{} }
func (m *Tag) String() string { return proto.CompactTextString(m) }
func (*Tag) ProtoMessage() {}
func (*Tag) Descriptor() ([]byte, []int) { return fileDescriptorMeta, []int{1} }
func (m *Tag) GetKey() string {
if m != nil && m.Key != nil {
return *m.Key
}
return ""
}
func (m *Tag) GetValue() string {
if m != nil && m.Value != nil {
return *m.Value
}
return ""
}
type MeasurementFields struct {
Fields []*Field `protobuf:"bytes,1,rep,name=Fields" json:"Fields,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *MeasurementFields) Reset() { *m = MeasurementFields{} }
func (m *MeasurementFields) String() string { return proto.CompactTextString(m) }
func (*MeasurementFields) ProtoMessage() {}
func (*MeasurementFields) Descriptor() ([]byte, []int) { return fileDescriptorMeta, []int{2} }
func (m *MeasurementFields) GetFields() []*Field {
if m != nil {
return m.Fields
}
return nil
}
type Field struct {
ID *int32 `protobuf:"varint,1,req,name=ID" json:"ID,omitempty"`
Name *string `protobuf:"bytes,2,req,name=Name" json:"Name,omitempty"`
Type *int32 `protobuf:"varint,3,req,name=Type" json:"Type,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *Field) Reset() { *m = Field{} }
func (m *Field) String() string { return proto.CompactTextString(m) }
func (*Field) ProtoMessage() {}
func (*Field) Descriptor() ([]byte, []int) { return fileDescriptorMeta, []int{3} }
func (m *Field) GetID() int32 {
if m != nil && m.ID != nil {
return *m.ID
}
return 0
}
func (m *Field) GetName() string {
if m != nil && m.Name != nil {
return *m.Name
}
return ""
}
func (m *Field) GetType() int32 {
if m != nil && m.Type != nil {
return *m.Type
}
return 0
}
func init() {
proto.RegisterType((*Series)(nil), "meta.Series")
proto.RegisterType((*Tag)(nil), "meta.Tag")
proto.RegisterType((*MeasurementFields)(nil), "meta.MeasurementFields")
proto.RegisterType((*Field)(nil), "meta.Field")
}
func init() { proto.RegisterFile("internal/meta.proto", fileDescriptorMeta) }
var fileDescriptorMeta = []byte{
// 180 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x09, 0x6e, 0x88, 0x02, 0xff, 0x54, 0x8c, 0xbd, 0xca, 0xc2, 0x30,
0x14, 0x40, 0x69, 0xd2, 0x16, 0x7a, 0xfb, 0x7d, 0x83, 0x71, 0x30, 0xe0, 0x52, 0x33, 0x75, 0x6a,
0xc5, 0x67, 0x10, 0x41, 0x44, 0x17, 0x83, 0xfb, 0x05, 0x2f, 0xa5, 0xd0, 0x3f, 0x92, 0x74, 0xe8,
0xdb, 0x4b, 0x52, 0x17, 0xb7, 0x73, 0xee, 0xcf, 0x81, 0x6d, 0x3b, 0x38, 0x32, 0x03, 0x76, 0x75,
0x4f, 0x0e, 0xab, 0xc9, 0x8c, 0x6e, 0x14, 0xb1, 0x67, 0x55, 0x41, 0xfa, 0x24, 0xd3, 0x92, 0x15,
0x39, 0xf0, 0x1b, 0x2d, 0x32, 0x2a, 0x58, 0x99, 0x89, 0x1d, 0xc4, 0x1a, 0x1b, 0x2b, 0x59, 0xc1,
0xcb, 0xfc, 0x94, 0x55, 0xe1, 0x4f, 0x63, 0xa3, 0x0e, 0xc0, 0x35, 0x36, 0xbf, 0xc7, 0xff, 0x90,
0xbc, 0xb0, 0x9b, 0x49, 0x32, 0xaf, 0xea, 0x08, 0x9b, 0x3b, 0xa1, 0x9d, 0x0d, 0xf5, 0x34, 0xb8,
0x4b, 0x4b, 0xdd, 0xdb, 0x8a, 0x3d, 0xa4, 0x2b, 0xc9, 0x28, 0x24, 0xf3, 0x35, 0x19, 0x66, 0xaa,
0x86, 0x24, 0x80, 0x00, 0x60, 0xd7, 0x73, 0xa8, 0x26, 0xe2, 0x0f, 0xe2, 0x07, 0xf6, 0xdf, 0xa8,
0x37, 0xbd, 0x4c, 0x24, 0xb9, 0xdf, 0x7d, 0x02, 0x00, 0x00, 0xff, 0xff, 0x04, 0x3d, 0x58, 0x4a,
0xd1, 0x00, 0x00, 0x00,
}

View File

@@ -0,0 +1,27 @@
package meta;
//========================================================================
//
// Metadata
//
//========================================================================
message Series {
required string Key = 1;
repeated Tag Tags = 2;
}
message Tag {
required string Key = 1;
required string Value = 2;
}
message MeasurementFields {
repeated Field Fields = 1;
}
message Field {
required int32 ID = 1;
required string Name = 2;
required int32 Type = 3;
}

107
vendor/github.com/influxdata/influxdb/tsdb/meta.go generated vendored Normal file
View File

@@ -0,0 +1,107 @@
package tsdb
//go:generate protoc --gogo_out=. internal/meta.proto
import (
"sort"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/pkg/escape"
)
// MarshalTags converts a tag set to bytes for use as a lookup key.
func MarshalTags(tags map[string]string) []byte {
// Empty maps marshal to empty bytes.
if len(tags) == 0 {
return nil
}
// Extract keys and determine final size.
sz := (len(tags) * 2) - 1 // separators
keys := make([]string, 0, len(tags))
for k, v := range tags {
keys = append(keys, k)
sz += len(k) + len(v)
}
sort.Strings(keys)
// Generate marshaled bytes.
b := make([]byte, sz)
buf := b
for _, k := range keys {
copy(buf, k)
buf[len(k)] = '|'
buf = buf[len(k)+1:]
}
for i, k := range keys {
v := tags[k]
copy(buf, v)
if i < len(keys)-1 {
buf[len(v)] = '|'
buf = buf[len(v)+1:]
}
}
return b
}
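// Sketch (added for illustration; not part of the original file): the keys
// are sorted and written first, then the values in matching order, all
// separated by '|'.
func exampleMarshalTags() []byte {
	return MarshalTags(map[string]string{
		"region": "uswest",
		"host":   "serverA",
	}) // => []byte("host|region|serverA|uswest")
}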
// MakeTagsKey converts a tag set to bytes for use as a lookup key.
func MakeTagsKey(keys []string, tags models.Tags) []byte {
// precondition: keys is sorted
// precondition: models.Tags is sorted
// Empty maps marshal to empty bytes.
if len(keys) == 0 || len(tags) == 0 {
return nil
}
sel := make([]int, 0, len(keys))
sz := 0
i, j := 0, 0
for i < len(keys) && j < len(tags) {
if keys[i] < string(tags[j].Key) {
i++
} else if keys[i] > string(tags[j].Key) {
j++
} else {
sel = append(sel, j)
sz += len(keys[i]) + len(tags[j].Value)
i++
j++
}
}
if len(sel) == 0 {
// no tags matched the requested keys
return nil
}
sz += (len(sel) * 2) - 1 // selected tags, add separators
// Generate marshaled bytes.
b := make([]byte, sz)
buf := b
for _, k := range sel {
copy(buf, tags[k].Key)
buf[len(tags[k].Key)] = '|'
buf = buf[len(tags[k].Key)+1:]
}
for i, k := range sel {
copy(buf, tags[k].Value)
if i < len(sel)-1 {
buf[len(tags[k].Value)] = '|'
buf = buf[len(tags[k].Value)+1:]
}
}
return b
}
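// Note (added for illustration; not in the original file): only tags whose
// keys appear in the sorted keys slice are encoded. For example,
// MakeTagsKey([]string{"baz", "foo", "zzz"}, models.NewTags(map[string]string{"foo": "bar"}))
// returns []byte("foo|bar").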
// MeasurementFromSeriesKey returns the name of the measurement from a key that
// contains a measurement name.
func MeasurementFromSeriesKey(key []byte) []byte {
// The error is ignored because ParseName can only fail with a "missing fields" error.
k, _ := models.ParseName(key)
return escape.Unescape(k)
}

260
vendor/github.com/influxdata/influxdb/tsdb/meta_test.go generated vendored Normal file
View File

@@ -0,0 +1,260 @@
package tsdb_test
import (
"bytes"
"fmt"
"testing"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxdb/tsdb"
"github.com/influxdata/influxdb/tsdb/index/inmem"
)
// Ensure tags can be marshaled into a byte slice.
func TestMarshalTags(t *testing.T) {
for i, tt := range []struct {
tags map[string]string
result []byte
}{
{
tags: nil,
result: nil,
},
{
tags: map[string]string{"foo": "bar"},
result: []byte(`foo|bar`),
},
{
tags: map[string]string{"foo": "bar", "baz": "battttt"},
result: []byte(`baz|foo|battttt|bar`),
},
{
tags: map[string]string{"baz": "battttt", "foo": "bar"},
result: []byte(`baz|foo|battttt|bar`),
},
} {
result := tsdb.MarshalTags(tt.tags)
if !bytes.Equal(result, tt.result) {
t.Fatalf("%d. unexpected result: exp=%s, got=%s", i, tt.result, result)
}
}
}
func BenchmarkMarshalTags_KeyN1(b *testing.B) { benchmarkMarshalTags(b, 1) }
func BenchmarkMarshalTags_KeyN3(b *testing.B) { benchmarkMarshalTags(b, 3) }
func BenchmarkMarshalTags_KeyN5(b *testing.B) { benchmarkMarshalTags(b, 5) }
func BenchmarkMarshalTags_KeyN10(b *testing.B) { benchmarkMarshalTags(b, 10) }
func benchmarkMarshalTags(b *testing.B, keyN int) {
const keySize, valueSize = 8, 15
// Generate tag map.
tags := make(map[string]string)
for i := 0; i < keyN; i++ {
tags[fmt.Sprintf("%0*d", keySize, i)] = fmt.Sprintf("%0*d", valueSize, i)
}
// Marshal map into byte slice.
b.ReportAllocs()
for i := 0; i < b.N; i++ {
tsdb.MarshalTags(tags)
}
}
// Ensure tags can be marshaled into a byte slice.
func TestMakeTagsKey(t *testing.T) {
for i, tt := range []struct {
keys []string
tags models.Tags
result []byte
}{
{
keys: nil,
tags: nil,
result: nil,
},
{
keys: []string{"foo"},
tags: models.NewTags(map[string]string{"foo": "bar"}),
result: []byte(`foo|bar`),
},
{
keys: []string{"foo"},
tags: models.NewTags(map[string]string{"baz": "battttt"}),
result: []byte(``),
},
{
keys: []string{"baz", "foo"},
tags: models.NewTags(map[string]string{"baz": "battttt"}),
result: []byte(`baz|battttt`),
},
{
keys: []string{"baz", "foo", "zzz"},
tags: models.NewTags(map[string]string{"foo": "bar"}),
result: []byte(`foo|bar`),
},
{
keys: []string{"baz", "foo"},
tags: models.NewTags(map[string]string{"foo": "bar", "baz": "battttt"}),
result: []byte(`baz|foo|battttt|bar`),
},
{
keys: []string{"baz"},
tags: models.NewTags(map[string]string{"baz": "battttt", "foo": "bar"}),
result: []byte(`baz|battttt`),
},
} {
result := tsdb.MakeTagsKey(tt.keys, tt.tags)
if !bytes.Equal(result, tt.result) {
t.Fatalf("%d. unexpected result: exp=%s, got=%s", i, tt.result, result)
}
}
}
func BenchmarkMakeTagsKey_KeyN1(b *testing.B) { benchmarkMakeTagsKey(b, 1) }
func BenchmarkMakeTagsKey_KeyN3(b *testing.B) { benchmarkMakeTagsKey(b, 3) }
func BenchmarkMakeTagsKey_KeyN5(b *testing.B) { benchmarkMakeTagsKey(b, 5) }
func BenchmarkMakeTagsKey_KeyN10(b *testing.B) { benchmarkMakeTagsKey(b, 10) }
func makeTagsAndKeys(keyN int) ([]string, models.Tags) {
const keySize, valueSize = 8, 15
// Generate tag map.
keys := make([]string, keyN)
tags := make(map[string]string)
for i := 0; i < keyN; i++ {
keys[i] = fmt.Sprintf("%0*d", keySize, i)
tags[keys[i]] = fmt.Sprintf("%0*d", valueSize, i)
}
return keys, models.NewTags(tags)
}
func benchmarkMakeTagsKey(b *testing.B, keyN int) {
keys, tags := makeTagsAndKeys(keyN)
// Marshal the keys and tags into a byte slice.
b.ReportAllocs()
for i := 0; i < b.N; i++ {
tsdb.MakeTagsKey(keys, tags)
}
}
type TestSeries struct {
Measurement string
Series *inmem.Series
}
func genTestSeries(mCnt, tCnt, vCnt int) []*TestSeries {
measurements := genStrList("measurement", mCnt)
tagSets := NewTagSetGenerator(tCnt, vCnt).AllSets()
series := []*TestSeries{}
for _, m := range measurements {
for _, ts := range tagSets {
series = append(series, &TestSeries{
Measurement: m,
Series: inmem.NewSeries([]byte(fmt.Sprintf("%s:%s", m, string(tsdb.MarshalTags(ts)))), models.NewTags(ts)),
})
}
}
return series
}
type TagValGenerator struct {
Key string
Vals []string
idx int
}
func NewTagValGenerator(tagKey string, nVals int) *TagValGenerator {
tvg := &TagValGenerator{Key: tagKey}
for i := 0; i < nVals; i++ {
tvg.Vals = append(tvg.Vals, fmt.Sprintf("tagValue%d", i))
}
return tvg
}
func (tvg *TagValGenerator) First() string {
tvg.idx = 0
return tvg.Curr()
}
func (tvg *TagValGenerator) Curr() string {
return tvg.Vals[tvg.idx]
}
func (tvg *TagValGenerator) Next() string {
tvg.idx++
if tvg.idx >= len(tvg.Vals) {
tvg.idx--
return ""
}
return tvg.Curr()
}
type TagSet map[string]string
type TagSetGenerator struct {
TagVals []*TagValGenerator
}
func NewTagSetGenerator(nSets int, nTagVals ...int) *TagSetGenerator {
tsg := &TagSetGenerator{}
for i := 0; i < nSets; i++ {
nVals := nTagVals[0]
if i < len(nTagVals) {
nVals = nTagVals[i]
}
tagKey := fmt.Sprintf("tagKey%d", i)
tsg.TagVals = append(tsg.TagVals, NewTagValGenerator(tagKey, nVals))
}
return tsg
}
func (tsg *TagSetGenerator) First() TagSet {
for _, tsv := range tsg.TagVals {
tsv.First()
}
return tsg.Curr()
}
func (tsg *TagSetGenerator) Curr() TagSet {
ts := TagSet{}
for _, tvg := range tsg.TagVals {
ts[tvg.Key] = tvg.Curr()
}
return ts
}
func (tsg *TagSetGenerator) Next() TagSet {
val := ""
for _, tsv := range tsg.TagVals {
if val = tsv.Next(); val != "" {
break
} else {
tsv.First()
}
}
if val == "" {
return nil
}
return tsg.Curr()
}
func (tsg *TagSetGenerator) AllSets() []TagSet {
allSets := []TagSet{}
for ts := tsg.First(); ts != nil; ts = tsg.Next() {
allSets = append(allSets, ts)
}
return allSets
}
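// Sketch (added for illustration; not part of the original file): a
// generator over two tag keys with 2 and 3 values respectively enumerates
// all 2*3 = 6 combinations, odometer-style.
func exampleAllSets() int {
	tsg := NewTagSetGenerator(2, 2, 3)
	return len(tsg.AllSets()) // 6
}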
func genStrList(prefix string, n int) []string {
lst := make([]string, 0, n)
for i := 0; i < n; i++ {
lst = append(lst, fmt.Sprintf("%s%d", prefix, i))
}
return lst
}

1651
vendor/github.com/influxdata/influxdb/tsdb/shard.go generated vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,256 @@
package tsdb
import (
"fmt"
"io/ioutil"
"os"
"path"
"path/filepath"
"regexp"
"sort"
"strings"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/influxdata/influxdb/influxql"
"github.com/influxdata/influxdb/models"
)
func TestShard_MapType(t *testing.T) {
var sh *TempShard
setup := func(index string) {
sh = NewTempShard(index)
if err := sh.Open(); err != nil {
t.Fatal(err)
}
sh.MustWritePointsString(`
cpu,host=serverA,region=uswest value=100 0
cpu,host=serverA,region=uswest value=50,val2=5 10
cpu,host=serverB,region=uswest value=25 0
mem,host=serverA value=25i 0
mem,host=serverB value=50i,val3=t 10
_reserved,region=uswest value="foo" 0
`)
}
for _, index := range RegisteredIndexes() {
setup(index)
for _, tt := range []struct {
measurement string
field string
typ influxql.DataType
}{
{
measurement: "cpu",
field: "value",
typ: influxql.Float,
},
{
measurement: "cpu",
field: "host",
typ: influxql.Tag,
},
{
measurement: "cpu",
field: "region",
typ: influxql.Tag,
},
{
measurement: "cpu",
field: "val2",
typ: influxql.Float,
},
{
measurement: "cpu",
field: "unknown",
typ: influxql.Unknown,
},
{
measurement: "mem",
field: "value",
typ: influxql.Integer,
},
{
measurement: "mem",
field: "val3",
typ: influxql.Boolean,
},
{
measurement: "mem",
field: "host",
typ: influxql.Tag,
},
{
measurement: "unknown",
field: "unknown",
typ: influxql.Unknown,
},
{
measurement: "_fieldKeys",
field: "fieldKey",
typ: influxql.String,
},
{
measurement: "_fieldKeys",
field: "fieldType",
typ: influxql.String,
},
{
measurement: "_fieldKeys",
field: "unknown",
typ: influxql.Unknown,
},
{
measurement: "_series",
field: "key",
typ: influxql.String,
},
{
measurement: "_series",
field: "unknown",
typ: influxql.Unknown,
},
{
measurement: "_tagKeys",
field: "tagKey",
typ: influxql.String,
},
{
measurement: "_tagKeys",
field: "unknown",
typ: influxql.Unknown,
},
{
measurement: "_reserved",
field: "value",
typ: influxql.String,
},
{
measurement: "_reserved",
field: "region",
typ: influxql.Tag,
},
} {
name := fmt.Sprintf("%s_%s_%s", index, tt.measurement, tt.field)
t.Run(name, func(t *testing.T) {
typ, err := sh.mapType(tt.measurement, tt.field)
if err != nil {
t.Fatal(err)
}
if have, want := typ, tt.typ; have != want {
t.Errorf("unexpected data type: have=%#v want=%#v", have, want)
}
})
}
sh.Close()
}
}
func TestShard_MeasurementsByRegex(t *testing.T) {
var sh *TempShard
setup := func(index string) {
sh = NewTempShard(index)
if err := sh.Open(); err != nil {
t.Fatal(err)
}
sh.MustWritePointsString(`
cpu,host=serverA,region=uswest value=100 0
cpu,host=serverA,region=uswest value=50,val2=5 10
cpu,host=serverB,region=uswest value=25 0
mem,host=serverA value=25i 0
mem,host=serverB value=50i,val3=t 10
`)
}
for _, index := range RegisteredIndexes() {
setup(index)
for _, tt := range []struct {
regex string
measurements []string
}{
{regex: `cpu`, measurements: []string{"cpu"}},
{regex: `mem`, measurements: []string{"mem"}},
{regex: `cpu|mem`, measurements: []string{"cpu", "mem"}},
{regex: `gpu`, measurements: []string{}},
{regex: `pu`, measurements: []string{"cpu"}},
{regex: `p|m`, measurements: []string{"cpu", "mem"}},
} {
t.Run(index+"_"+tt.regex, func(t *testing.T) {
re := regexp.MustCompile(tt.regex)
measurements, err := sh.MeasurementNamesByRegex(re)
if err != nil {
t.Fatal(err)
}
mstrings := make([]string, 0, len(measurements))
for _, name := range measurements {
mstrings = append(mstrings, string(name))
}
sort.Strings(mstrings)
if diff := cmp.Diff(tt.measurements, mstrings, cmpopts.EquateEmpty()); diff != "" {
t.Errorf("unexpected measurements:\n%s", diff)
}
})
}
sh.Close()
}
}
// TempShard represents a test wrapper for Shard that uses temporary
// filesystem paths.
type TempShard struct {
*Shard
path string
}
// NewTempShard returns a new instance of TempShard with temp paths.
func NewTempShard(index string) *TempShard {
// Create temporary path for data and WAL.
dir, err := ioutil.TempDir("", "influxdb-tsdb-")
if err != nil {
panic(err)
}
// Build engine options.
opt := NewEngineOptions()
opt.IndexVersion = index
opt.Config.WALDir = filepath.Join(dir, "wal")
if index == "inmem" {
opt.InmemIndex, _ = NewInmemIndex(path.Base(dir))
}
return &TempShard{
Shard: NewShard(0,
filepath.Join(dir, "data", "db0", "rp0", "1"),
filepath.Join(dir, "wal", "db0", "rp0", "1"),
opt,
),
path: dir,
}
}
// Close closes the shard and removes all underlying data.
func (sh *TempShard) Close() error {
defer os.RemoveAll(sh.path)
return sh.Shard.Close()
}
// MustWritePointsString parses the line protocol (with second precision) and
// inserts the resulting points into the shard. Panic on error.
func (sh *TempShard) MustWritePointsString(s string) {
a, err := models.ParsePointsWithPrecision([]byte(strings.TrimSpace(s)), time.Time{}, "s")
if err != nil {
panic(err)
}
if err := sh.WritePoints(a); err != nil {
panic(err)
}
}
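// Sketch (added for illustration; not part of the original file): typical
// TempShard usage, mirroring the setup helpers in the tests above.
func exampleTempShard() {
	sh := NewTempShard("inmem")
	if err := sh.Open(); err != nil {
		panic(err)
	}
	defer sh.Close()
	sh.MustWritePointsString(`cpu,host=serverA value=100 0`)
}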

1162
vendor/github.com/influxdata/influxdb/tsdb/shard_test.go generated vendored Normal file

File diff suppressed because it is too large

1389
vendor/github.com/influxdata/influxdb/tsdb/store.go generated vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,167 @@
package tsdb
import (
"fmt"
"reflect"
"sort"
"testing"
)
func TestStore_mergeTagValues(t *testing.T) {
examples := []struct {
in []tagValues
out TagValues
}{
{},
{in: make([]tagValues, 4), out: TagValues{Values: []KeyValue{}}},
{
in: []tagValues{createtagValues("m0", map[string][]string{"host": {"server-a", "server-b", "server-c"}})},
out: createTagValues("m0", map[string][]string{"host": {"server-a", "server-b", "server-c"}}),
},
{
in: []tagValues{
createtagValues("m0", map[string][]string{"host": {"server-a", "server-b", "server-c"}}),
createtagValues("m0", map[string][]string{"host": {"server-a", "server-b", "server-c"}}),
},
out: createTagValues("m0", map[string][]string{"host": {"server-a", "server-b", "server-c"}}),
},
{
in: []tagValues{
createtagValues("m0", map[string][]string{"host": {"server-a", "server-b", "server-c"}}),
createtagValues("m0", map[string][]string{"host": {"server-a", "server-d", "server-e"}}),
},
out: createTagValues("m0", map[string][]string{"host": {"server-a", "server-b", "server-c", "server-d", "server-e"}}),
},
{
in: []tagValues{
createtagValues("m0", map[string][]string{"host": {"server-a"}}),
createtagValues("m0", map[string][]string{}),
createtagValues("m0", map[string][]string{"host": {"server-a"}}),
},
out: createTagValues("m0", map[string][]string{"host": {"server-a"}}),
},
{
in: []tagValues{
createtagValues("m0", map[string][]string{"host": {"server-q", "server-z"}}),
createtagValues("m0", map[string][]string{"host": {"server-a", "server-b", "server-c"}}),
createtagValues("m0", map[string][]string{"host": {"server-a", "server-d", "server-e"}}),
createtagValues("m0", map[string][]string{"host": {"server-e", "server-q", "server-z"}}),
createtagValues("m0", map[string][]string{"host": {"server-a"}}),
},
out: createTagValues("m0", map[string][]string{"host": {"server-a", "server-b", "server-c", "server-d", "server-e", "server-q", "server-z"}}),
},
{
in: []tagValues{
createtagValues("m0", map[string][]string{"a": {"0", "1"}, "host1": {"server-q", "server-z"}}),
createtagValues("m0", map[string][]string{"a": {"0", "2"}, "host2": {"server-a", "server-b", "server-c"}}),
createtagValues("m0", map[string][]string{"a": {"0", "3"}, "host3": {"server-a", "server-d", "server-e"}}),
createtagValues("m0", map[string][]string{"a": {"0", "4"}, "host4": {"server-e", "server-q", "server-z"}}),
createtagValues("m0", map[string][]string{"a": {"0", "5"}, "host5": {"server-a"}}),
},
out: createTagValues("m0", map[string][]string{
"a": {"0", "1", "2", "3", "4", "5"},
"host1": {"server-q", "server-z"},
"host2": {"server-a", "server-b", "server-c"},
"host3": {"server-a", "server-d", "server-e"},
"host4": {"server-e", "server-q", "server-z"},
"host5": {"server-a"},
}),
},
{
in: []tagValues{
createtagValues("m0", map[string][]string{"region": {"east-1", "west-1"}, "host": {"server-a", "server-b", "server-c"}}),
createtagValues("m0", map[string][]string{"region": {"north-1", "west-1"}, "host": {"server-a", "server-d", "server-e"}}),
},
out: createTagValues("m0", map[string][]string{
"host": {"server-a", "server-b", "server-c", "server-d", "server-e"},
"region": {"east-1", "north-1", "west-1"},
}),
},
{
in: []tagValues{
createtagValues("m0", map[string][]string{"region": {"east-1", "west-1"}, "host": {"server-a", "server-b", "server-c"}}),
createtagValues("m0", map[string][]string{"city": {"Baltimore", "Las Vegas"}}),
},
out: createTagValues("m0", map[string][]string{
"city": {"Baltimore", "Las Vegas"},
"host": {"server-a", "server-b", "server-c"},
"region": {"east-1", "west-1"},
}),
},
{
in: []tagValues{
createtagValues("m0", map[string][]string{"city": {"Baltimore", "Las Vegas"}}),
createtagValues("m0", map[string][]string{"region": {"east-1", "west-1"}, "host": {"server-a", "server-b", "server-c"}}),
},
out: createTagValues("m0", map[string][]string{
"city": {"Baltimore", "Las Vegas"},
"host": {"server-a", "server-b", "server-c"},
"region": {"east-1", "west-1"},
}),
},
{
in: []tagValues{
createtagValues("m0", map[string][]string{"region": {"east-1", "west-1"}, "host": {"server-a", "server-b", "server-c"}}),
createtagValues("m0", map[string][]string{}),
},
out: createTagValues("m0", map[string][]string{
"host": {"server-a", "server-b", "server-c"},
"region": {"east-1", "west-1"},
}),
},
}
buf := make([][2]int, 10)
for i, example := range examples {
t.Run(fmt.Sprintf("example_%d", i+1), func(t *testing.T) {
if got, exp := mergeTagValues(buf, example.in...), example.out; !reflect.DeepEqual(got, exp) {
t.Fatalf("\ngot\n %#v\n\n expected\n %#v", got, exp)
}
})
}
}
// Helper to create some tagValues.
func createtagValues(mname string, kvs map[string][]string) tagValues {
out := tagValues{
name: []byte(mname),
keys: make([]string, 0, len(kvs)),
values: make([][]string, len(kvs)),
}
for k := range kvs {
out.keys = append(out.keys, k)
}
sort.Sort(sort.StringSlice(out.keys))
for i, k := range out.keys {
values := kvs[k]
sort.Sort(sort.StringSlice(values))
out.values[i] = values
}
return out
}
// Helper to create some TagValues
func createTagValues(mname string, kvs map[string][]string) TagValues {
var sz int
for _, v := range kvs {
sz += len(v)
}
out := TagValues{
Measurement: mname,
Values: make([]KeyValue, 0, sz),
}
for tk, tvs := range kvs {
for _, tv := range tvs {
out.Values = append(out.Values, KeyValue{Key: tk, Value: tv})
}
// We have to sort the KeyValues since that's how they're provided from
// the Store.
sort.Sort(KeyValues(out.Values))
}
return out
}

1261
vendor/github.com/influxdata/influxdb/tsdb/store_test.go generated vendored Normal file

File diff suppressed because it is too large