package tsm1

// Compactions are the process of creating read-optimized TSM files.
// The files are created by converting write-optimized WAL entries
// to read-optimized TSM format. They can also be created from existing
// TSM files when there are tombstone records that need to be removed, points
// that were overwritten by later writes and need to be updated, or multiple
// smaller TSM files need to be merged to reduce file counts and improve
// compression ratios.
//
// The compaction process is stream-oriented using multiple readers and
// iterators. The resulting stream is written sorted and chunked to allow for
// one-pass writing of a new TSM file.

import (
	"fmt"
	"math"
	"os"
	"path/filepath"
	"runtime"
	"sort"
	"sync"
	"sync/atomic"
	"time"

	"github.com/influxdata/influxdb/tsdb"
)

const maxTSMFileSize = uint32(2048 * 1024 * 1024) // 2GB

const (
	// CompactionTempExtension is the extension used for temporary files created during compaction.
	CompactionTempExtension = "tmp"

	// TSMFileExtension is the extension used for TSM files.
	TSMFileExtension = "tsm"
)

var (
	errMaxFileExceeded     = fmt.Errorf("max file exceeded")
	errSnapshotsDisabled   = fmt.Errorf("snapshots disabled")
	errCompactionsDisabled = fmt.Errorf("compactions disabled")
	errCompactionAborted   = fmt.Errorf("compaction aborted")
)

type errCompactionInProgress struct {
	err error
}

// Error returns the string representation of the error, to satisfy the error interface.
func (e errCompactionInProgress) Error() string {
	if e.err != nil {
		return fmt.Sprintf("compaction in progress: %s", e.err)
	}
	return "compaction in progress"
}

// CompactionGroup represents a list of files eligible to be compacted together.
type CompactionGroup []string

// CompactionPlanner determines what TSM files and WAL segments to include in a
// given compaction run.
type CompactionPlanner interface {
	Plan(lastWrite time.Time) []CompactionGroup
	PlanLevel(level int) []CompactionGroup
	PlanOptimize() []CompactionGroup
	Release(group []CompactionGroup)
	FullyCompacted() bool
}

// DefaultPlanner implements CompactionPlanner using a strategy to roll up
// multiple generations of TSM files into larger files in stages. It attempts
// to minimize the number of TSM files on disk while rolling up a bounded number
// of files.
type DefaultPlanner struct {
	FileStore fileStore

	// compactFullWriteColdDuration specifies the length of time after
	// which if no writes have been committed to the WAL, the engine will
	// do a full compaction of the TSM files in this shard. This duration
	// should always be greater than the CacheFlushWriteColdDuration.
	compactFullWriteColdDuration time.Duration

	// lastPlanCheck is the last time Plan was called.
	lastPlanCheck time.Time

	mu sync.RWMutex

	// lastFindGenerations is the last time findGenerations was run.
	lastFindGenerations time.Time

	// lastGenerations is the last set of generations found by findGenerations.
	lastGenerations tsmGenerations

	// filesInUse is the set of files that have been returned as part of a plan and may
	// be undergoing compaction. Two plans should not return the same file at any given time.
	filesInUse map[string]struct{}
}

type fileStore interface {
	Stats() []FileStat
	LastModified() time.Time
	BlockCount(path string, idx int) int
}

// NewDefaultPlanner returns a new DefaultPlanner backed by the given file
// store that triggers full compactions after writeColdDuration of inactivity.
func NewDefaultPlanner(fs fileStore, writeColdDuration time.Duration) *DefaultPlanner {
	return &DefaultPlanner{
		FileStore:                    fs,
		compactFullWriteColdDuration: writeColdDuration,
		filesInUse:                   make(map[string]struct{}),
	}
}
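// planLevelExample is a minimal sketch, not called by the engine, of how a
// caller might drive the planner; the fileStore value fs and the 4h cold
// duration are illustrative assumptions, not values taken from this package.
func planLevelExample(fs fileStore) {
	planner := NewDefaultPlanner(fs, 4*time.Hour)

	// Each returned group is a set of TSM file paths that can be compacted together.
	groups := planner.PlanLevel(1)

	// The planner marks returned files as in use; Release must be called when
	// the caller is done so future plans can pick those files up again.
	defer planner.Release(groups)

	for _, group := range groups {
		fmt.Println("would compact:", group)
	}
}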
// tsmGeneration represents the TSM files within a generation.
// 000001-01.tsm, 000001-02.tsm would be in the same generation
// 000001 each with different sequence numbers.
type tsmGeneration struct {
	id    int
	files []FileStat
}

// size returns the total size of the files in the generation.
func (t *tsmGeneration) size() uint64 {
	var n uint64
	for _, f := range t.files {
		n += uint64(f.Size)
	}
	return n
}

// level returns the compaction level of the files in this generation.
func (t *tsmGeneration) level() int {
	// Level 0 is always created from the result of a cache compaction. It generates
	// 1 file with a sequence num of 1. Level 2 is generated by compacting multiple
	// level 1 files. Level 3 is generated by compacting multiple level 2 files. Level
	// 4 is for anything else.
	_, seq, _ := ParseTSMFileName(t.files[len(t.files)-1].Path)
	if seq < 4 {
		return seq
	}

	return 4
}

// count returns the number of files in the generation.
func (t *tsmGeneration) count() int {
	return len(t.files)
}

// hasTombstones returns true if there are keys removed for any of the files.
func (t *tsmGeneration) hasTombstones() bool {
	for _, f := range t.files {
		if f.HasTombstone {
			return true
		}
	}
	return false
}

// FullyCompacted returns true if the shard is fully compacted.
func (c *DefaultPlanner) FullyCompacted() bool {
	gens := c.findGenerations()
	return len(gens) <= 1 && !gens.hasTombstones()
}

// PlanLevel returns a set of TSM files to rewrite for a specific level.
func (c *DefaultPlanner) PlanLevel(level int) []CompactionGroup {
	// Determine the generations from all files on disk. We need to treat
	// a generation conceptually as a single file even though it may be
	// split across several files in sequence.
	generations := c.findGenerations()

	// If there is only one generation and no tombstones, then there's nothing to
	// do.
	if len(generations) <= 1 && !generations.hasTombstones() {
		return nil
	}

	// Group each generation by level such that two adjacent generations in the same
	// level become part of the same group.
	var currentGen tsmGenerations
	var groups []tsmGenerations
	for i := 0; i < len(generations); i++ {
		cur := generations[i]

		if len(currentGen) == 0 || currentGen.level() == cur.level() {
			currentGen = append(currentGen, cur)
			continue
		}
		groups = append(groups, currentGen)

		currentGen = tsmGenerations{}
		currentGen = append(currentGen, cur)
	}

	if len(currentGen) > 0 {
		groups = append(groups, currentGen)
	}

	// Remove any groups in the wrong level.
	var levelGroups []tsmGenerations
	for _, cur := range groups {
		if cur.level() == level {
			levelGroups = append(levelGroups, cur)
		}
	}

	// Determine the minimum number of generations required for the level. Higher levels are more
	// CPU intensive so we only want to include them when we have enough data to make them
	// worthwhile.
	// minGenerations 1 -> 2
	// minGenerations 2 -> 2
	// minGenerations 3 -> 4
	// minGenerations 4 -> 4
	minGenerations := level
	if minGenerations%2 != 0 {
		minGenerations = level + 1
	}

	var cGroups []CompactionGroup
	for _, group := range levelGroups {
		for _, chunk := range group.chunk(4) {
			var cGroup CompactionGroup
			var hasTombstones bool
			for _, gen := range chunk {
				if gen.hasTombstones() {
					hasTombstones = true
				}
				for _, file := range gen.files {
					cGroup = append(cGroup, file.Path)
				}
			}

			if len(chunk) < minGenerations && !hasTombstones {
				continue
			}

			cGroups = append(cGroups, cGroup)
		}
	}

	if !c.acquire(cGroups) {
		return nil
	}

	return cGroups
}
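// generationLevelExample is an illustrative sketch, not engine code, showing
// how the sequence portion of a TSM file name maps to a compaction level in
// tsmGeneration.level: sequences 1-3 map directly, everything else is level 4.
// The file names below are hypothetical.
func generationLevelExample() {
	for _, path := range []string{
		"000000001-000000001.tsm", // level 1: written from a cache snapshot
		"000000001-000000002.tsm", // level 2: compacted from level 1 files
		"000000001-000000003.tsm", // level 3: compacted from level 2 files
		"000000001-000000009.tsm", // level 4: any sequence >= 4
	} {
		gen, seq, err := ParseTSMFileName(path)
		if err != nil {
			continue
		}
		level := seq
		if level > 4 {
			level = 4
		}
		fmt.Printf("generation %d, sequence %d -> level %d\n", gen, seq, level)
	}
}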
// PlanOptimize returns all TSM files if they are in different generations in order
// to optimize the index across TSM files. Each returned compaction group can be
// compacted concurrently.
func (c *DefaultPlanner) PlanOptimize() []CompactionGroup {
	// Determine the generations from all files on disk. We need to treat
	// a generation conceptually as a single file even though it may be
	// split across several files in sequence.
	generations := c.findGenerations()

	// If there is only one generation and no tombstones, then there's nothing to
	// do.
	if len(generations) <= 1 && !generations.hasTombstones() {
		return nil
	}

	// Group each generation by level such that two adjacent generations in the same
	// level become part of the same group.
	var currentGen tsmGenerations
	var groups []tsmGenerations
	for i := 0; i < len(generations); i++ {
		cur := generations[i]

		if len(currentGen) == 0 || currentGen.level() == cur.level() {
			currentGen = append(currentGen, cur)
			continue
		}
		groups = append(groups, currentGen)

		currentGen = tsmGenerations{}
		currentGen = append(currentGen, cur)
	}

	if len(currentGen) > 0 {
		groups = append(groups, currentGen)
	}

	// Only optimize level 4 files since using lower levels would collide
	// with the level planners.
	var levelGroups []tsmGenerations
	for _, cur := range groups {
		if cur.level() == 4 {
			levelGroups = append(levelGroups, cur)
		}
	}

	var cGroups []CompactionGroup
	for _, group := range levelGroups {
		// Skip the group if it's not worthwhile to optimize it.
		if len(group) < 4 && !group.hasTombstones() {
			continue
		}

		var cGroup CompactionGroup
		for _, gen := range group {
			for _, file := range gen.files {
				cGroup = append(cGroup, file.Path)
			}
		}

		cGroups = append(cGroups, cGroup)
	}

	if !c.acquire(cGroups) {
		return nil
	}

	return cGroups
}
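// planOptimizeExample is a hedged sketch, not engine code, of consuming
// optimize plans: the doc comment above states each returned group can be
// compacted concurrently, so a caller could fan them out to goroutines. The
// compactor c is assumed to be already opened.
func planOptimizeExample(p *DefaultPlanner, c *Compactor) {
	groups := p.PlanOptimize()
	defer p.Release(groups)

	var wg sync.WaitGroup
	for _, group := range groups {
		wg.Add(1)
		go func(g CompactionGroup) {
			defer wg.Done()
			if _, err := c.CompactFull(g); err != nil {
				fmt.Println("optimize compaction failed:", err)
			}
		}(group)
	}
	wg.Wait()
}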
// Plan returns a set of TSM files to rewrite for level 4 or higher. The planning returns
// multiple groups if possible to allow compactions to run concurrently.
func (c *DefaultPlanner) Plan(lastWrite time.Time) []CompactionGroup {
	generations := c.findGenerations()

	// First check if we should be doing a full compaction because nothing has been written in a long time.
	if c.compactFullWriteColdDuration > 0 && time.Since(lastWrite) > c.compactFullWriteColdDuration && len(generations) > 1 {
		var tsmFiles []string
		var genCount int
		for i, group := range generations {
			var skip bool

			// Skip the file if it's over the max size and contains a full block and it does not have any tombstones.
			if len(generations) > 2 && group.size() > uint64(maxTSMFileSize) && c.FileStore.BlockCount(group.files[0].Path, 1) == tsdb.DefaultMaxPointsPerBlock && !group.hasTombstones() {
				skip = true
			}

			// We need to look at the level of the next file because it may need to be combined with this generation
			// but won't get picked up on its own if this generation is skipped. This allows the most recently
			// created files to get picked up by the full compaction planner and avoids having a few
			// suboptimally compressed files.
			if i < len(generations)-1 {
				if generations[i+1].level() <= 3 {
					skip = false
				}
			}

			if skip {
				continue
			}

			for _, f := range group.files {
				tsmFiles = append(tsmFiles, f.Path)
			}
			genCount++
		}
		sort.Strings(tsmFiles)

		// Make sure we have more than 1 file and more than 1 generation.
		if len(tsmFiles) <= 1 || genCount <= 1 {
			return nil
		}

		group := []CompactionGroup{tsmFiles}
		if !c.acquire(group) {
			return nil
		}
		return group
	}

	// Don't plan if nothing has changed in the filestore.
	if c.lastPlanCheck.After(c.FileStore.LastModified()) && !generations.hasTombstones() {
		return nil
	}

	c.lastPlanCheck = time.Now()

	// If there is only one generation, return early to avoid re-compacting the same file
	// over and over again.
	if len(generations) <= 1 && !generations.hasTombstones() {
		return nil
	}

	// Need to find the ending point for level 4 files. They will be the oldest files. We scan
	// the generations in order and break once we see a generation below level 4.
	end := 0
	start := 0
	for i, g := range generations {
		if g.level() <= 3 {
			break
		}
		end = i + 1
	}

	// As compactions run, the oldest files get bigger. We don't want to re-compact them during
	// this planning if they are maxed out, so skip over any we see.
	var hasTombstones bool
	for i, g := range generations[:end] {
		if g.hasTombstones() {
			hasTombstones = true
		}

		if hasTombstones {
			continue
		}

		// Skip the file if it's over the max size and contains a full block or the generation is split
		// over multiple files. In the latter case, that would mean the data in the file spilled over
		// the 2GB limit.
		if g.size() > uint64(maxTSMFileSize) && c.FileStore.BlockCount(g.files[0].Path, 1) == tsdb.DefaultMaxPointsPerBlock || g.count() > 1 {
			start = i + 1
		}

		// This is an edge case that can happen after multiple compactions run. The files at the beginning
		// can become larger faster than ones after them. We want to skip those really big ones and just
		// compact the smaller ones until they are closer in size.
		if i > 0 {
			if g.size()*2 < generations[i-1].size() {
				start = i
				break
			}
		}
	}

	// step is how many generations to compact in a group. We want to clamp it at 4 but also still
	// return groups smaller than 4.
	step := 4
	if step > end {
		step = end
	}

	// Slice off the generations that we'll examine.
	generations = generations[start:end]

	// Loop through the generations in groups of size step and see if we can compact all (or
	// some of them as a group).
	groups := []tsmGenerations{}
	for i := 0; i < len(generations); i += step {
		var skipGroup bool
		startIndex := i

		for j := i; j < i+step && j < len(generations); j++ {
			gen := generations[j]
			lvl := gen.level()

			// Skip compacting this group if there happens to be any lower level files in the
			// middle. These will get picked up by the level compactors.
			if lvl <= 3 {
				skipGroup = true
				break
			}

			// Skip the file if it's over the max size and it contains a full block.
			if gen.size() >= uint64(maxTSMFileSize) && c.FileStore.BlockCount(gen.files[0].Path, 1) == tsdb.DefaultMaxPointsPerBlock && !gen.hasTombstones() {
				startIndex++
				continue
			}
		}

		if skipGroup {
			continue
		}

		endIndex := i + step
		if endIndex > len(generations) {
			endIndex = len(generations)
		}
		if endIndex-startIndex > 0 {
			groups = append(groups, generations[startIndex:endIndex])
		}
	}

	if len(groups) == 0 {
		return nil
	}

	// With the groups, we need to evaluate whether the group as a whole can be compacted.
	compactable := []tsmGenerations{}
	for _, group := range groups {
		// If we don't have enough generations to compact, skip it.
		if len(group) < 2 && !group.hasTombstones() {
			continue
		}
		compactable = append(compactable, group)
	}

	// All the files to be compacted must be compacted in order. We need to convert each
	// group to the actual set of files in that group to be compacted.
	var tsmFiles []CompactionGroup
	for _, c := range compactable {
		var cGroup CompactionGroup
		for _, group := range c {
			for _, f := range group.files {
				cGroup = append(cGroup, f.Path)
			}
		}
		sort.Strings(cGroup)
		tsmFiles = append(tsmFiles, cGroup)
	}

	if !c.acquire(tsmFiles) {
		return nil
	}
	return tsmFiles
}
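// fullCompactionExample is an illustrative sketch, not engine code, of the
// cold-shard path: when nothing has been written since lastWrite for longer
// than the configured write-cold duration, Plan returns a single group
// covering the whole shard. lastWrite is assumed to be tracked by the caller.
func fullCompactionExample(p *DefaultPlanner, c *Compactor, lastWrite time.Time) ([]string, error) {
	groups := p.Plan(lastWrite)
	defer p.Release(groups)

	var newFiles []string
	for _, group := range groups {
		files, err := c.CompactFull(group)
		if err != nil {
			return nil, err
		}
		newFiles = append(newFiles, files...)
	}
	return newFiles, nil
}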
// findGenerations groups all the TSM files by generation based
// on their filename, then returns the generations in ascending order (oldest first).
func (c *DefaultPlanner) findGenerations() tsmGenerations {
	c.mu.Lock()
	defer c.mu.Unlock()

	last := c.lastFindGenerations
	lastGen := c.lastGenerations

	if !last.IsZero() && c.FileStore.LastModified().Equal(last) {
		return lastGen
	}

	genTime := c.FileStore.LastModified()
	tsmStats := c.FileStore.Stats()
	generations := make(map[int]*tsmGeneration, len(tsmStats))
	for _, f := range tsmStats {
		gen, _, _ := ParseTSMFileName(f.Path)

		group := generations[gen]
		if group == nil {
			group = &tsmGeneration{
				id: gen,
			}
			generations[gen] = group
		}
		group.files = append(group.files, f)
	}

	orderedGenerations := make(tsmGenerations, 0, len(generations))
	for _, g := range generations {
		orderedGenerations = append(orderedGenerations, g)
	}
	if !orderedGenerations.IsSorted() {
		sort.Sort(orderedGenerations)
	}

	c.lastFindGenerations = genTime
	c.lastGenerations = orderedGenerations

	return orderedGenerations
}

func (c *DefaultPlanner) acquire(groups []CompactionGroup) bool {
	c.mu.Lock()
	defer c.mu.Unlock()

	// See if the new files are already in use.
	for _, g := range groups {
		for _, f := range g {
			if _, ok := c.filesInUse[f]; ok {
				return false
			}
		}
	}

	// Mark all the new files in use.
	for _, g := range groups {
		for _, f := range g {
			c.filesInUse[f] = struct{}{}
		}
	}
	return true
}

// Release removes the file references in each compaction group, allowing new plans
// to use them.
func (c *DefaultPlanner) Release(groups []CompactionGroup) {
	c.mu.Lock()
	defer c.mu.Unlock()
	for _, g := range groups {
		for _, f := range g {
			delete(c.filesInUse, f)
		}
	}
}

// Compactor merges multiple TSM files into new files or
// writes a Cache into 1 or more TSM files.
type Compactor struct {
	Dir  string
	Size int

	FileStore interface {
		NextGeneration() int
	}

	mu                 sync.RWMutex
	snapshotsEnabled   bool
	compactionsEnabled bool

	// snapshotsInterrupt is the channel to signal that any in-progress snapshots should be aborted.
	snapshotsInterrupt chan struct{}
	// compactionsInterrupt is the channel to signal that any in-progress level compactions should be aborted.
	compactionsInterrupt chan struct{}

	files map[string]struct{}
}

// Open initializes the Compactor.
func (c *Compactor) Open() {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.snapshotsEnabled || c.compactionsEnabled {
		return
	}

	c.snapshotsEnabled = true
	c.compactionsEnabled = true
	c.snapshotsInterrupt = make(chan struct{})
	c.compactionsInterrupt = make(chan struct{})
	c.files = make(map[string]struct{})
}

// Close disables the Compactor.
func (c *Compactor) Close() {
	c.mu.Lock()
	defer c.mu.Unlock()
	if !(c.snapshotsEnabled || c.compactionsEnabled) {
		return
	}
	c.snapshotsEnabled = false
	c.compactionsEnabled = false
	if c.compactionsInterrupt != nil {
		close(c.compactionsInterrupt)
	}
	if c.snapshotsInterrupt != nil {
		close(c.snapshotsInterrupt)
	}
}

// DisableSnapshots disables the compactor from performing snapshots.
func (c *Compactor) DisableSnapshots() {
	c.mu.Lock()
	c.snapshotsEnabled = false
	if c.snapshotsInterrupt != nil {
		close(c.snapshotsInterrupt)
		c.snapshotsInterrupt = nil
	}
	c.mu.Unlock()
}

// EnableSnapshots allows the compactor to perform snapshots.
func (c *Compactor) EnableSnapshots() {
	c.mu.Lock()
	c.snapshotsEnabled = true
	if c.snapshotsInterrupt == nil {
		c.snapshotsInterrupt = make(chan struct{})
	}
	c.mu.Unlock()
}

// DisableCompactions disables the compactor from performing compactions.
func (c *Compactor) DisableCompactions() {
	c.mu.Lock()
	c.compactionsEnabled = false
	if c.compactionsInterrupt != nil {
		close(c.compactionsInterrupt)
		c.compactionsInterrupt = nil
	}
	c.mu.Unlock()
}
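// interruptExample is a hedged sketch, not engine code, of the interrupt
// pattern used throughout this file: Disable* closes a channel, and
// long-running work polls it with a non-blocking select so it can abort
// early. The loop body and bound are placeholders.
func interruptExample(interrupt chan struct{}) error {
	for i := 0; i < 1000; i++ {
		select {
		case <-interrupt:
			return errCompactionAborted
		default:
		}
		// ... perform one unit of compaction work ...
	}
	return nil
}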
// EnableCompactions allows the compactor to perform compactions.
func (c *Compactor) EnableCompactions() {
	c.mu.Lock()
	c.compactionsEnabled = true
	if c.compactionsInterrupt == nil {
		c.compactionsInterrupt = make(chan struct{})
	}
	c.mu.Unlock()
}

// WriteSnapshot writes a Cache snapshot to one or more new TSM files.
func (c *Compactor) WriteSnapshot(cache *Cache) ([]string, error) {
	c.mu.RLock()
	enabled := c.snapshotsEnabled
	intC := c.snapshotsInterrupt
	c.mu.RUnlock()

	if !enabled {
		return nil, errSnapshotsDisabled
	}

	iter := NewCacheKeyIterator(cache, tsdb.DefaultMaxPointsPerBlock, intC)
	files, err := c.writeNewFiles(c.FileStore.NextGeneration(), 0, iter)

	// See if we were disabled while writing a snapshot.
	c.mu.RLock()
	enabled = c.snapshotsEnabled
	c.mu.RUnlock()

	if !enabled {
		return nil, errSnapshotsDisabled
	}

	return files, err
}

// compact writes multiple smaller TSM files into 1 or more larger files.
func (c *Compactor) compact(fast bool, tsmFiles []string) ([]string, error) {
	size := c.Size
	if size <= 0 {
		size = tsdb.DefaultMaxPointsPerBlock
	}

	// The new compacted files need to be added to the max generation in the
	// set. We need to find that max generation as well as the max sequence
	// number to ensure we write to the next unique location.
	var maxGeneration, maxSequence int
	for _, f := range tsmFiles {
		gen, seq, err := ParseTSMFileName(f)
		if err != nil {
			return nil, err
		}

		if gen > maxGeneration {
			maxGeneration = gen
			maxSequence = seq
		}

		if gen == maxGeneration && seq > maxSequence {
			maxSequence = seq
		}
	}

	// For each TSM file, create a TSM reader.
	var trs []*TSMReader
	for _, file := range tsmFiles {
		f, err := os.Open(file)
		if err != nil {
			return nil, err
		}

		tr, err := NewTSMReader(f)
		if err != nil {
			return nil, err
		}
		defer tr.Close()
		trs = append(trs, tr)
	}

	if len(trs) == 0 {
		return nil, nil
	}

	c.mu.RLock()
	intC := c.compactionsInterrupt
	c.mu.RUnlock()

	tsm, err := NewTSMKeyIterator(size, fast, intC, trs...)
	if err != nil {
		return nil, err
	}

	return c.writeNewFiles(maxGeneration, maxSequence, tsm)
}

// CompactFull writes multiple smaller TSM files into 1 or more larger files.
func (c *Compactor) CompactFull(tsmFiles []string) ([]string, error) {
	c.mu.RLock()
	enabled := c.compactionsEnabled
	c.mu.RUnlock()

	if !enabled {
		return nil, errCompactionsDisabled
	}

	if !c.add(tsmFiles) {
		return nil, errCompactionInProgress{}
	}
	defer c.remove(tsmFiles)

	files, err := c.compact(false, tsmFiles)

	// See if we were disabled while compacting.
	c.mu.RLock()
	enabled = c.compactionsEnabled
	c.mu.RUnlock()

	if !enabled {
		if err := c.removeTmpFiles(files); err != nil {
			return nil, err
		}
		return nil, errCompactionsDisabled
	}

	return files, err
}

// CompactFast writes multiple smaller TSM files into 1 or more larger files.
func (c *Compactor) CompactFast(tsmFiles []string) ([]string, error) {
	c.mu.RLock()
	enabled := c.compactionsEnabled
	c.mu.RUnlock()

	if !enabled {
		return nil, errCompactionsDisabled
	}

	if !c.add(tsmFiles) {
		return nil, errCompactionInProgress{}
	}
	defer c.remove(tsmFiles)

	files, err := c.compact(true, tsmFiles)

	// See if we were disabled while compacting.
	c.mu.RLock()
	enabled = c.compactionsEnabled
	c.mu.RUnlock()

	if !enabled {
		if err := c.removeTmpFiles(files); err != nil {
			return nil, err
		}
		return nil, errCompactionsDisabled
	}

	return files, err
}
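// writeSnapshotExample is a minimal sketch, not engine code, of the snapshot
// path: the compactor must be opened before use, and the returned paths are
// ".tsm.tmp" files that, per the comment in writeNewFiles, are renamed once
// fully completed (that step happens outside this file).
func writeSnapshotExample(c *Compactor, cache *Cache) ([]string, error) {
	c.Open()
	defer c.Close()
	return c.WriteSnapshot(cache)
}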
// removeTmpFiles is responsible for cleaning up a compaction that
// was started, but then abandoned before the temporary files were dealt with.
func (c *Compactor) removeTmpFiles(files []string) error {
	for _, f := range files {
		if err := os.Remove(f); err != nil {
			return fmt.Errorf("error removing temp compaction file: %v", err)
		}
	}
	return nil
}

// writeNewFiles writes from the iterator into new TSM files, rotating
// to a new file once it has reached the max TSM file size.
func (c *Compactor) writeNewFiles(generation, sequence int, iter KeyIterator) ([]string, error) {
	// These are the new TSM files written.
	var files []string

	for {
		sequence++
		// New TSM files are written to a temp file and renamed when fully completed.
		fileName := filepath.Join(c.Dir, fmt.Sprintf("%09d-%09d.%s.tmp", generation, sequence, TSMFileExtension))

		// Write as much as possible to this file.
		err := c.write(fileName, iter)

		// We've hit the max file limit and there is more to write. Create a new file
		// and continue.
		if err == errMaxFileExceeded || err == ErrMaxBlocksExceeded {
			files = append(files, fileName)
			continue
		} else if err == ErrNoValues {
			// If the file only contained tombstoned entries, then it would be a 0 length
			// file that we can drop.
			if err := os.RemoveAll(fileName); err != nil {
				return nil, err
			}
			break
		} else if _, ok := err.(errCompactionInProgress); ok {
			// Don't clean up the file as another compaction is using it. This should not happen as the
			// planner keeps track of which files are assigned to compaction plans now.
			return nil, err
		} else if err != nil {
			// Remove any tmp files we already completed.
			for _, f := range files {
				if err := os.RemoveAll(f); err != nil {
					return nil, err
				}
			}
			// We hit an error and didn't finish the compaction. Remove the temp file and abort.
			if err := os.RemoveAll(fileName); err != nil {
				return nil, err
			}
			return nil, err
		}

		files = append(files, fileName)
		break
	}

	return files, nil
}

func (c *Compactor) write(path string, iter KeyIterator) (err error) {
	fd, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR|os.O_EXCL, 0666)
	if err != nil {
		return errCompactionInProgress{err: err}
	}

	// Create the writer for the new TSM file.
	w, err := NewTSMWriter(fd)
	if err != nil {
		return err
	}
	defer func() {
		closeErr := w.Close()
		if err == nil {
			err = closeErr
		}
	}()

	for iter.Next() {
		c.mu.RLock()
		enabled := c.snapshotsEnabled || c.compactionsEnabled
		c.mu.RUnlock()

		if !enabled {
			return errCompactionAborted
		}

		// Each call to read returns the next sorted key (or the prior one if there are
		// more values to write). The size of values will be less than or equal to our
		// chunk size (1000).
		key, minTime, maxTime, block, err := iter.Read()
		if err != nil {
			return err
		}

		// Write the key and value.
		if err := w.WriteBlock(key, minTime, maxTime, block); err == ErrMaxBlocksExceeded {
			if err := w.WriteIndex(); err != nil {
				return err
			}
			return err
		} else if err != nil {
			return err
		}

		// If we have a max file size configured and we're over it, close out the file
		// and return the error.
		if w.Size() > maxTSMFileSize {
			if err := w.WriteIndex(); err != nil {
				return err
			}
			return errMaxFileExceeded
		}
	}

	// We're all done. Close out the file.
	if err := w.WriteIndex(); err != nil {
		return err
	}
	return nil
}
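// tmpFileNameExample is an illustrative helper, not engine code, showing the
// name format writeNewFiles produces: generation and sequence are zero-padded
// to nine digits, so generation 5, sequence 3 yields "000000005-000000003.tsm.tmp".
func tmpFileNameExample(dir string, generation, sequence int) string {
	return filepath.Join(dir, fmt.Sprintf("%09d-%09d.%s.tmp", generation, sequence, TSMFileExtension))
}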
func (c *Compactor) add(files []string) bool {
	c.mu.Lock()
	defer c.mu.Unlock()

	// See if the new files are already in use.
	for _, f := range files {
		if _, ok := c.files[f]; ok {
			return false
		}
	}

	// Mark all the new files in use.
	for _, f := range files {
		c.files[f] = struct{}{}
	}
	return true
}

func (c *Compactor) remove(files []string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	for _, f := range files {
		delete(c.files, f)
	}
}

// KeyIterator allows iteration over a set of keys and values in sorted order.
type KeyIterator interface {
	// Next returns true if there are any values remaining in the iterator.
	Next() bool

	// Read returns the key, time range, and raw data for the next block,
	// or any error that occurred.
	Read() (key string, minTime int64, maxTime int64, data []byte, err error)

	// Close closes the iterator.
	Close() error
}

// tsmKeyIterator implements the KeyIterator for a set of TSMReaders. Iteration produces
// keys in sorted order and the values between the keys sorted and deduped. If any of
// the readers have associated tombstone entries, they are returned as part of iteration.
type tsmKeyIterator struct {
	// readers is the set of readers used to produce a sorted key run.
	readers []*TSMReader

	// values is the temporary buffer for each key that is returned by a reader.
	values map[string][]Value

	// pos is the current key position within the corresponding readers slice. A value of
	// pos[0] = 1 means reader[0] is currently at key 1 in its ordered index.
	pos []int

	// err is any error we received while iterating values.
	err error

	// fast indicates whether the iterator should choose a faster merging strategy over a more
	// optimally compressed one. If fast is true, multiple blocks will just be added as is
	// and not combined. In some cases, a slower path will need to be utilized even when
	// fast is true to prevent overlapping blocks of time for the same key.
	// If false, the blocks will be decoded and deduplicated (if needed) and
	// then chunked into the maximally sized blocks.
	fast bool

	// size is the maximum number of values to encode in a single block.
	size int

	// key is the current lowest key across all readers that has not been fully exhausted
	// of values.
	key string
	typ byte

	iterators []*BlockIterator
	blocks    blocks

	buf []blocks

	// merged*Values are decoded values that have been combined.
	mergedFloatValues   FloatValues
	mergedIntegerValues IntegerValues
	mergedBooleanValues BooleanValues
	mergedStringValues  StringValues

	// merged are encoded blocks that have been combined or used as is
	// without decode.
	merged    blocks
	interrupt chan struct{}
}

type block struct {
	key              string
	minTime, maxTime int64
	typ              byte
	b                []byte
	tombstones       []TimeRange

	// readMin, readMax are the timestamp range of values that have been
	// read and encoded from this block.
	readMin, readMax int64
}
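// drainKeyIteratorExample is a hedged sketch, not engine code, of the
// KeyIterator contract: Next advances to the next sorted key run, and Read
// returns the encoded block for it along with its time range.
func drainKeyIteratorExample(iter KeyIterator) error {
	defer iter.Close()
	for iter.Next() {
		key, minTime, maxTime, block, err := iter.Read()
		if err != nil {
			return err
		}
		fmt.Printf("key=%s range=[%d,%d] block=%d bytes\n", key, minTime, maxTime, len(block))
	}
	return nil
}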
func (b *block) overlapsTimeRange(min, max int64) bool {
	return b.minTime <= max && b.maxTime >= min
}

func (b *block) read() bool {
	return b.readMin <= b.minTime && b.readMax >= b.maxTime
}

func (b *block) markRead(min, max int64) {
	if min < b.readMin {
		b.readMin = min
	}

	if max > b.readMax {
		b.readMax = max
	}
}

func (b *block) partiallyRead() bool {
	return b.readMin != b.minTime || b.readMax != b.maxTime
}

type blocks []*block

func (a blocks) Len() int { return len(a) }

func (a blocks) Less(i, j int) bool {
	if a[i].key == a[j].key {
		return a[i].minTime < a[j].minTime
	}
	return a[i].key < a[j].key
}

func (a blocks) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

// NewTSMKeyIterator returns a new TSM key iterator from readers.
// size indicates the maximum number of values to encode in a single block.
func NewTSMKeyIterator(size int, fast bool, interrupt chan struct{}, readers ...*TSMReader) (KeyIterator, error) {
	var iter []*BlockIterator
	for _, r := range readers {
		iter = append(iter, r.BlockIterator())
	}

	return &tsmKeyIterator{
		readers:   readers,
		values:    map[string][]Value{},
		pos:       make([]int, len(readers)),
		size:      size,
		iterators: iter,
		fast:      fast,
		buf:       make([]blocks, len(iter)),
		interrupt: interrupt,
	}, nil
}

func (k *tsmKeyIterator) hasMergedValues() bool {
	return len(k.mergedFloatValues) > 0 ||
		len(k.mergedIntegerValues) > 0 ||
		len(k.mergedStringValues) > 0 ||
		len(k.mergedBooleanValues) > 0
}

// Next returns true if there are any values remaining in the iterator.
func (k *tsmKeyIterator) Next() bool {
	// Any merged blocks pending?
	if len(k.merged) > 0 {
		k.merged = k.merged[1:]
		if len(k.merged) > 0 {
			return true
		}
	}

	// Any merged values pending?
	if k.hasMergedValues() {
		k.merge()
		if len(k.merged) > 0 || k.hasMergedValues() {
			return true
		}
	}

	// If we still have blocks from the last read, merge them.
	if len(k.blocks) > 0 {
		k.merge()
		if len(k.merged) > 0 || k.hasMergedValues() {
			return true
		}
	}

	// Read the next block from each TSM iterator.
	for i, v := range k.buf {
		if v == nil {
			iter := k.iterators[i]
			if iter.Next() {
				key, minTime, maxTime, typ, _, b, err := iter.Read()
				if err != nil {
					k.err = err
				}

				// This block may have ranges of time removed from it that would
				// reduce the block min and max time.
				tombstones := iter.r.TombstoneRange(key)
				k.buf[i] = append(k.buf[i], &block{
					minTime:    minTime,
					maxTime:    maxTime,
					key:        key,
					typ:        typ,
					b:          b,
					tombstones: tombstones,
					readMin:    math.MaxInt64,
					readMax:    math.MinInt64,
				})

				blockKey := key
				for iter.PeekNext() == blockKey {
					iter.Next()
					key, minTime, maxTime, typ, _, b, err := iter.Read()
					if err != nil {
						k.err = err
					}

					tombstones := iter.r.TombstoneRange(key)

					k.buf[i] = append(k.buf[i], &block{
						minTime:    minTime,
						maxTime:    maxTime,
						key:        key,
						typ:        typ,
						b:          b,
						tombstones: tombstones,
						readMin:    math.MaxInt64,
						readMax:    math.MinInt64,
					})
				}
			}
		}
	}

	// Each reader could have a different key that it's currently at; we need to find
	// the next smallest one to keep the sort ordering.
	var minKey string
	var minType byte
	for _, b := range k.buf {
		// The buffer could be empty if the iterator has been exhausted for that file.
		if len(b) == 0 {
			continue
		}
		if minKey == "" || b[0].key < minKey {
			minKey = b[0].key
			minType = b[0].typ
		}
	}
	k.key = minKey
	k.typ = minType

	// Now we need to find all blocks that match the min key so we can combine and dedupe
	// the blocks if necessary.
	for i, b := range k.buf {
		if len(b) == 0 {
			continue
		}
		if b[0].key == k.key {
			k.blocks = append(k.blocks, b...)
			k.buf[i] = nil
		}
	}

	if len(k.blocks) == 0 {
		return false
	}

	k.merge()

	return len(k.merged) > 0
}
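// tsmIterationExample is a minimal sketch, not engine code, of wiring TSM
// readers into a merge iterator, mirroring what Compactor.compact does. The
// paths are supplied by the caller; fast=false chooses the slower,
// deduplicating merge, and a nil interrupt channel means the iteration
// cannot be aborted.
func tsmIterationExample(paths []string) (KeyIterator, error) {
	var readers []*TSMReader
	for _, p := range paths {
		f, err := os.Open(p)
		if err != nil {
			return nil, err
		}
		r, err := NewTSMReader(f)
		if err != nil {
			return nil, err
		}
		readers = append(readers, r)
	}
	return NewTSMKeyIterator(tsdb.DefaultMaxPointsPerBlock, false, nil, readers...)
}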
// merge combines the next set of blocks into merged blocks.
func (k *tsmKeyIterator) merge() {
	switch k.typ {
	case BlockFloat64:
		k.mergeFloat()
	case BlockInteger:
		k.mergeInteger()
	case BlockBoolean:
		k.mergeBoolean()
	case BlockString:
		k.mergeString()
	default:
		k.err = fmt.Errorf("unknown block type: %v", k.typ)
	}
}

func (k *tsmKeyIterator) Read() (string, int64, int64, []byte, error) {
	// See if compactions were disabled while we were running.
	select {
	case <-k.interrupt:
		return "", 0, 0, nil, errCompactionAborted
	default:
	}

	if len(k.merged) == 0 {
		return "", 0, 0, nil, k.err
	}

	block := k.merged[0]
	return block.key, block.minTime, block.maxTime, block.b, k.err
}

func (k *tsmKeyIterator) Close() error {
	k.values = nil
	k.pos = nil
	k.iterators = nil
	for _, r := range k.readers {
		if err := r.Close(); err != nil {
			return err
		}
	}
	return nil
}

type cacheKeyIterator struct {
	cache *Cache
	size  int
	order []string

	i         int
	blocks    [][]cacheBlock
	ready     []chan struct{}
	interrupt chan struct{}
}

type cacheBlock struct {
	k                string
	minTime, maxTime int64
	b                []byte
	err              error
}

// NewCacheKeyIterator returns a new KeyIterator from a Cache.
func NewCacheKeyIterator(cache *Cache, size int, interrupt chan struct{}) KeyIterator {
	keys := cache.Keys()

	chans := make([]chan struct{}, len(keys))
	for i := 0; i < len(keys); i++ {
		chans[i] = make(chan struct{}, 1)
	}

	cki := &cacheKeyIterator{
		i:         -1,
		size:      size,
		cache:     cache,
		order:     keys,
		ready:     chans,
		blocks:    make([][]cacheBlock, len(keys)),
		interrupt: interrupt,
	}
	go cki.encode()
	return cki
}

func (c *cacheKeyIterator) encode() {
	concurrency := runtime.GOMAXPROCS(0)
	n := len(c.ready)

	// Divide the keyset across each CPU.
	chunkSize := 128
	idx := uint64(0)

	for i := 0; i < concurrency; i++ {
		// Run one goroutine per CPU and encode a section of the key space concurrently.
		go func() {
			for {
				start := int(atomic.AddUint64(&idx, uint64(chunkSize))) - chunkSize
				if start >= n {
					break
				}
				end := start + chunkSize
				if end > n {
					end = n
				}
				c.encodeRange(start, end)
			}
		}()
	}
}

func (c *cacheKeyIterator) encodeRange(start, stop int) {
	for i := start; i < stop; i++ {
		key := c.order[i]
		values := c.cache.values(key)

		for len(values) > 0 {
			minTime, maxTime := values[0].UnixNano(), values[len(values)-1].UnixNano()
			var b []byte
			var err error
			if len(values) > c.size {
				maxTime = values[c.size-1].UnixNano()
				b, err = Values(values[:c.size]).Encode(nil)
				values = values[c.size:]
			} else {
				b, err = Values(values).Encode(nil)
				values = values[:0]
			}

			c.blocks[i] = append(c.blocks[i], cacheBlock{
				k:       key,
				minTime: minTime,
				maxTime: maxTime,
				b:       b,
				err:     err,
			})
		}
		// Notify that this key is fully encoded.
		c.ready[i] <- struct{}{}
	}
}

func (c *cacheKeyIterator) Next() bool {
	if c.i >= 0 && c.i < len(c.ready) && len(c.blocks[c.i]) > 0 {
		c.blocks[c.i] = c.blocks[c.i][1:]
		if len(c.blocks[c.i]) > 0 {
			return true
		}
	}
	c.i++

	if c.i >= len(c.ready) {
		return false
	}

	<-c.ready[c.i]
	return true
}

func (c *cacheKeyIterator) Read() (string, int64, int64, []byte, error) {
	// See if snapshot compactions were disabled while we were running.
	select {
	case <-c.interrupt:
		return "", 0, 0, nil, errCompactionAborted
	default:
	}

	blk := c.blocks[c.i][0]
	return blk.k, blk.minTime, blk.maxTime, blk.b, blk.err
}
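// cacheIterationExample is a hedged sketch, not engine code, of the snapshot
// encoding path: NewCacheKeyIterator starts background goroutines that encode
// each key's values into blocks, and the iterator hands them out in key order
// (drainKeyIteratorExample is the sketch defined earlier in this file).
func cacheIterationExample(cache *Cache) error {
	iter := NewCacheKeyIterator(cache, tsdb.DefaultMaxPointsPerBlock, nil)
	return drainKeyIteratorExample(iter)
}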
func (c *cacheKeyIterator) Close() error {
	return nil
}

type tsmGenerations []*tsmGeneration

func (a tsmGenerations) Len() int           { return len(a) }
func (a tsmGenerations) Less(i, j int) bool { return a[i].id < a[j].id }
func (a tsmGenerations) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }

func (a tsmGenerations) hasTombstones() bool {
	for _, g := range a {
		if g.hasTombstones() {
			return true
		}
	}
	return false
}

func (a tsmGenerations) level() int {
	var level int
	for _, g := range a {
		lev := g.level()
		if lev > level {
			level = lev
		}
	}
	return level
}

func (a tsmGenerations) chunk(size int) []tsmGenerations {
	var chunks []tsmGenerations
	for len(a) > 0 {
		if len(a) >= size {
			chunks = append(chunks, a[:size])
			a = a[size:]
		} else {
			chunks = append(chunks, a)
			a = a[len(a):]
		}
	}
	return chunks
}

func (a tsmGenerations) IsSorted() bool {
	if len(a) <= 1 {
		return true
	}
	for i := 1; i < len(a); i++ {
		if a.Less(i, i-1) {
			return false
		}
	}
	return true
}
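// chunkExample is an illustrative sketch, not engine code, of tsmGenerations.chunk:
// nine generations chunked by four yield groups of sizes 4, 4, and 1, which is
// how PlanLevel bounds the size of each compaction group.
func chunkExample(gens tsmGenerations) {
	for i, c := range gens.chunk(4) {
		fmt.Printf("chunk %d has %d generations\n", i, len(c))
	}
}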