diff --git a/aggregator/aggregator.go b/aggregator/aggregator.go
deleted file mode 100644
index 8849a451a..000000000
--- a/aggregator/aggregator.go
+++ /dev/null
@@ -1,3302 +0,0 @@
-/*
-   Copyright 2022 Erigon contributors
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-*/
-
-package aggregator
-
-import (
-	"bufio"
-	"bytes"
-	"container/heap"
-	"context"
-	"encoding/binary"
-	"errors"
-	"fmt"
-	"hash"
-	"io"
-	"io/fs"
-	"math"
-	"os"
-	"path"
-	"path/filepath"
-	"regexp"
-	"strconv"
-	"strings"
-	"sync"
-	"sync/atomic"
-	"time"
-
-	"github.com/RoaringBitmap/roaring/roaring64"
-	"github.com/google/btree"
-	"github.com/ledgerwatch/log/v3"
-	"github.com/spaolacci/murmur3"
-	"golang.org/x/crypto/sha3"
-	"golang.org/x/exp/slices"
-
-	"github.com/ledgerwatch/erigon-lib/common"
-	"github.com/ledgerwatch/erigon-lib/etl"
-
-	"github.com/ledgerwatch/erigon-lib/commitment"
-	"github.com/ledgerwatch/erigon-lib/common/length"
-	"github.com/ledgerwatch/erigon-lib/compress"
-	"github.com/ledgerwatch/erigon-lib/kv"
-	"github.com/ledgerwatch/erigon-lib/recsplit"
-	"github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32"
-)
-
-// Aggregator of multiple state files to support state reader and state writer
-// The convention for the file names is as follows
-// State is composed of three types of files:
-// 1. Accounts. Keys are addresses (20 bytes), values are encodings of accounts
-// 2. Contract storage. Keys are concatenations of addresses (20 bytes) and storage locations (32 bytes), values have their leading zeroes removed
-// 3. Contract codes. Keys are addresses (20 bytes), values are bytecodes
-// Within each type, any file can cover an interval of block numbers, for example, `accounts.1-16` represents changes in accounts
-// that were effected by the blocks from 1 to 16, inclusively. The second component of the interval will be called "end block" for the file.
-// Finally, for each type and interval, there are two files - one with the compressed data (extension `dat`),
-// and another with the index (extension `idx`) consisting of the minimal perfect hash table mapping keys to the offsets of corresponding keys
-// in the data file
-// Aggregator consists (apart from the files it is aggregating) of 4 parts:
-// 1. Persistent table of expiration time for each of the files. Key - name of the file, value - timestamp at which the file can be removed
-// 2. Transient (in-memory) mapping of the "end block" of each file to the objects required for accessing the file (compress.Decompressor and recsplit.Index)
-// 3. Persistent tables (one for accounts, one for contract storage, and one for contract code) summarising all the 1-block state diff files
-// that were not yet merged together to form larger files. In these tables, keys are the same as keys in the state diff files, but values are also
-// augmented by the number of state diff files this key is present in. This number gets decremented every time a 1-block state diff file is removed
-// from the summary table (due to being merged). 
And when this number gets to 0, the record is deleted from the summary table. -// This number is encoded into first 4 bytes of the value -// 4. Aggregating persistent hash table. Maps state keys to the block numbers for the use in the part 2 (which is not necessarily the block number where -// the item last changed, but it is guaranteed to find correct element in the Transient mapping of part 2 - -type FileType int - -const ( - Account FileType = iota - Storage - Code - Commitment - AccountHistory - StorageHistory - CodeHistory - AccountBitmap - StorageBitmap - CodeBitmap - NumberOfTypes -) - -const ( - FirstType = Account - NumberOfAccountStorageTypes = Code - NumberOfStateTypes = AccountHistory -) - -func (ft FileType) String() string { - switch ft { - case Account: - return "account" - case Storage: - return "storage" - case Code: - return "code" - case Commitment: - return "commitment" - case AccountHistory: - return "ahistory" - case CodeHistory: - return "chistory" - case StorageHistory: - return "shistory" - case AccountBitmap: - return "abitmap" - case CodeBitmap: - return "cbitmap" - case StorageBitmap: - return "sbitmap" - default: - panic(fmt.Sprintf("unknown file type: %d", ft)) - } -} - -func (ft FileType) Table() string { - switch ft { - case Account: - return kv.StateAccounts - case Storage: - return kv.StateStorage - case Code: - return kv.StateCode - case Commitment: - return kv.StateCommitment - default: - panic(fmt.Sprintf("unknown file type: %d", ft)) - } -} - -func ParseFileType(s string) (FileType, bool) { - switch s { - case "account": - return Account, true - case "storage": - return Storage, true - case "code": - return Code, true - case "commitment": - return Commitment, true - case "ahistory": - return AccountHistory, true - case "chistory": - return CodeHistory, true - case "shistory": - return StorageHistory, true - case "abitmap": - return AccountBitmap, true - case "cbitmap": - return CodeBitmap, true - case "sbitmap": - return StorageBitmap, true - default: - return NumberOfTypes, false - } -} - -type Aggregator struct { - files [NumberOfTypes]*btree.BTree - hph commitment.Trie //*commitment.HexPatriciaHashed - archHasher murmur3.Hash128 - keccak hash.Hash - historyChannel chan struct{} - mergeChannel chan struct{} - tracedKeys map[string]struct{} // Set of keys being traced during aggregations - changesBtree *btree.BTree // btree of ChangesItem - historyError chan error - mergeError chan error - aggChannel chan *AggregationTask - aggError chan error - diffDir string // Directory where the state diff files are stored - arches [NumberOfStateTypes][]uint32 // Over-arching hash tables containing the block number of last aggregation - historyWg sync.WaitGroup - aggWg sync.WaitGroup - mergeWg sync.WaitGroup - unwindLimit uint64 // How far the chain may unwind - aggregationStep uint64 // How many items (block, but later perhaps txs or changes) are required to form one state diff file - fileHits uint64 // Counter for state file hit ratio - fileMisses uint64 // Counter for state file hit ratio - fileLocks [NumberOfTypes]sync.RWMutex - commitments bool // Whether to calculate commitments - changesets bool // Whether to generate changesets (off by default) - trace bool // Turns on tracing for specific accounts and locations -} - -type ChangeFile struct { - r *bufio.Reader - rTx *bufio.Reader - w *bufio.Writer - fileTx *os.File - wTx *bufio.Writer - file *os.File - pathTx string - path string - dir string - namebase string - words []byte // Words pending for the next 
block record, in the same slice - wordOffsets []int // Offsets of words in the `words` slice - step uint64 - txNum uint64 // Currently read transaction number - txRemaining uint64 // Remaining number of bytes to read in the current transaction -} - -func (cf *ChangeFile) closeFile() error { - if len(cf.wordOffsets) > 0 { - return fmt.Errorf("closeFile without finish") - } - if cf.w != nil { - if err := cf.w.Flush(); err != nil { - return err - } - cf.w = nil - } - if cf.file != nil { - if err := cf.file.Close(); err != nil { - return err - } - cf.file = nil - } - if cf.wTx != nil { - if err := cf.wTx.Flush(); err != nil { - return err - } - cf.wTx = nil - } - if cf.fileTx != nil { - if err := cf.fileTx.Close(); err != nil { - return err - } - cf.fileTx = nil - } - return nil -} - -func (cf *ChangeFile) openFile(blockNum uint64, write bool) error { - if len(cf.wordOffsets) > 0 { - return fmt.Errorf("openFile without finish") - } - rem := blockNum % cf.step - startBlock := blockNum - rem - endBlock := startBlock + cf.step - 1 - if cf.w == nil { - cf.path = filepath.Join(cf.dir, fmt.Sprintf("%s.%d-%d.chg", cf.namebase, startBlock, endBlock)) - cf.pathTx = filepath.Join(cf.dir, fmt.Sprintf("%s.%d-%d.ctx", cf.namebase, startBlock, endBlock)) - var err error - if write { - if cf.file, err = os.OpenFile(cf.path, os.O_RDWR|os.O_CREATE, 0755); err != nil { - return err - } - if cf.fileTx, err = os.OpenFile(cf.pathTx, os.O_RDWR|os.O_CREATE, 0755); err != nil { - return err - } - if _, err = cf.file.Seek(0, 2 /* relative to the end of the file */); err != nil { - return err - } - if _, err = cf.fileTx.Seek(0, 2 /* relative to the end of the file */); err != nil { - return err - } - } else { - if cf.file, err = os.Open(cf.path); err != nil { - return err - } - if cf.fileTx, err = os.Open(cf.pathTx); err != nil { - return err - } - } - if write { - cf.w = bufio.NewWriter(cf.file) - cf.wTx = bufio.NewWriter(cf.fileTx) - } - cf.r = bufio.NewReader(cf.file) - cf.rTx = bufio.NewReader(cf.fileTx) - } - return nil -} - -func (cf *ChangeFile) rewind() error { - var err error - if _, err = cf.file.Seek(0, 0); err != nil { - return err - } - cf.r = bufio.NewReader(cf.file) - if _, err = cf.fileTx.Seek(0, 0); err != nil { - return err - } - cf.rTx = bufio.NewReader(cf.fileTx) - return nil -} - -func (cf *ChangeFile) add(word []byte) { - cf.words = append(cf.words, word...) 
- cf.wordOffsets = append(cf.wordOffsets, len(cf.words)) -} - -func (cf *ChangeFile) finish(txNum uint64) error { - var numBuf [10]byte - // Write out words - lastOffset := 0 - var size uint64 - for _, offset := range cf.wordOffsets { - word := cf.words[lastOffset:offset] - n := binary.PutUvarint(numBuf[:], uint64(len(word))) - if _, err := cf.w.Write(numBuf[:n]); err != nil { - return err - } - if len(word) > 0 { - if _, err := cf.w.Write(word); err != nil { - return err - } - } - size += uint64(n + len(word)) - lastOffset = offset - } - cf.words = cf.words[:0] - cf.wordOffsets = cf.wordOffsets[:0] - n := binary.PutUvarint(numBuf[:], txNum) - if _, err := cf.wTx.Write(numBuf[:n]); err != nil { - return err - } - n = binary.PutUvarint(numBuf[:], size) - if _, err := cf.wTx.Write(numBuf[:n]); err != nil { - return err - } - return nil -} - -// prevTx positions the reader to the beginning -// of the transaction -func (cf *ChangeFile) nextTx() (bool, error) { - var err error - if cf.txNum, err = binary.ReadUvarint(cf.rTx); err != nil { - if errors.Is(err, io.EOF) { - return false, nil - } - return false, err - } - if cf.txRemaining, err = binary.ReadUvarint(cf.rTx); err != nil { - return false, err - } - return true, nil -} - -func (cf *ChangeFile) nextWord(wordBuf []byte) ([]byte, bool, error) { - if cf.txRemaining == 0 { - return wordBuf, false, nil - } - ws, err := binary.ReadUvarint(cf.r) - if err != nil { - return wordBuf, false, fmt.Errorf("word size: %w", err) - } - var buf []byte - if total := len(wordBuf) + int(ws); cap(wordBuf) >= total { - buf = wordBuf[:total] // Reuse the space in wordBuf, is it has enough capacity - } else { - buf = make([]byte, total) - copy(buf, wordBuf) - } - if _, err = io.ReadFull(cf.r, buf[len(wordBuf):]); err != nil { - return wordBuf, false, fmt.Errorf("read word (%d %d): %w", ws, len(buf[len(wordBuf):]), err) - } - var numBuf [10]byte - n := binary.PutUvarint(numBuf[:], ws) - cf.txRemaining -= uint64(n) + ws - return buf, true, nil -} - -func (cf *ChangeFile) deleteFile() error { - if err := os.Remove(cf.path); err != nil { - return err - } - if err := os.Remove(cf.pathTx); err != nil { - return err - } - return nil -} - -type Changes struct { - namebase string - dir string - keys ChangeFile - before ChangeFile - after ChangeFile - step uint64 - beforeOn bool -} - -func (c *Changes) Init(namebase string, step uint64, dir string, beforeOn bool) { - c.namebase = namebase - c.step = step - c.dir = dir - c.keys.namebase = namebase + ".keys" - c.keys.dir = dir - c.keys.step = step - c.before.namebase = namebase + ".before" - c.before.dir = dir - c.before.step = step - c.after.namebase = namebase + ".after" - c.after.dir = dir - c.after.step = step - c.beforeOn = beforeOn -} - -func (c *Changes) closeFiles() error { - if err := c.keys.closeFile(); err != nil { - return err - } - if c.beforeOn { - if err := c.before.closeFile(); err != nil { - return err - } - } - if err := c.after.closeFile(); err != nil { - return err - } - return nil -} - -func (c *Changes) openFiles(blockNum uint64, write bool) error { - if err := c.keys.openFile(blockNum, write); err != nil { - return err - } - if c.beforeOn { - if err := c.before.openFile(blockNum, write); err != nil { - return err - } - } - if err := c.after.openFile(blockNum, write); err != nil { - return err - } - return nil -} - -func (c *Changes) insert(key, after []byte) { - c.keys.add(key) - if c.beforeOn { - c.before.add(nil) - } - c.after.add(after) -} - -func (c *Changes) update(key, before, after []byte) { - 
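
// Editor's note: an illustrative, self-contained sketch (not part of the original file) of the
// on-disk layout that ChangeFile.finish writes: the .chg stream holds uvarint-length-prefixed
// words, and the companion .ctx stream holds the pair (txNum, total byte size of that tx's
// words), both as uvarints. Only the layout mirrors the code; names below are hypothetical.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// appendTxRecord encodes one transaction's words the way finish() does.
func appendTxRecord(chg, ctx []byte, txNum uint64, words [][]byte) ([]byte, []byte) {
	var numBuf [binary.MaxVarintLen64]byte
	var size uint64
	for _, w := range words {
		n := binary.PutUvarint(numBuf[:], uint64(len(w)))
		chg = append(chg, numBuf[:n]...)
		chg = append(chg, w...)
		size += uint64(n + len(w))
	}
	n := binary.PutUvarint(numBuf[:], txNum)
	ctx = append(ctx, numBuf[:n]...)
	n = binary.PutUvarint(numBuf[:], size)
	ctx = append(ctx, numBuf[:n]...)
	return chg, ctx
}

func main() {
	chg, ctx := appendTxRecord(nil, nil, 42, [][]byte{[]byte("key1"), []byte("value1")})
	// Reading back mirrors nextTx/nextWord: txNum and size come from .ctx,
	// then uvarint-prefixed words are read from .chg until size bytes are consumed.
	r := bytes.NewReader(ctx)
	txNum, _ := binary.ReadUvarint(r)
	size, _ := binary.ReadUvarint(r)
	fmt.Println(txNum, size, len(chg)) // 42 12 12
}
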
c.keys.add(key) - if c.beforeOn { - c.before.add(before) - } - c.after.add(after) -} - -func (c *Changes) delete(key, before []byte) { - c.keys.add(key) - if c.beforeOn { - c.before.add(before) - } - c.after.add(nil) -} - -func (c *Changes) finish(txNum uint64) error { - if err := c.keys.finish(txNum); err != nil { - return err - } - if c.beforeOn { - if err := c.before.finish(txNum); err != nil { - return err - } - } - if err := c.after.finish(txNum); err != nil { - return err - } - return nil -} - -func (c *Changes) nextTx() (bool, uint64, error) { - bkeys, err := c.keys.nextTx() - if err != nil { - return false, 0, err - } - var bbefore, bafter bool - if c.beforeOn { - if bbefore, err = c.before.nextTx(); err != nil { - return false, 0, err - } - } - if bafter, err = c.after.nextTx(); err != nil { - return false, 0, err - } - if c.beforeOn && bkeys != bbefore { - return false, 0, fmt.Errorf("inconsistent tx iteration") - } - if bkeys != bafter { - return false, 0, fmt.Errorf("inconsistent tx iteration") - } - txNum := c.keys.txNum - if c.beforeOn { - if txNum != c.before.txNum { - return false, 0, fmt.Errorf("inconsistent txNum, keys: %d, before: %d", txNum, c.before.txNum) - } - } - if txNum != c.after.txNum { - return false, 0, fmt.Errorf("inconsistent txNum, keys: %d, after: %d", txNum, c.after.txNum) - } - return bkeys, txNum, nil -} - -func (c *Changes) rewind() error { - if err := c.keys.rewind(); err != nil { - return err - } - if c.beforeOn { - if err := c.before.rewind(); err != nil { - return err - } - } - if err := c.after.rewind(); err != nil { - return err - } - return nil -} - -func (c *Changes) nextTriple(keyBuf, beforeBuf, afterBuf []byte) ([]byte, []byte, []byte, bool, error) { - key, bkeys, err := c.keys.nextWord(keyBuf) - if err != nil { - return keyBuf, beforeBuf, afterBuf, false, fmt.Errorf("next key: %w", err) - } - var before, after []byte - var bbefore, bafter bool - if c.beforeOn { - if before, bbefore, err = c.before.nextWord(beforeBuf); err != nil { - return keyBuf, beforeBuf, afterBuf, false, fmt.Errorf("next before: %w", err) - } - } - if c.beforeOn && bkeys != bbefore { - return keyBuf, beforeBuf, afterBuf, false, fmt.Errorf("inconsistent word iteration") - } - if after, bafter, err = c.after.nextWord(afterBuf); err != nil { - return keyBuf, beforeBuf, afterBuf, false, fmt.Errorf("next after: %w", err) - } - if bkeys != bafter { - return keyBuf, beforeBuf, afterBuf, false, fmt.Errorf("inconsistent word iteration") - } - return key, before, after, bkeys, nil -} - -func (c *Changes) deleteFiles() error { - if err := c.keys.deleteFile(); err != nil { - return err - } - if c.beforeOn { - if err := c.before.deleteFile(); err != nil { - return err - } - } - if err := c.after.deleteFile(); err != nil { - return err - } - return nil -} - -func buildIndex(d *compress.Decompressor, idxPath, tmpDir string, count int) (*recsplit.Index, error) { - var rs *recsplit.RecSplit - var err error - if rs, err = recsplit.NewRecSplit(recsplit.RecSplitArgs{ - KeyCount: count, - Enums: false, - BucketSize: 2000, - LeafSize: 8, - TmpDir: tmpDir, - IndexFile: idxPath, - EtlBufLimit: etl.BufferOptimalSize / 2, - }); err != nil { - return nil, err - } - defer rs.Close() - rs.LogLvl(log.LvlDebug) - - word := make([]byte, 0, 256) - var pos uint64 - g := d.MakeGetter() - for { - g.Reset(0) - for g.HasNext() { - word, _ = g.Next(word[:0]) - if err = rs.AddKey(word, pos); err != nil { - return nil, err - } - // Skip value - pos = g.Skip() - } - if err = rs.Build(); err != nil { - if 
rs.Collision() { - log.Info("Building recsplit. Collision happened. It's ok. Restarting...") - rs.ResetNextSalt() - } else { - return nil, err - } - } else { - break - } - } - var idx *recsplit.Index - if idx, err = recsplit.OpenIndex(idxPath); err != nil { - return nil, err - } - return idx, nil -} - -// aggregate gathers changes from the changefiles into a B-tree, and "removes" them from the database -// This function is time-critical because it needs to be run in the same go-routine (thread) as the general -// execution (due to read-write tx). After that, we can optimistically execute the rest in the background -func (c *Changes) aggregate(blockFrom, blockTo uint64, prefixLen int, tx kv.RwTx, table string, commitMerger commitmentMerger) (*btree.BTreeG[*AggregateItem], error) { - if err := c.openFiles(blockTo, false /* write */); err != nil { - return nil, fmt.Errorf("open files: %w", err) - } - bt := btree.NewG[*AggregateItem](32, AggregateItemLess) - err := c.aggregateToBtree(bt, prefixLen, commitMerger) - if err != nil { - return nil, fmt.Errorf("aggregateToBtree: %w", err) - } - // Clean up the DB table - var e error - bt.Ascend(func(item *AggregateItem) bool { - if item.count == 0 { - return true - } - dbPrefix := item.k - prevV, err := tx.GetOne(table, dbPrefix) - if err != nil { - e = err - return false - } - if prevV == nil { - e = fmt.Errorf("record not found in db for %s key %x", table, dbPrefix) - return false - } - - prevNum := binary.BigEndian.Uint32(prevV[:4]) - if prevNum < item.count { - e = fmt.Errorf("record count too low for %s key %s count %d, subtracting %d", table, dbPrefix, prevNum, item.count) - return false - } - if prevNum == item.count { - if e = tx.Delete(table, dbPrefix); e != nil { - return false - } - } else { - v := make([]byte, len(prevV)) - binary.BigEndian.PutUint32(v[:4], prevNum-item.count) - copy(v[4:], prevV[4:]) - - if e = tx.Put(table, dbPrefix, v); e != nil { - return false - } - } - return true - }) - if e != nil { - return nil, fmt.Errorf("clean up table %s after aggregation: %w", table, e) - } - return bt, nil -} - -func (a *Aggregator) updateArch(bt *btree.BTreeG[*AggregateItem], fType FileType, blockNum32 uint32) { - arch := a.arches[fType] - h := a.archHasher - n := uint64(len(arch)) - if n == 0 { - return - } - bt.Ascend(func(item *AggregateItem) bool { - if item.count == 0 { - return true - } - h.Reset() - h.Write(item.k) //nolint:errcheck - p, _ := h.Sum128() - p = p % n - v := atomic.LoadUint32(&arch[p]) - if v < blockNum32 { - //fmt.Printf("Updated %s arch [%x]=%d %d\n", fType.String(), item.k, p, blockNum32) - atomic.StoreUint32(&arch[p], blockNum32) - } - return true - }) -} - -type AggregateItem struct { - k, v []byte - count uint32 -} - -func AggregateItemLess(a, than *AggregateItem) bool { return bytes.Compare(a.k, than.k) < 0 } -func (i *AggregateItem) Less(than btree.Item) bool { - return bytes.Compare(i.k, than.(*AggregateItem).k) < 0 -} - -func (c *Changes) produceChangeSets(blockFrom, blockTo uint64, historyType, bitmapType FileType) (*compress.Decompressor, *recsplit.Index, *compress.Decompressor, *recsplit.Index, error) { - chsetDatPath := filepath.Join(c.dir, fmt.Sprintf("%s.%d-%d.dat", historyType.String(), blockFrom, blockTo)) - chsetIdxPath := filepath.Join(c.dir, fmt.Sprintf("%s.%d-%d.idx", historyType.String(), blockFrom, blockTo)) - bitmapDatPath := filepath.Join(c.dir, fmt.Sprintf("%s.%d-%d.dat", bitmapType.String(), blockFrom, blockTo)) - bitmapIdxPath := filepath.Join(c.dir, fmt.Sprintf("%s.%d-%d.idx", 
bitmapType.String(), blockFrom, blockTo)) - var blockSuffix [8]byte - binary.BigEndian.PutUint64(blockSuffix[:], blockTo) - bitmaps := map[string]*roaring64.Bitmap{} - comp, err := compress.NewCompressor(context.Background(), AggregatorPrefix, chsetDatPath, c.dir, compress.MinPatternScore, 1, log.LvlDebug) - if err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets NewCompressor: %w", err) - } - defer func() { - if comp != nil { - comp.Close() - } - }() - var totalRecords int - var b bool - var e error - var txNum uint64 - var key, before, after []byte - if err = c.rewind(); err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets rewind: %w", err) - } - var txKey = make([]byte, 8, 60) - for b, txNum, e = c.nextTx(); b && e == nil; b, txNum, e = c.nextTx() { - binary.BigEndian.PutUint64(txKey[:8], txNum) - for key, before, after, b, e = c.nextTriple(key[:0], before[:0], after[:0]); b && e == nil; key, before, after, b, e = c.nextTriple(key[:0], before[:0], after[:0]) { - totalRecords++ - txKey = append(txKey[:8], key...) - // In the inital files and most merged file, the txKey is added to the file, but it gets removed in the final merge - if err = comp.AddUncompressedWord(txKey); err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets AddWord key: %w", err) - } - if err = comp.AddUncompressedWord(before); err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets AddWord before: %w", err) - } - //if historyType == AccountHistory { - // fmt.Printf("produce %s.%d-%d [%x]=>[%x]\n", historyType.String(), blockFrom, blockTo, txKey, before) - //} - var bitmap *roaring64.Bitmap - var ok bool - if bitmap, ok = bitmaps[string(key)]; !ok { - bitmap = roaring64.New() - bitmaps[string(key)] = bitmap - } - bitmap.Add(txNum) - } - if e != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets nextTriple: %w", e) - } - } - if e != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets prevTx: %w", e) - } - if err = comp.Compress(); err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets Compress: %w", err) - } - comp.Close() - comp = nil - var d *compress.Decompressor - var index *recsplit.Index - if d, err = compress.NewDecompressor(chsetDatPath); err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets changeset decompressor: %w", err) - } - if index, err = buildIndex(d, chsetIdxPath, c.dir, totalRecords); err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets changeset buildIndex: %w", err) - } - // Create bitmap files - bitmapC, err := compress.NewCompressor(context.Background(), AggregatorPrefix, bitmapDatPath, c.dir, compress.MinPatternScore, 1, log.LvlDebug) - if err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets bitmap NewCompressor: %w", err) - } - defer func() { - if bitmapC != nil { - bitmapC.Close() - } - }() - idxKeys := make([]string, len(bitmaps)) - i := 0 - var buf []byte - for key := range bitmaps { - idxKeys[i] = key - i++ - } - slices.Sort(idxKeys) - for _, key := range idxKeys { - if err = bitmapC.AddUncompressedWord([]byte(key)); err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets bitmap add key: %w", err) - } - bitmap := bitmaps[key] - ef := eliasfano32.NewEliasFano(bitmap.GetCardinality(), bitmap.Maximum()) - it := bitmap.Iterator() - for it.HasNext() { - v := it.Next() - ef.AddOffset(v) - } - ef.Build() - buf = ef.AppendBytes(buf[:0]) - if err = bitmapC.AddUncompressedWord(buf); err != nil { - return nil, nil, 
nil, nil, fmt.Errorf("produceChangeSets bitmap add val: %w", err) - } - } - if err = bitmapC.Compress(); err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets bitmap Compress: %w", err) - } - bitmapC.Close() - bitmapC = nil - bitmapD, err := compress.NewDecompressor(bitmapDatPath) - if err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets bitmap decompressor: %w", err) - } - - bitmapI, err := buildIndex(bitmapD, bitmapIdxPath, c.dir, len(idxKeys)) - if err != nil { - return nil, nil, nil, nil, fmt.Errorf("produceChangeSets bitmap buildIndex: %w", err) - } - return d, index, bitmapD, bitmapI, nil -} - -// aggregateToBtree iterates over all available changes in the change files covered by this instance `c` -// (there are 3 of them, one for "keys", one for values "before" every change, and one for values "after" every change) -// and create a B-tree where each key is only represented once, with the value corresponding to the "after" value -// of the latest change. -func (c *Changes) aggregateToBtree(bt *btree.BTreeG[*AggregateItem], prefixLen int, commitMerge commitmentMerger) error { - var b bool - var e error - var key, before, after []byte - var ai AggregateItem - var prefix []byte - // Note that the following loop iterates over transactions forwards, therefore it replace entries in the B-tree - for b, _, e = c.nextTx(); b && e == nil; b, _, e = c.nextTx() { - // Within each transaction, keys are unique, but they can appear in any order - for key, before, after, b, e = c.nextTriple(key[:0], before[:0], after[:0]); b && e == nil; key, before, after, b, e = c.nextTriple(key[:0], before[:0], after[:0]) { - if prefixLen > 0 && !bytes.Equal(prefix, key[:prefixLen]) { - prefix = common.Copy(key[:prefixLen]) - item := &AggregateItem{k: prefix, count: 0} - bt.ReplaceOrInsert(item) - } - - ai.k = key - i, ok := bt.Get(&ai) - if !ok || i == nil { - item := &AggregateItem{k: common.Copy(key), v: common.Copy(after), count: 1} - bt.ReplaceOrInsert(item) - continue - } - - item := i - if commitMerge != nil { - mergedVal, err := commitMerge(item.v, after, nil) - if err != nil { - return fmt.Errorf("merge branches (%T) : %w", commitMerge, err) - } - //fmt.Printf("aggregateToBtree prefix [%x], [%x]+[%x]=>[%x]\n", commitment.CompactToHex(key), after, item.v, mergedVal) - item.v = mergedVal - } else { - item.v = common.Copy(after) - } - item.count++ - } - if e != nil { - return fmt.Errorf("aggregateToBtree nextTriple: %w", e) - } - } - if e != nil { - return fmt.Errorf("aggregateToBtree prevTx: %w", e) - } - return nil -} - -const AggregatorPrefix = "aggregator" - -func btreeToFile(bt *btree.BTreeG[*AggregateItem], datPath, tmpdir string, trace bool, workers int) (int, error) { - comp, err := compress.NewCompressor(context.Background(), AggregatorPrefix, datPath, tmpdir, compress.MinPatternScore, workers, log.LvlDebug) - if err != nil { - return 0, err - } - defer comp.Close() - comp.SetTrace(trace) - count := 0 - bt.Ascend(func(item *AggregateItem) bool { - //fmt.Printf("btreeToFile %s [%x]=>[%x]\n", datPath, item.k, item.v) - if err = comp.AddUncompressedWord(item.k); err != nil { - return false - } - count++ // Only counting keys, not values - if err = comp.AddUncompressedWord(item.v); err != nil { - return false - } - return true - }) - if err != nil { - return 0, err - } - if err = comp.Compress(); err != nil { - return 0, err - } - return count, nil -} - -type ChangesItem struct { - endBlock uint64 - startBlock uint64 - fileCount int -} - -func (i *ChangesItem) 
Less(than btree.Item) bool { - if i.endBlock == than.(*ChangesItem).endBlock { - // Larger intevals will come last - return i.startBlock > than.(*ChangesItem).startBlock - } - return i.endBlock < than.(*ChangesItem).endBlock -} - -type byEndBlockItem struct { - decompressor *compress.Decompressor - getter *compress.Getter // reader for the decompressor - getterMerge *compress.Getter // reader for the decompressor used in the background merge thread - index *recsplit.Index - indexReader *recsplit.IndexReader // reader for the index - readerMerge *recsplit.IndexReader // index reader for the background merge thread - tree *btree.BTreeG[*AggregateItem] // Substitute for decompressor+index combination - startBlock uint64 - endBlock uint64 -} - -func ByEndBlockItemLess(i, than *byEndBlockItem) bool { - if i.endBlock == than.endBlock { - return i.startBlock > than.startBlock - } - return i.endBlock < than.endBlock -} - -func (i *byEndBlockItem) Less(than btree.Item) bool { - if i.endBlock == than.(*byEndBlockItem).endBlock { - return i.startBlock > than.(*byEndBlockItem).startBlock - } - return i.endBlock < than.(*byEndBlockItem).endBlock -} - -func (a *Aggregator) scanStateFiles(files []fs.DirEntry) { - typeStrings := make([]string, NumberOfTypes) - for fType := FileType(0); fType < NumberOfTypes; fType++ { - typeStrings[fType] = fType.String() - } - re := regexp.MustCompile("^(" + strings.Join(typeStrings, "|") + ").([0-9]+)-([0-9]+).(dat|idx)$") - var err error - for _, f := range files { - name := f.Name() - subs := re.FindStringSubmatch(name) - if len(subs) != 5 { - if len(subs) != 0 { - log.Warn("File ignored by aggregator, more than 4 submatches", "name", name, "submatches", len(subs)) - } - continue - } - var startBlock, endBlock uint64 - if startBlock, err = strconv.ParseUint(subs[2], 10, 64); err != nil { - log.Warn("File ignored by aggregator, parsing startBlock", "error", err, "name", name) - continue - } - if endBlock, err = strconv.ParseUint(subs[3], 10, 64); err != nil { - log.Warn("File ignored by aggregator, parsing endBlock", "error", err, "name", name) - continue - } - if startBlock > endBlock { - log.Warn("File ignored by aggregator, startBlock > endBlock", "name", name) - continue - } - fType, ok := ParseFileType(subs[1]) - if !ok { - log.Warn("File ignored by aggregator, type unknown", "type", subs[1]) - } - var item = &byEndBlockItem{startBlock: startBlock, endBlock: endBlock} - var foundI *byEndBlockItem - a.files[fType].AscendGreaterOrEqual(&byEndBlockItem{startBlock: endBlock, endBlock: endBlock}, func(i btree.Item) bool { - it := i.(*byEndBlockItem) - if it.endBlock == endBlock { - foundI = it - } - return false - }) - if foundI == nil || foundI.startBlock > startBlock { - log.Info("Load state file", "name", name, "type", fType.String(), "startBlock", startBlock, "endBlock", endBlock) - a.files[fType].ReplaceOrInsert(item) - } - } -} - -func NewAggregator(diffDir string, unwindLimit uint64, aggregationStep uint64, changesets, commitments bool, minArch uint64, trie commitment.Trie, tx kv.RwTx) (*Aggregator, error) { - a := &Aggregator{ - diffDir: diffDir, - unwindLimit: unwindLimit, - aggregationStep: aggregationStep, - tracedKeys: map[string]struct{}{}, - keccak: sha3.NewLegacyKeccak256(), - hph: trie, - aggChannel: make(chan *AggregationTask, 1024), - aggError: make(chan error, 1), - mergeChannel: make(chan struct{}, 1), - mergeError: make(chan error, 1), - historyChannel: make(chan struct{}, 1), - historyError: make(chan error, 1), - changesets: changesets, - 
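
// Editor's note: a standalone illustration (not from the original file) of the state-file
// naming convention that scanStateFiles parses above: "<type>.<startBlock>-<endBlock>.<dat|idx>",
// e.g. "account.1-16.dat". The helper is hypothetical and simplified to the four state types;
// the real code builds its regexp from FileType.String() and also accepts the history/bitmap types.
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

var stateFileRe = regexp.MustCompile(`^(account|storage|code|commitment)\.([0-9]+)-([0-9]+)\.(dat|idx)$`)

func parseStateFileName(name string) (typ string, startBlock, endBlock uint64, ok bool) {
	subs := stateFileRe.FindStringSubmatch(name)
	if len(subs) != 5 {
		return "", 0, 0, false
	}
	start, err1 := strconv.ParseUint(subs[2], 10, 64)
	end, err2 := strconv.ParseUint(subs[3], 10, 64)
	if err1 != nil || err2 != nil || start > end {
		return "", 0, 0, false
	}
	return subs[1], start, end, true
}

func main() {
	typ, from, to, ok := parseStateFileName("account.1-16.dat")
	fmt.Println(typ, from, to, ok) // account 1 16 true
}
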
commitments: commitments, - archHasher: murmur3.New128WithSeed(0), // TODO: Randomise salt - } - for fType := FirstType; fType < NumberOfTypes; fType++ { - a.files[fType] = btree.New(32) - } - var closeStateFiles = true // It will be set to false in case of success at the end of the function - defer func() { - // Clean up all decompressor and indices upon error - if closeStateFiles { - a.Close() - } - }() - // Scan the diff directory and create the mapping of end blocks to files - files, err := os.ReadDir(diffDir) - if err != nil { - return nil, err - } - a.scanStateFiles(files) - // Check for overlaps and holes - for fType := FirstType; fType < NumberOfTypes; fType++ { - if err := checkOverlaps(fType.String(), a.files[fType]); err != nil { - return nil, err - } - } - // Open decompressor and index files for all items in state trees - for fType := FirstType; fType < NumberOfTypes; fType++ { - if err := a.openFiles(fType, minArch); err != nil { - return nil, fmt.Errorf("opening %s state files: %w", fType.String(), err) - } - } - a.changesBtree = btree.New(32) - re := regexp.MustCompile(`^(account|storage|code|commitment).(keys|before|after).([0-9]+)-([0-9]+).chg$`) - for _, f := range files { - name := f.Name() - subs := re.FindStringSubmatch(name) - if len(subs) != 5 { - if len(subs) != 0 { - log.Warn("File ignored by changes scan, more than 4 submatches", "name", name, "submatches", len(subs)) - } - continue - } - var startBlock, endBlock uint64 - if startBlock, err = strconv.ParseUint(subs[3], 10, 64); err != nil { - log.Warn("File ignored by changes scan, parsing startBlock", "error", err, "name", name) - continue - } - if endBlock, err = strconv.ParseUint(subs[4], 10, 64); err != nil { - log.Warn("File ignored by changes scan, parsing endBlock", "error", err, "name", name) - continue - } - if startBlock > endBlock { - log.Warn("File ignored by changes scan, startBlock > endBlock", "name", name) - continue - } - if endBlock != startBlock+aggregationStep-1 { - log.Warn("File ignored by changes scan, endBlock != startBlock+aggregationStep-1", "name", name) - continue - } - var item = &ChangesItem{fileCount: 1, startBlock: startBlock, endBlock: endBlock} - i := a.changesBtree.Get(item) - if i == nil { - a.changesBtree.ReplaceOrInsert(item) - } else { - item = i.(*ChangesItem) - if item.startBlock == startBlock { - item.fileCount++ - } else { - return nil, fmt.Errorf("change files overlap [%d-%d] with [%d-%d]", item.startBlock, item.endBlock, startBlock, endBlock) - } - } - } - // Check for holes in change files - minStart := uint64(math.MaxUint64) - a.changesBtree.Descend(func(i btree.Item) bool { - item := i.(*ChangesItem) - if item.startBlock < minStart { - if item.endBlock >= minStart { - err = fmt.Errorf("overlap of change files [%d-%d] with %d", item.startBlock, item.endBlock, minStart) - return false - } - if minStart != math.MaxUint64 && item.endBlock+1 != minStart { - err = fmt.Errorf("whole in change files [%d-%d]", item.endBlock, minStart) - return false - } - minStart = item.startBlock - } else { - err = fmt.Errorf("overlap of change files [%d-%d] with %d", item.startBlock, item.endBlock, minStart) - return false - } - return true - }) - if err != nil { - return nil, err - } - for fType := FirstType; fType < NumberOfStateTypes; fType++ { - if err = checkOverlapWithMinStart(fType.String(), a.files[fType], minStart); err != nil { - return nil, err - } - } - if err = a.rebuildRecentState(tx); err != nil { - return nil, fmt.Errorf("rebuilding recent state from change files: %w", 
err) - } - closeStateFiles = false - a.aggWg.Add(1) - go a.backgroundAggregation() - a.mergeWg.Add(1) - go a.backgroundMerge() - if a.changesets { - a.historyWg.Add(1) - go a.backgroundHistoryMerge() - } - return a, nil -} - -// rebuildRecentState reads change files and reconstructs the recent state -func (a *Aggregator) rebuildRecentState(tx kv.RwTx) error { - t := time.Now() - var err error - trees := map[FileType]*btree.BTreeG[*AggregateItem]{} - - a.changesBtree.Ascend(func(i btree.Item) bool { - item := i.(*ChangesItem) - for fType := FirstType; fType < NumberOfStateTypes; fType++ { - tree, ok := trees[fType] - if !ok { - tree = btree.NewG[*AggregateItem](32, AggregateItemLess) - trees[fType] = tree - } - var changes Changes - changes.Init(fType.String(), a.aggregationStep, a.diffDir, false /* beforeOn */) - if err = changes.openFiles(item.startBlock, false /* write */); err != nil { - return false - } - var prefixLen int - if fType == Storage { - prefixLen = length.Addr - } - - var commitMerger commitmentMerger - if fType == Commitment { - commitMerger = mergeCommitments - } - - if err = changes.aggregateToBtree(tree, prefixLen, commitMerger); err != nil { - return false - } - if err = changes.closeFiles(); err != nil { - return false - } - } - return true - }) - if err != nil { - return err - } - for fType, tree := range trees { - table := fType.Table() - tree.Ascend(func(item *AggregateItem) bool { - if len(item.v) == 0 { - return true - } - var v []byte - if v, err = tx.GetOne(table, item.k); err != nil { - return false - } - if item.count != binary.BigEndian.Uint32(v[:4]) { - err = fmt.Errorf("mismatched count for %x: change file %d, db: %d", item.k, item.count, binary.BigEndian.Uint32(v[:4])) - return false - } - if !bytes.Equal(item.v, v[4:]) { - err = fmt.Errorf("mismatched v for %x: change file [%x], db: [%x]", item.k, item.v, v[4:]) - return false - } - return true - }) - } - if err != nil { - return err - } - log.Info("reconstructed recent state", "in", time.Since(t)) - return nil -} - -type AggregationTask struct { - bt [NumberOfStateTypes]*btree.BTreeG[*AggregateItem] - changes [NumberOfStateTypes]Changes - blockFrom uint64 - blockTo uint64 -} - -func (a *Aggregator) removeLocked(fType FileType, toRemove []*byEndBlockItem, item *byEndBlockItem) { - a.fileLocks[fType].Lock() - defer a.fileLocks[fType].Unlock() - if len(toRemove) > 1 { - for _, ag := range toRemove { - a.files[fType].Delete(ag) - } - a.files[fType].ReplaceOrInsert(item) - } -} - -func (a *Aggregator) removeLockedState( - accountsToRemove []*byEndBlockItem, accountsItem *byEndBlockItem, - codeToRemove []*byEndBlockItem, codeItem *byEndBlockItem, - storageToRemove []*byEndBlockItem, storageItem *byEndBlockItem, - commitmentToRemove []*byEndBlockItem, commitmentItem *byEndBlockItem, -) { - for fType := FirstType; fType < NumberOfStateTypes; fType++ { - a.fileLocks[fType].Lock() - defer a.fileLocks[fType].Unlock() - } - if len(accountsToRemove) > 1 { - for _, ag := range accountsToRemove { - a.files[Account].Delete(ag) - } - a.files[Account].ReplaceOrInsert(accountsItem) - } - if len(codeToRemove) > 1 { - for _, ag := range codeToRemove { - a.files[Code].Delete(ag) - } - a.files[Code].ReplaceOrInsert(codeItem) - } - if len(storageToRemove) > 1 { - for _, ag := range storageToRemove { - a.files[Storage].Delete(ag) - } - a.files[Storage].ReplaceOrInsert(storageItem) - } - if len(commitmentToRemove) > 1 { - for _, ag := range commitmentToRemove { - a.files[Commitment].Delete(ag) - } - 
a.files[Commitment].ReplaceOrInsert(commitmentItem) - } -} - -func removeFiles(fType FileType, diffDir string, toRemove []*byEndBlockItem) error { - // Close all the memory maps etc - for _, ag := range toRemove { - if err := ag.index.Close(); err != nil { - return fmt.Errorf("close index: %w", err) - } - if err := ag.decompressor.Close(); err != nil { - return fmt.Errorf("close decompressor: %w", err) - } - } - // Delete files - // TODO: in a non-test version, this is delayed to allow other participants to roll over to the next file - for _, ag := range toRemove { - if err := os.Remove(path.Join(diffDir, fmt.Sprintf("%s.%d-%d.dat", fType.String(), ag.startBlock, ag.endBlock))); err != nil { - return fmt.Errorf("remove decompressor file %s.%d-%d.dat: %w", fType.String(), ag.startBlock, ag.endBlock, err) - } - if err := os.Remove(path.Join(diffDir, fmt.Sprintf("%s.%d-%d.idx", fType.String(), ag.startBlock, ag.endBlock))); err != nil { - return fmt.Errorf("remove index file %s.%d-%d.idx: %w", fType.String(), ag.startBlock, ag.endBlock, err) - } - } - return nil -} - -// backgroundAggregation is the functin that runs in a background go-routine and performs creation of initial state files -// allowing the main goroutine to proceed -func (a *Aggregator) backgroundAggregation() { - defer a.aggWg.Done() - for aggTask := range a.aggChannel { - if a.changesets { - if historyD, historyI, bitmapD, bitmapI, err := aggTask.changes[Account].produceChangeSets(aggTask.blockFrom, aggTask.blockTo, AccountHistory, AccountBitmap); err == nil { - var historyItem = &byEndBlockItem{startBlock: aggTask.blockFrom, endBlock: aggTask.blockTo} - historyItem.decompressor = historyD - historyItem.index = historyI - historyItem.getter = historyItem.decompressor.MakeGetter() - historyItem.getterMerge = historyItem.decompressor.MakeGetter() - historyItem.indexReader = recsplit.NewIndexReader(historyItem.index) - historyItem.readerMerge = recsplit.NewIndexReader(historyItem.index) - a.addLocked(AccountHistory, historyItem) - var bitmapItem = &byEndBlockItem{startBlock: aggTask.blockFrom, endBlock: aggTask.blockTo} - bitmapItem.decompressor = bitmapD - bitmapItem.index = bitmapI - bitmapItem.getter = bitmapItem.decompressor.MakeGetter() - bitmapItem.getterMerge = bitmapItem.decompressor.MakeGetter() - bitmapItem.indexReader = recsplit.NewIndexReader(bitmapItem.index) - bitmapItem.readerMerge = recsplit.NewIndexReader(bitmapItem.index) - a.addLocked(AccountBitmap, bitmapItem) - } else { - a.aggError <- fmt.Errorf("produceChangeSets %s: %w", Account.String(), err) - return - } - if historyD, historyI, bitmapD, bitmapI, err := aggTask.changes[Storage].produceChangeSets(aggTask.blockFrom, aggTask.blockTo, StorageHistory, StorageBitmap); err == nil { - var historyItem = &byEndBlockItem{startBlock: aggTask.blockFrom, endBlock: aggTask.blockTo} - historyItem.decompressor = historyD - historyItem.index = historyI - historyItem.getter = historyItem.decompressor.MakeGetter() - historyItem.getterMerge = historyItem.decompressor.MakeGetter() - historyItem.indexReader = recsplit.NewIndexReader(historyItem.index) - historyItem.readerMerge = recsplit.NewIndexReader(historyItem.index) - a.addLocked(StorageHistory, historyItem) - var bitmapItem = &byEndBlockItem{startBlock: aggTask.blockFrom, endBlock: aggTask.blockTo} - bitmapItem.decompressor = bitmapD - bitmapItem.index = bitmapI - bitmapItem.getter = bitmapItem.decompressor.MakeGetter() - bitmapItem.getterMerge = bitmapItem.decompressor.MakeGetter() - bitmapItem.indexReader = 
recsplit.NewIndexReader(bitmapItem.index) - bitmapItem.readerMerge = recsplit.NewIndexReader(bitmapItem.index) - a.addLocked(StorageBitmap, bitmapItem) - } else { - a.aggError <- fmt.Errorf("produceChangeSets %s: %w", Storage.String(), err) - return - } - if historyD, historyI, bitmapD, bitmapI, err := aggTask.changes[Code].produceChangeSets(aggTask.blockFrom, aggTask.blockTo, CodeHistory, CodeBitmap); err == nil { - var historyItem = &byEndBlockItem{startBlock: aggTask.blockFrom, endBlock: aggTask.blockTo} - historyItem.decompressor = historyD - historyItem.index = historyI - historyItem.getter = historyItem.decompressor.MakeGetter() - historyItem.getterMerge = historyItem.decompressor.MakeGetter() - historyItem.indexReader = recsplit.NewIndexReader(historyItem.index) - historyItem.readerMerge = recsplit.NewIndexReader(historyItem.index) - a.addLocked(CodeHistory, historyItem) - var bitmapItem = &byEndBlockItem{startBlock: aggTask.blockFrom, endBlock: aggTask.blockTo} - bitmapItem.decompressor = bitmapD - bitmapItem.index = bitmapI - bitmapItem.getter = bitmapItem.decompressor.MakeGetter() - bitmapItem.getterMerge = bitmapItem.decompressor.MakeGetter() - bitmapItem.indexReader = recsplit.NewIndexReader(bitmapItem.index) - bitmapItem.readerMerge = recsplit.NewIndexReader(bitmapItem.index) - a.addLocked(CodeBitmap, bitmapItem) - } else { - a.aggError <- fmt.Errorf("produceChangeSets %s: %w", Code.String(), err) - return - } - } - typesLimit := Commitment - if a.commitments { - typesLimit = AccountHistory - } - for fType := FirstType; fType < typesLimit; fType++ { - var err error - if err = aggTask.changes[fType].closeFiles(); err != nil { - a.aggError <- fmt.Errorf("close %sChanges: %w", fType.String(), err) - return - } - var item = &byEndBlockItem{startBlock: aggTask.blockFrom, endBlock: aggTask.blockTo} - if item.decompressor, item.index, err = createDatAndIndex(fType.String(), a.diffDir, aggTask.bt[fType], aggTask.blockFrom, aggTask.blockTo); err != nil { - a.aggError <- fmt.Errorf("createDatAndIndex %s: %w", fType.String(), err) - return - } - item.getter = item.decompressor.MakeGetter() - item.getterMerge = item.decompressor.MakeGetter() - item.indexReader = recsplit.NewIndexReader(item.index) - item.readerMerge = recsplit.NewIndexReader(item.index) - if err = aggTask.changes[fType].deleteFiles(); err != nil { - a.aggError <- fmt.Errorf("delete %sChanges: %w", fType.String(), err) - return - } - a.addLocked(fType, item) - } - // At this point, 3 new state files (containing latest changes) has been created for accounts, code, and storage - // Corresponding items has been added to the registy of state files, and B-tree are not necessary anymore, change files can be removed - // What follows can be performed by the 2nd background goroutine - select { - case a.mergeChannel <- struct{}{}: - default: - } - select { - case a.historyChannel <- struct{}{}: - default: - } - } -} - -type CommitmentValTransform struct { - pre [NumberOfAccountStorageTypes][]*byEndBlockItem // List of state files before the merge - post [NumberOfAccountStorageTypes][]*byEndBlockItem // List of state files after the merge -} - -func decodeU64(from []byte) uint64 { - var i uint64 - for _, b := range from { - i = (i << 8) | uint64(b) - } - return i -} - -func encodeU64(i uint64, to []byte) []byte { - // writes i to b in big endian byte order, using the least number of bytes needed to represent i. 
- switch { - case i < (1 << 8): - return append(to, byte(i)) - case i < (1 << 16): - return append(to, byte(i>>8), byte(i)) - case i < (1 << 24): - return append(to, byte(i>>16), byte(i>>8), byte(i)) - case i < (1 << 32): - return append(to, byte(i>>24), byte(i>>16), byte(i>>8), byte(i)) - case i < (1 << 40): - return append(to, byte(i>>32), byte(i>>24), byte(i>>16), byte(i>>8), byte(i)) - case i < (1 << 48): - return append(to, byte(i>>40), byte(i>>32), byte(i>>24), byte(i>>16), byte(i>>8), byte(i)) - case i < (1 << 56): - return append(to, byte(i>>48), byte(i>>40), byte(i>>32), byte(i>>24), byte(i>>16), byte(i>>8), byte(i)) - default: - return append(to, byte(i>>56), byte(i>>48), byte(i>>40), byte(i>>32), byte(i>>24), byte(i>>16), byte(i>>8), byte(i)) - } -} - -// commitmentValTransform parses the value of the commitment record to extract references -// to accounts and storage items, then looks them up in the new, merged files, and replaces them with -// the updated references -func (cvt *CommitmentValTransform) commitmentValTransform(val, transValBuf commitment.BranchData) ([]byte, error) { - if len(val) == 0 { - return transValBuf, nil - } - - accountPlainKeys, storagePlainKeys, err := val.ExtractPlainKeys() - if err != nil { - return nil, err - } - transAccountPks := make([][]byte, 0, len(accountPlainKeys)) - var apkBuf, spkBuf []byte - for _, accountPlainKey := range accountPlainKeys { - if len(accountPlainKey) == length.Addr { - // Non-optimised key originating from a database record - apkBuf = append(apkBuf[:0], accountPlainKey...) - } else { - // Optimised key referencing a state file record (file number and offset within the file) - fileI := int(accountPlainKey[0]) - offset := decodeU64(accountPlainKey[1:]) - g := cvt.pre[Account][fileI].getterMerge - g.Reset(offset) - apkBuf, _ = g.Next(apkBuf[:0]) - //fmt.Printf("replacing account [%x] from [%x]\n", apkBuf, accountPlainKey) - } - // Look up apkBuf in the post account files - for j := len(cvt.post[Account]); j > 0; j-- { - item := cvt.post[Account][j-1] - if item.index.Empty() { - continue - } - offset := item.readerMerge.Lookup(apkBuf) - g := item.getterMerge - g.Reset(offset) - if g.HasNext() { - if keyMatch, _ := g.Match(apkBuf); keyMatch { - accountPlainKey = encodeU64(offset, []byte{byte(j - 1)}) - //fmt.Printf("replaced account [%x]=>[%x] for file [%d-%d]\n", apkBuf, accountPlainKey, item.startBlock, item.endBlock) - break - } else if j == 0 { - fmt.Printf("could not find replacement key [%x], file=%s.%d-%d]\n\n", apkBuf, Account.String(), item.startBlock, item.endBlock) - } - } - } - transAccountPks = append(transAccountPks, accountPlainKey) - } - - transStoragePks := make([][]byte, 0, len(storagePlainKeys)) - for _, storagePlainKey := range storagePlainKeys { - if len(storagePlainKey) == length.Addr+length.Hash { - // Non-optimised key originating from a database record - spkBuf = append(spkBuf[:0], storagePlainKey...) 
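
// Editor's note: a standalone illustration (not part of the original file) of the compact
// big-endian integer helpers above and of the "optimised" plain-key layout that
// commitmentValTransform decodes: one byte of state-file index followed by encodeU64(offset).
// minimalPut/minimalGet are hypothetical stand-ins mirroring encodeU64/decodeU64.
package main

import "fmt"

func minimalPut(i uint64, to []byte) []byte { // like encodeU64: drop leading zero bytes
	started := false
	for shift := 56; shift >= 0; shift -= 8 {
		b := byte(i >> uint(shift))
		if b != 0 {
			started = true
		}
		if started || shift == 0 {
			to = append(to, b)
		}
	}
	return to
}

func minimalGet(from []byte) (i uint64) { // like decodeU64
	for _, b := range from {
		i = (i << 8) | uint64(b)
	}
	return i
}

func main() {
	const fileIdx, offset = 3, 70000
	key := minimalPut(offset, []byte{fileIdx}) // 03 01 11 70
	fmt.Printf("optimised key: %x\n", key)
	fmt.Println(int(key[0]), minimalGet(key[1:])) // 3 70000
}
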
- } else { - // Optimised key referencing a state file record (file number and offset within the file) - fileI := int(storagePlainKey[0]) - offset := decodeU64(storagePlainKey[1:]) - g := cvt.pre[Storage][fileI].getterMerge - g.Reset(offset) - //fmt.Printf("offsetToKey storage [%x] offset=%d, file=%d-%d\n", storagePlainKey, offset, cvt.pre[Storage][fileI].startBlock, cvt.pre[Storage][fileI].endBlock) - spkBuf, _ = g.Next(spkBuf[:0]) - } - // Lookup spkBuf in the post storage files - for j := len(cvt.post[Storage]); j > 0; j-- { - item := cvt.post[Storage][j-1] - if item.index.Empty() { - continue - } - offset := item.readerMerge.Lookup(spkBuf) - g := item.getterMerge - g.Reset(offset) - if g.HasNext() { - if keyMatch, _ := g.Match(spkBuf); keyMatch { - storagePlainKey = encodeU64(offset, []byte{byte(j - 1)}) - //fmt.Printf("replacing storage [%x] => [fileI=%d, offset=%d, file=%s.%d-%d]\n", spkBuf, j-1, offset, Storage.String(), item.startBlock, item.endBlock) - break - } else if j == 0 { - fmt.Printf("could not find replacement key [%x], file=%s.%d-%d]\n\n", spkBuf, Storage.String(), item.startBlock, item.endBlock) - } - } - } - transStoragePks = append(transStoragePks, storagePlainKey) - } - if transValBuf, err = val.ReplacePlainKeys(transAccountPks, transStoragePks, transValBuf); err != nil { - return nil, err - } - return transValBuf, nil -} - -func (a *Aggregator) backgroundMerge() { - defer a.mergeWg.Done() - for range a.mergeChannel { - t := time.Now() - var err error - var cvt CommitmentValTransform - var toRemove [NumberOfStateTypes][]*byEndBlockItem - var newItems [NumberOfStateTypes]*byEndBlockItem - var blockFrom, blockTo uint64 - lastType := Code - typesLimit := Commitment - if a.commitments { - lastType = Commitment - typesLimit = AccountHistory - } - // Lock the set of commitment (or code if commitments are off) files - those are the smallest, because account, storage and code files may be added by the aggregation thread first - toRemove[lastType], _, _, blockFrom, blockTo = a.findLargestMerge(lastType, uint64(math.MaxUint64) /* maxBlockTo */, uint64(math.MaxUint64) /* maxSpan */) - - for fType := FirstType; fType < typesLimit; fType++ { - var pre, post []*byEndBlockItem - var from, to uint64 - if fType == lastType { - from = blockFrom - to = blockTo - } else { - toRemove[fType], pre, post, from, to = a.findLargestMerge(fType, blockTo, uint64(math.MaxUint64) /* maxSpan */) - if from != blockFrom { - a.mergeError <- fmt.Errorf("%sFrom %d != blockFrom %d", fType.String(), from, blockFrom) - return - } - if to != blockTo { - a.mergeError <- fmt.Errorf("%sTo %d != blockTo %d", fType.String(), to, blockTo) - return - } - } - if len(toRemove[fType]) > 1 { - var valTransform func(commitment.BranchData, commitment.BranchData) ([]byte, error) - var mergeFunc commitmentMerger - if fType == Commitment { - valTransform = cvt.commitmentValTransform - mergeFunc = mergeCommitments - } else { - mergeFunc = mergeReplace - } - var prefixLen int - if fType == Storage { - prefixLen = length.Addr - } - if newItems[fType], err = a.computeAggregation(fType, toRemove[fType], from, to, valTransform, mergeFunc, true /* valCompressed */, true /* withIndex */, prefixLen); err != nil { - a.mergeError <- fmt.Errorf("computeAggreation %s: %w", fType.String(), err) - return - } - post = append(post, newItems[fType]) - } - if fType < NumberOfAccountStorageTypes { - cvt.pre[fType] = pre - cvt.post[fType] = post - } - } - // Switch aggregator to new state files, close and remove old files - 
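
// Editor's note: a small, hypothetical sketch (not part of the original file) of the
// non-blocking wake-up pattern used between the goroutines here: backgroundAggregation signals
// a buffered channel with select/default so it never blocks, and backgroundMerge drains it
// with `for range`. Channel and function names are illustrative only.
package main

import (
	"fmt"
	"sync"
)

func main() {
	mergeCh := make(chan struct{}, 1) // capacity 1: at most one pending wake-up
	var wg sync.WaitGroup
	wg.Add(1)
	go func() { // stand-in for backgroundMerge
		defer wg.Done()
		for range mergeCh {
			fmt.Println("merge pass")
		}
	}()
	for i := 0; i < 3; i++ { // stand-in for finished aggregation tasks
		select {
		case mergeCh <- struct{}{}: // wake the merger if it is idle
		default: // a wake-up is already pending; do not block
		}
	}
	close(mergeCh)
	wg.Wait()
}
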
a.removeLockedState(toRemove[Account], newItems[Account], toRemove[Code], newItems[Code], toRemove[Storage], newItems[Storage], toRemove[Commitment], newItems[Commitment]) - removed := 0 - for fType := FirstType; fType < typesLimit; fType++ { - if len(toRemove[fType]) > 1 { - removeFiles(fType, a.diffDir, toRemove[fType]) - removed += len(toRemove[fType]) - 1 - } - } - mergeTime := time.Since(t) - if mergeTime > time.Minute { - log.Info("Long merge", "from", blockFrom, "to", blockTo, "files", removed, "time", time.Since(t)) - } - } -} - -func (a *Aggregator) reduceHistoryFiles(fType FileType, item *byEndBlockItem) error { - datTmpPath := filepath.Join(a.diffDir, fmt.Sprintf("%s.%d-%d.dat.tmp", fType.String(), item.startBlock, item.endBlock)) - datPath := filepath.Join(a.diffDir, fmt.Sprintf("%s.%d-%d.dat", fType.String(), item.startBlock, item.endBlock)) - idxPath := filepath.Join(a.diffDir, fmt.Sprintf("%s.%d-%d.idx", fType.String(), item.startBlock, item.endBlock)) - comp, err := compress.NewCompressor(context.Background(), AggregatorPrefix, datTmpPath, a.diffDir, compress.MinPatternScore, 1, log.LvlDebug) - if err != nil { - return fmt.Errorf("reduceHistoryFiles create compressor %s: %w", datPath, err) - } - defer comp.Close() - g := item.getter - var val []byte - var count int - g.Reset(0) - var key []byte - for g.HasNext() { - g.Skip() // Skip key on on the first pass - val, _ = g.Next(val[:0]) - //fmt.Printf("reduce1 [%s.%d-%d] [%x]=>[%x]\n", fType.String(), item.startBlock, item.endBlock, key, val) - if err = comp.AddWord(val); err != nil { - return fmt.Errorf("reduceHistoryFiles AddWord: %w", err) - } - count++ - } - if err = comp.Compress(); err != nil { - return fmt.Errorf("reduceHistoryFiles compress: %w", err) - } - var d *compress.Decompressor - if d, err = compress.NewDecompressor(datTmpPath); err != nil { - return fmt.Errorf("reduceHistoryFiles create decompressor: %w", err) - } - var rs *recsplit.RecSplit - if rs, err = recsplit.NewRecSplit(recsplit.RecSplitArgs{ - KeyCount: count, - Enums: false, - BucketSize: 2000, - LeafSize: 8, - TmpDir: a.diffDir, - IndexFile: idxPath, - }); err != nil { - return fmt.Errorf("reduceHistoryFiles NewRecSplit: %w", err) - } - rs.LogLvl(log.LvlDebug) - - g1 := d.MakeGetter() - for { - g.Reset(0) - g1.Reset(0) - var lastOffset uint64 - for g.HasNext() { - key, _ = g.Next(key[:0]) - g.Skip() // Skip value - _, pos := g1.Next(nil) - //fmt.Printf("reduce2 [%s.%d-%d] [%x]==>%d\n", fType.String(), item.startBlock, item.endBlock, key, lastOffset) - if err = rs.AddKey(key, lastOffset); err != nil { - return fmt.Errorf("reduceHistoryFiles %p AddKey: %w", rs, err) - } - lastOffset = pos - } - if err = rs.Build(); err != nil { - if rs.Collision() { - log.Info("Building reduceHistoryFiles. Collision happened. It's ok. 
Restarting...") - rs.ResetNextSalt() - } else { - return fmt.Errorf("reduceHistoryFiles Build: %w", err) - } - } else { - break - } - } - if err = item.decompressor.Close(); err != nil { - return fmt.Errorf("reduceHistoryFiles close decompressor: %w", err) - } - if err = os.Remove(datPath); err != nil { - return fmt.Errorf("reduceHistoryFiles remove: %w", err) - } - if err = os.Rename(datTmpPath, datPath); err != nil { - return fmt.Errorf("reduceHistoryFiles rename: %w", err) - } - if item.decompressor, err = compress.NewDecompressor(datPath); err != nil { - return fmt.Errorf("reduceHistoryFiles create new decompressor: %w", err) - } - item.getter = item.decompressor.MakeGetter() - item.getterMerge = item.decompressor.MakeGetter() - if item.index, err = recsplit.OpenIndex(idxPath); err != nil { - return fmt.Errorf("reduceHistoryFiles open index: %w", err) - } - item.indexReader = recsplit.NewIndexReader(item.index) - item.readerMerge = recsplit.NewIndexReader(item.index) - return nil -} - -type commitmentMerger func(prev, current, target commitment.BranchData) (commitment.BranchData, error) - -func mergeReplace(preval, val, buf commitment.BranchData) (commitment.BranchData, error) { - return append(buf, val...), nil -} - -func mergeBitmaps(preval, val, buf commitment.BranchData) (commitment.BranchData, error) { - preef, _ := eliasfano32.ReadEliasFano(preval) - ef, _ := eliasfano32.ReadEliasFano(val) - //fmt.Printf("mergeBitmaps [%x] (count=%d,max=%d) + [%x] (count=%d,max=%d)\n", preval, preef.Count(), preef.Max(), val, ef.Count(), ef.Max()) - preIt := preef.Iterator() - efIt := ef.Iterator() - newEf := eliasfano32.NewEliasFano(preef.Count()+ef.Count(), ef.Max()) - for preIt.HasNext() { - v, _ := preIt.Next() - newEf.AddOffset(v) - } - for efIt.HasNext() { - v, _ := efIt.Next() - newEf.AddOffset(v) - } - newEf.Build() - return newEf.AppendBytes(buf), nil -} - -func mergeCommitments(preval, val, buf commitment.BranchData) (commitment.BranchData, error) { - return preval.MergeHexBranches(val, buf) -} - -func (a *Aggregator) backgroundHistoryMerge() { - defer a.historyWg.Done() - for range a.historyChannel { - t := time.Now() - var err error - var toRemove [NumberOfTypes][]*byEndBlockItem - var newItems [NumberOfTypes]*byEndBlockItem - var blockFrom, blockTo uint64 - // Lock the set of commitment files - those are the smallest, because account, storage and code files may be added by the aggregation thread first - toRemove[CodeBitmap], _, _, blockFrom, blockTo = a.findLargestMerge(CodeBitmap, uint64(math.MaxUint64) /* maxBlockTo */, 500_000 /* maxSpan */) - - finalMerge := blockTo-blockFrom+1 == 500_000 - for fType := AccountHistory; fType < NumberOfTypes; fType++ { - var from, to uint64 - if fType == CodeBitmap { - from = blockFrom - to = blockTo - } else { - toRemove[fType], _, _, from, to = a.findLargestMerge(fType, blockTo, 500_000 /* maxSpan */) - if from != blockFrom { - a.historyError <- fmt.Errorf("%sFrom %d != blockFrom %d", fType.String(), from, blockFrom) - return - } - if to != blockTo { - a.historyError <- fmt.Errorf("%sTo %d != blockTo %d", fType.String(), to, blockTo) - return - } - } - if len(toRemove[fType]) > 1 { - isBitmap := fType == AccountBitmap || fType == StorageBitmap || fType == CodeBitmap - - var mergeFunc commitmentMerger - switch { - case isBitmap: - mergeFunc = mergeBitmaps - case fType == Commitment: - mergeFunc = mergeCommitments - default: - mergeFunc = mergeReplace - } - - if newItems[fType], err = a.computeAggregation(fType, toRemove[fType], from, to, nil /* 
valTransform */, mergeFunc, - !isBitmap /* valCompressed */, !finalMerge || isBitmap /* withIndex */, 0 /* prefixLen */); err != nil { - a.historyError <- fmt.Errorf("computeAggreation %s: %w", fType.String(), err) - return - } - } - } - if finalMerge { - // Special aggregation for blockTo - blockFrom + 1 == 500_000 - // Remove keys from the .dat files assuming that they will only be used after querying the bitmap index - // and therefore, there is no situation where non-existent key is queried. - if err = a.reduceHistoryFiles(AccountHistory, newItems[AccountHistory]); err != nil { - a.historyError <- fmt.Errorf("reduceHistoryFiles %s: %w", AccountHistory.String(), err) - return - } - if err = a.reduceHistoryFiles(StorageHistory, newItems[StorageHistory]); err != nil { - a.historyError <- fmt.Errorf("reduceHistoryFiles %s: %w", StorageHistory.String(), err) - return - } - if err = a.reduceHistoryFiles(CodeHistory, newItems[CodeHistory]); err != nil { - a.historyError <- fmt.Errorf("reduceHistoryFiles %s: %w", CodeHistory.String(), err) - return - } - } - for fType := AccountHistory; fType < NumberOfTypes; fType++ { - a.removeLocked(fType, toRemove[fType], newItems[fType]) - } - removed := 0 - for fType := AccountHistory; fType < NumberOfTypes; fType++ { - if len(toRemove[fType]) > 1 { - removeFiles(fType, a.diffDir, toRemove[fType]) - removed += len(toRemove[fType]) - 1 - } - } - mergeTime := time.Since(t) - if mergeTime > time.Minute { - log.Info("Long history merge", "from", blockFrom, "to", blockTo, "files", removed, "time", time.Since(t)) - } - } -} - -// checkOverlaps does not lock tree, because it is only called from the constructor of aggregator -func checkOverlaps(treeName string, tree *btree.BTree) error { - var minStart uint64 = math.MaxUint64 - var err error - tree.Descend(func(i btree.Item) bool { - item := i.(*byEndBlockItem) - if item.startBlock < minStart { - if item.endBlock >= minStart { - err = fmt.Errorf("overlap of %s state files [%d-%d] with %d", treeName, item.startBlock, item.endBlock, minStart) - return false - } - if minStart != math.MaxUint64 && item.endBlock+1 != minStart { - err = fmt.Errorf("hole in %s state files [%d-%d]", treeName, item.endBlock, minStart) - return false - } - minStart = item.startBlock - } - return true - }) - return err -} - -func (a *Aggregator) openFiles(fType FileType, minArch uint64) error { - var err error - var totalKeys uint64 - a.files[fType].Ascend(func(i btree.Item) bool { - item := i.(*byEndBlockItem) - if item.decompressor, err = compress.NewDecompressor(path.Join(a.diffDir, fmt.Sprintf("%s.%d-%d.dat", fType.String(), item.startBlock, item.endBlock))); err != nil { - return false - } - if item.index, err = recsplit.OpenIndex(path.Join(a.diffDir, fmt.Sprintf("%s.%d-%d.idx", fType.String(), item.startBlock, item.endBlock))); err != nil { - return false - } - totalKeys += item.index.KeyCount() - item.getter = item.decompressor.MakeGetter() - item.getterMerge = item.decompressor.MakeGetter() - item.indexReader = recsplit.NewIndexReader(item.index) - item.readerMerge = recsplit.NewIndexReader(item.index) - return true - }) - if fType >= NumberOfStateTypes { - return nil - } - log.Info("Creating arch...", "type", fType.String(), "total keys in all state files", totalKeys) - // Allocate arch of double of total keys - n := totalKeys * 2 - if n < minArch { - n = minArch - } - a.arches[fType] = make([]uint32, n) - arch := a.arches[fType] - var key []byte - h := a.archHasher - collisions := 0 - a.files[fType].Ascend(func(i btree.Item) bool 
{ - item := i.(*byEndBlockItem) - g := item.getter - g.Reset(0) - blockNum := uint32(item.endBlock) - for g.HasNext() { - key, _ = g.Next(key[:0]) - h.Reset() - h.Write(key) //nolint:errcheck - p, _ := h.Sum128() - p = p % n - if arch[p] != 0 { - collisions++ - } - arch[p] = blockNum - g.Skip() - } - return true - }) - log.Info("Created arch", "type", fType.String(), "collisions", collisions) - return err -} - -func (a *Aggregator) closeFiles(fType FileType) { - a.fileLocks[fType].Lock() - defer a.fileLocks[fType].Unlock() - a.files[fType].Ascend(func(i btree.Item) bool { - item := i.(*byEndBlockItem) - if item.decompressor != nil { - item.decompressor.Close() - } - if item.index != nil { - item.index.Close() - } - return true - }) -} - -func (a *Aggregator) Close() { - close(a.aggChannel) - a.aggWg.Wait() // Need to wait for the background aggregation to finish because it sends to merge channels - // Drain channel before closing - select { - case <-a.mergeChannel: - default: - } - close(a.mergeChannel) - if a.changesets { - // Drain channel before closing - select { - case <-a.historyChannel: - default: - } - close(a.historyChannel) - a.historyWg.Wait() - } - a.mergeWg.Wait() - // Closing state files only after background aggregation goroutine is finished - for fType := FirstType; fType < NumberOfTypes; fType++ { - a.closeFiles(fType) - } -} - -// checkOverlapWithMinStart does not need to lock tree lock, because it is only used in the constructor of Aggregator -func checkOverlapWithMinStart(treeName string, tree *btree.BTree, minStart uint64) error { - if lastStateI := tree.Max(); lastStateI != nil { - item := lastStateI.(*byEndBlockItem) - if minStart != math.MaxUint64 && item.endBlock+1 != minStart { - return fmt.Errorf("hole or overlap between %s state files and change files [%d-%d]", treeName, item.endBlock, minStart) - } - } - return nil -} - -func (a *Aggregator) readFromFiles(fType FileType, lock bool, blockNum uint64, filekey []byte, trace bool) ([]byte, uint64) { - if lock { - if fType == Commitment { - for lockFType := FirstType; lockFType < NumberOfStateTypes; lockFType++ { - a.fileLocks[lockFType].RLock() - defer a.fileLocks[lockFType].RUnlock() - } - } else { - a.fileLocks[fType].RLock() - defer a.fileLocks[fType].RUnlock() - } - } - h := a.archHasher - arch := a.arches[fType] - n := uint64(len(arch)) - if n > 0 { - h.Reset() - h.Write(filekey) //nolint:errcheck - p, _ := h.Sum128() - p = p % n - v := uint64(atomic.LoadUint32(&arch[p])) - //fmt.Printf("Reading from %s arch key [%x]=%d, %d\n", fType.String(), filekey, p, arch[p]) - if v == 0 { - return nil, 0 - } - a.files[fType].AscendGreaterOrEqual(&byEndBlockItem{startBlock: v, endBlock: v}, func(i btree.Item) bool { - item := i.(*byEndBlockItem) - if item.endBlock < blockNum { - blockNum = item.endBlock - } - return false - }) - } - var val []byte - var startBlock uint64 - a.files[fType].DescendLessOrEqual(&byEndBlockItem{endBlock: blockNum}, func(i btree.Item) bool { - item := i.(*byEndBlockItem) - if trace { - fmt.Printf("read %s %x: search in file [%d-%d]\n", fType.String(), filekey, item.startBlock, item.endBlock) - } - if item.tree != nil { - ai, ok := item.tree.Get(&AggregateItem{k: filekey}) - if !ok { - return true - } - if ai == nil { - return true - } - val = ai.v - startBlock = item.startBlock - - return false - } - if item.index.Empty() { - return true - } - offset := item.indexReader.Lookup(filekey) - g := item.getter - g.Reset(offset) - if g.HasNext() { - if keyMatch, _ := g.Match(filekey); keyMatch { - val, 
_ = g.Next(nil) - if trace { - fmt.Printf("read %s %x: found [%x] in file [%d-%d]\n", fType.String(), filekey, val, item.startBlock, item.endBlock) - } - startBlock = item.startBlock - atomic.AddUint64(&a.fileHits, 1) - return false - } - } - atomic.AddUint64(&a.fileMisses, 1) - return true - }) - - if fType == Commitment { - // Transform references - if len(val) > 0 { - accountPlainKeys, storagePlainKeys, err := commitment.BranchData(val).ExtractPlainKeys() - if err != nil { - panic(fmt.Errorf("value %x: %w", val, err)) - } - var transAccountPks [][]byte - var transStoragePks [][]byte - for _, accountPlainKey := range accountPlainKeys { - var apkBuf []byte - if len(accountPlainKey) == length.Addr { - // Non-optimised key originating from a database record - apkBuf = accountPlainKey - } else { - // Optimised key referencing a state file record (file number and offset within the file) - fileI := int(accountPlainKey[0]) - offset := decodeU64(accountPlainKey[1:]) - apkBuf, _ = a.readByOffset(Account, fileI, offset) - } - transAccountPks = append(transAccountPks, apkBuf) - } - for _, storagePlainKey := range storagePlainKeys { - var spkBuf []byte - if len(storagePlainKey) == length.Addr+length.Hash { - // Non-optimised key originating from a database record - spkBuf = storagePlainKey - } else { - // Optimised key referencing a state file record (file number and offset within the file) - fileI := int(storagePlainKey[0]) - offset := decodeU64(storagePlainKey[1:]) - //fmt.Printf("readbyOffset(comm file %d-%d) file=%d offset=%d\n", ii.startBlock, ii.endBlock, fileI, offset) - spkBuf, _ = a.readByOffset(Storage, fileI, offset) - } - transStoragePks = append(transStoragePks, spkBuf) - } - if val, err = commitment.BranchData(val).ReplacePlainKeys(transAccountPks, transStoragePks, nil); err != nil { - panic(err) - } - } - } - return val, startBlock -} - -// readByOffset is assumed to be invoked under a read lock -func (a *Aggregator) readByOffset(fType FileType, fileI int, offset uint64) ([]byte, []byte) { - var key, val []byte - fi := 0 - a.files[fType].Ascend(func(i btree.Item) bool { - if fi < fileI { - fi++ - return true - } - item := i.(*byEndBlockItem) - //fmt.Printf("fileI=%d, file=%s.%d-%d\n", fileI, fType.String(), item.startBlock, item.endBlock) - g := item.getter - g.Reset(offset) - key, _ = g.Next(nil) - val, _ = g.Next(nil) - - return false - }) - return key, val -} - -func (a *Aggregator) MakeStateReader(blockNum uint64, tx kv.Tx) *Reader { - r := &Reader{ - a: a, - blockNum: blockNum, - tx: tx, - } - return r -} - -type Reader struct { - a *Aggregator - tx kv.Getter - blockNum uint64 -} - -func (r *Reader) ReadAccountData(addr []byte, trace bool) ([]byte, error) { - v, err := r.tx.GetOne(kv.StateAccounts, addr) - if err != nil { - return nil, err - } - if v != nil { - return v[4:], nil - } - v, _ = r.a.readFromFiles(Account, true /* lock */, r.blockNum, addr, trace) - return v, nil -} - -func (r *Reader) ReadAccountStorage(addr []byte, loc []byte, trace bool) ([]byte, error) { - // Look in the summary table first - dbkey := make([]byte, len(addr)+len(loc)) - copy(dbkey[0:], addr) - copy(dbkey[len(addr):], loc) - v, err := r.tx.GetOne(kv.StateStorage, dbkey) - if err != nil { - return nil, err - } - if v != nil { - if len(v) == 4 { - return nil, nil - } - return v[4:], nil - } - v, _ = r.a.readFromFiles(Storage, true /* lock */, r.blockNum, dbkey, trace) - return v, nil -} - -func (r *Reader) ReadAccountCode(addr []byte, trace bool) ([]byte, error) { - // Look in the summary table first 
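Every Reader and Writer method in this file shares one value convention for the summary tables (kv.StateAccounts, kv.StateStorage, kv.StateCode, kv.StateCommitment): the stored value is a 4-byte big-endian counter (how many not-yet-merged 1-block diff files still contain the key) followed by the payload, so `v[4:]` is the data and a bare 4-byte value is a deletion marker. A minimal sketch of that layout, with illustrative helper names that are not part of the aggregator:

package main

import (
	"encoding/binary"
	"fmt"
)

// encodeSummaryValue prepends the 4-byte big-endian counter to the payload,
// the shape the Writer methods store into the summary tables.
func encodeSummaryValue(counter uint32, payload []byte) []byte {
	v := make([]byte, 4+len(payload))
	binary.BigEndian.PutUint32(v[:4], counter)
	copy(v[4:], payload)
	return v
}

// decodeSummaryValue splits a value back into counter and payload; a value of
// exactly 4 bytes carries an empty payload, which the readers treat as "deleted".
func decodeSummaryValue(v []byte) (uint32, []byte) {
	counter := binary.BigEndian.Uint32(v[:4])
	if len(v) == 4 {
		return counter, nil
	}
	return counter, v[4:]
}

func main() {
	v := encodeSummaryValue(3, []byte{0xca, 0xfe})
	c, p := decodeSummaryValue(v)
	fmt.Printf("counter=%d payload=%x\n", c, p) // counter=3 payload=cafe
}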
- v, err := r.tx.GetOne(kv.StateCode, addr) - if err != nil { - return nil, err - } - if v != nil { - if len(v) == 4 { - return nil, nil - } - return v[4:], nil - } - // Look in the files - v, _ = r.a.readFromFiles(Code, true /* lock */, r.blockNum, addr, trace) - return v, nil -} - -func (r *Reader) ReadAccountCodeSize(addr []byte, trace bool) (int, error) { - // Look in the summary table first - v, err := r.tx.GetOne(kv.StateCode, addr) - if err != nil { - return 0, err - } - if v != nil { - return len(v) - 4, nil - } - // Look in the files. TODO - use specialised function to only lookup size - v, _ = r.a.readFromFiles(Code, true /* lock */, r.blockNum, addr, trace) - return len(v), nil -} - -type Writer struct { - tx kv.RwTx - a *Aggregator - commTree *btree.BTreeG[*CommitmentItem] // BTree used for gathering commitment data - changes [NumberOfStateTypes]Changes - blockNum uint64 - changeFileNum uint64 // Block number associated with the current change files. It is the last block number whose changes will go into that file -} - -func (a *Aggregator) MakeStateWriter(beforeOn bool) *Writer { - w := &Writer{ - a: a, - commTree: btree.NewG[*CommitmentItem](32, commitmentItemLess), - } - for fType := FirstType; fType < NumberOfStateTypes; fType++ { - w.changes[fType].Init(fType.String(), a.aggregationStep, a.diffDir, w.a.changesets && fType != Commitment /* we do not unwind commitment ? */) - } - return w -} - -func (w *Writer) Close() { - typesLimit := Commitment - if w.a.commitments { - typesLimit = AccountHistory - } - for fType := FirstType; fType < typesLimit; fType++ { - w.changes[fType].closeFiles() - } -} - -func (w *Writer) Reset(blockNum uint64, tx kv.RwTx) error { - w.tx = tx - w.blockNum = blockNum - typesLimit := Commitment - if w.a.commitments { - typesLimit = AccountHistory - } - if blockNum > w.changeFileNum { - for fType := FirstType; fType < typesLimit; fType++ { - if err := w.changes[fType].closeFiles(); err != nil { - return err - } - } - if w.changeFileNum != 0 { - w.a.changesBtree.ReplaceOrInsert(&ChangesItem{startBlock: w.changeFileNum + 1 - w.a.aggregationStep, endBlock: w.changeFileNum, fileCount: 12}) - } - } - if w.changeFileNum == 0 || blockNum > w.changeFileNum { - for fType := FirstType; fType < typesLimit; fType++ { - if err := w.changes[fType].openFiles(blockNum, true /* write */); err != nil { - return err - } - } - w.changeFileNum = blockNum - (blockNum % w.a.aggregationStep) + w.a.aggregationStep - 1 - } - return nil -} - -type CommitmentItem struct { - plainKey []byte - hashedKey []byte - u commitment.Update -} - -func commitmentItemLess(i, j *CommitmentItem) bool { - return bytes.Compare(i.hashedKey, j.hashedKey) < 0 -} -func (i *CommitmentItem) Less(than btree.Item) bool { - return bytes.Compare(i.hashedKey, than.(*CommitmentItem).hashedKey) < 0 -} - -func (w *Writer) branchFn(prefix []byte) ([]byte, error) { - for lockFType := FirstType; lockFType < NumberOfStateTypes; lockFType++ { - w.a.fileLocks[lockFType].RLock() - defer w.a.fileLocks[lockFType].RUnlock() - } - // Look in the summary table first - mergedVal, err := w.tx.GetOne(kv.StateCommitment, prefix) - if err != nil { - return nil, err - } - if mergedVal != nil { - mergedVal = mergedVal[4:] - } - // Look in the files and merge, while it becomes complete - var startBlock = w.blockNum + 1 - for mergedVal == nil || !commitment.BranchData(mergedVal).IsComplete() { - if startBlock == 0 { - panic(fmt.Sprintf("Incomplete branch data prefix [%x], mergeVal=[%x], startBlock=%d\n", 
commitment.CompactedKeyToHex(prefix), mergedVal, startBlock)) - } - var val commitment.BranchData - val, startBlock = w.a.readFromFiles(Commitment, false /* lock */, startBlock-1, prefix, false /* trace */) - if val == nil { - if mergedVal == nil { - return nil, nil - } - panic(fmt.Sprintf("Incomplete branch data prefix [%x], mergeVal=[%x], startBlock=%d\n", commitment.CompactedKeyToHex(prefix), mergedVal, startBlock)) - } - var err error - //fmt.Printf("Pre-merge prefix [%x] [%x]+[%x], startBlock %d\n", commitment.CompactToHex(prefix), val, mergedVal, startBlock) - if mergedVal == nil { - mergedVal = val - } else if mergedVal, err = val.MergeHexBranches(mergedVal, nil); err != nil { - return nil, err - } - //fmt.Printf("Post-merge prefix [%x] [%x], startBlock %d\n", commitment.CompactToHex(prefix), mergedVal, startBlock) - } - if mergedVal == nil { - return nil, nil - } - //fmt.Printf("Returning branch data prefix [%x], mergeVal=[%x], startBlock=%d\n", commitment.CompactToHex(prefix), mergedVal, startBlock) - return mergedVal[2:], nil // Skip touchMap but keep afterMap -} - -func bytesToUint64(buf []byte) (x uint64) { - for i, b := range buf { - x = x<<8 + uint64(b) - if i == 7 { - return - } - } - return -} - -func (w *Writer) accountFn(plainKey []byte, cell *commitment.Cell) error { - // Look in the summary table first - enc, err := w.tx.GetOne(kv.StateAccounts, plainKey) - if err != nil { - return err - } - if enc != nil { - enc = enc[4:] - } else { - // Look in the files - enc, _ = w.a.readFromFiles(Account, true /* lock */, w.blockNum, plainKey, false /* trace */) - } - cell.Nonce = 0 - cell.Balance.Clear() - copy(cell.CodeHash[:], commitment.EmptyCodeHash) - - if len(enc) > 0 { - pos := 0 - nonceBytes := int(enc[pos]) - pos++ - if nonceBytes > 0 { - cell.Nonce = bytesToUint64(enc[pos : pos+nonceBytes]) - pos += nonceBytes - } - balanceBytes := int(enc[pos]) - pos++ - if balanceBytes > 0 { - cell.Balance.SetBytes(enc[pos : pos+balanceBytes]) - } - } - enc, err = w.tx.GetOne(kv.StateCode, plainKey) - if err != nil { - return err - } - if enc != nil { - enc = enc[4:] - } else { - // Look in the files - enc, _ = w.a.readFromFiles(Code, true /* lock */, w.blockNum, plainKey, false /* trace */) - } - if len(enc) > 0 { - w.a.keccak.Reset() - w.a.keccak.Write(enc) - w.a.keccak.(io.Reader).Read(cell.CodeHash[:]) - } - return nil -} - -func (w *Writer) storageFn(plainKey []byte, cell *commitment.Cell) error { - // Look in the summary table first - enc, err := w.tx.GetOne(kv.StateStorage, plainKey) - if err != nil { - return err - } - if enc != nil { - enc = enc[4:] - } else { - // Look in the files - enc, _ = w.a.readFromFiles(Storage, true /* lock */, w.blockNum, plainKey, false /* trace */) - } - cell.StorageLen = len(enc) - copy(cell.Storage[:], enc) - return nil -} - -func (w *Writer) captureCommitmentType(fType FileType, trace bool, f func(commTree *btree.BTreeG[*CommitmentItem], h hash.Hash, key, val []byte)) { - lastOffsetKey := 0 - lastOffsetVal := 0 - for i, offsetKey := range w.changes[fType].keys.wordOffsets { - offsetVal := w.changes[fType].after.wordOffsets[i] - key := w.changes[fType].keys.words[lastOffsetKey:offsetKey] - val := w.changes[fType].after.words[lastOffsetVal:offsetVal] - if trace { - fmt.Printf("captureCommitmentData %s [%x]=>[%x]\n", fType.String(), key, val) - } - f(w.commTree, w.a.keccak, key, val) - lastOffsetKey = offsetKey - lastOffsetVal = offsetVal - } -} - -func (w *Writer) captureCommitmentData(trace bool) { - if trace { - fmt.Printf("captureCommitmentData 
start w.commTree.Len()=%d\n", w.commTree.Len()) - } - w.captureCommitmentType(Code, trace, func(commTree *btree.BTreeG[*CommitmentItem], h hash.Hash, key, val []byte) { - h.Reset() - h.Write(key) - hashedKey := h.Sum(nil) - var c = &CommitmentItem{plainKey: common.Copy(key), hashedKey: make([]byte, len(hashedKey)*2)} - for i, b := range hashedKey { - c.hashedKey[i*2] = (b >> 4) & 0xf - c.hashedKey[i*2+1] = b & 0xf - } - c.u.Flags = commitment.CODE_UPDATE - item, found := commTree.Get(&CommitmentItem{hashedKey: c.hashedKey}) - if found && item != nil { - if item.u.Flags&commitment.BALANCE_UPDATE != 0 { - c.u.Flags |= commitment.BALANCE_UPDATE - c.u.Balance.Set(&item.u.Balance) - } - if item.u.Flags&commitment.NONCE_UPDATE != 0 { - c.u.Flags |= commitment.NONCE_UPDATE - c.u.Nonce = item.u.Nonce - } - if item.u.Flags == commitment.DELETE_UPDATE && len(val) == 0 { - c.u.Flags = commitment.DELETE_UPDATE - } else { - h.Reset() - h.Write(val) - h.(io.Reader).Read(c.u.CodeHashOrStorage[:]) - } - } else { - h.Reset() - h.Write(val) - h.(io.Reader).Read(c.u.CodeHashOrStorage[:]) - } - commTree.ReplaceOrInsert(c) - }) - w.captureCommitmentType(Account, trace, func(commTree *btree.BTreeG[*CommitmentItem], h hash.Hash, key, val []byte) { - h.Reset() - h.Write(key) - hashedKey := h.Sum(nil) - var c = &CommitmentItem{plainKey: common.Copy(key), hashedKey: make([]byte, len(hashedKey)*2)} - for i, b := range hashedKey { - c.hashedKey[i*2] = (b >> 4) & 0xf - c.hashedKey[i*2+1] = b & 0xf - } - if len(val) == 0 { - c.u.Flags = commitment.DELETE_UPDATE - } else { - c.u.DecodeForStorage(val) - c.u.Flags = commitment.BALANCE_UPDATE | commitment.NONCE_UPDATE - item, found := commTree.Get(&CommitmentItem{hashedKey: c.hashedKey}) - - if found && item != nil { - if item.u.Flags&commitment.CODE_UPDATE != 0 { - c.u.Flags |= commitment.CODE_UPDATE - copy(c.u.CodeHashOrStorage[:], item.u.CodeHashOrStorage[:]) - } - } - } - commTree.ReplaceOrInsert(c) - }) - w.captureCommitmentType(Storage, trace, func(commTree *btree.BTreeG[*CommitmentItem], h hash.Hash, key, val []byte) { - hashedKey := make([]byte, 2*length.Hash) - h.Reset() - h.Write(key[:length.Addr]) - h.(io.Reader).Read(hashedKey[:length.Hash]) - h.Reset() - h.Write(key[length.Addr:]) - h.(io.Reader).Read(hashedKey[length.Hash:]) - var c = &CommitmentItem{plainKey: common.Copy(key), hashedKey: make([]byte, len(hashedKey)*2)} - for i, b := range hashedKey { - c.hashedKey[i*2] = (b >> 4) & 0xf - c.hashedKey[i*2+1] = b & 0xf - } - c.u.ValLength = len(val) - if len(val) > 0 { - copy(c.u.CodeHashOrStorage[:], val) - } - if len(val) == 0 { - c.u.Flags = commitment.DELETE_UPDATE - } else { - c.u.Flags = commitment.STORAGE_UPDATE - } - commTree.ReplaceOrInsert(c) - }) - if trace { - fmt.Printf("captureCommitmentData end w.commTree.Len()=%d\n", w.commTree.Len()) - } -} - -// computeCommitment is computing the commitment to the state after -// the change would have been applied. 
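captureCommitmentData above always expands the keccak digest of a plain key into twice as many bytes, one hex nibble per byte, before putting the update into the commitment tree; that is the key form the hex Patricia trie works on. A small stand-alone sketch of the expansion (the function name is illustrative, and keccak-256 via sha3.NewLegacyKeccak256 is assumed here):

package main

import (
	"fmt"

	"golang.org/x/crypto/sha3"
)

// hashKeyToNibbles keccak-hashes a plain key and spreads every digest byte into two
// bytes holding its high and low nibble, matching the loop in captureCommitmentData.
func hashKeyToNibbles(plainKey []byte) []byte {
	h := sha3.NewLegacyKeccak256()
	h.Write(plainKey) //nolint:errcheck
	digest := h.Sum(nil)
	nibbles := make([]byte, 2*len(digest))
	for i, b := range digest {
		nibbles[i*2] = (b >> 4) & 0xf
		nibbles[i*2+1] = b & 0xf
	}
	return nibbles
}

func main() {
	n := hashKeyToNibbles([]byte{0xab, 0xcd})
	fmt.Printf("%d nibble bytes, first four: %x\n", len(n), n[:4]) // 64 nibble bytes, each 0x0..0xf
}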
-// It assumes that the state accessible via the aggregator has already been -// modified with the new values -// At the moment, it is specific version for hex merkle patricia tree commitment -// but it will be extended to support other types of commitments -func (w *Writer) computeCommitment(trace bool) ([]byte, error) { - if trace { - fmt.Printf("computeCommitment w.commTree.Len()=%d\n", w.commTree.Len()) - } - - plainKeys := make([][]byte, w.commTree.Len()) - hashedKeys := make([][]byte, w.commTree.Len()) - updates := make([]commitment.Update, w.commTree.Len()) - j := 0 - w.commTree.Ascend(func(item *CommitmentItem) bool { - plainKeys[j] = item.plainKey - hashedKeys[j] = item.hashedKey - updates[j] = item.u - j++ - return true - }) - - if len(plainKeys) == 0 { - return w.a.hph.RootHash() - } - - w.a.hph.Reset() - w.a.hph.ResetFns(w.branchFn, w.accountFn, w.storageFn) - w.a.hph.SetTrace(trace) - - rootHash, branchNodeUpdates, err := w.a.hph.ProcessUpdates(plainKeys, hashedKeys, updates) - if err != nil { - return nil, err - } - - for prefixStr, branchNodeUpdate := range branchNodeUpdates { - if branchNodeUpdate == nil { - continue - } - prefix := []byte(prefixStr) - var prevV []byte - var prevNum uint32 - if prevV, err = w.tx.GetOne(kv.StateCommitment, prefix); err != nil { - return nil, err - } - if prevV != nil { - prevNum = binary.BigEndian.Uint32(prevV[:4]) - } - - var original commitment.BranchData - if prevV == nil { - original, _ = w.a.readFromFiles(Commitment, true /* lock */, w.blockNum, prefix, false) - } else { - original = prevV[4:] - } - if original != nil { - // try to merge previous (original) and current (branchNodeUpdate) into one update - mergedVal, err := original.MergeHexBranches(branchNodeUpdate, nil) - if err != nil { - return nil, err - } - if w.a.trace { - fmt.Printf("computeCommitment merge [%x] [%x]+[%x]=>[%x]\n", commitment.CompactedKeyToHex(prefix), original, branchNodeUpdate, mergedVal) - } - branchNodeUpdate = mergedVal - } - - //fmt.Printf("computeCommitment set [%x] [%x]\n", commitment.CompactToHex(prefix), branchNodeUpdate) - v := make([]byte, 4+len(branchNodeUpdate)) - binary.BigEndian.PutUint32(v[:4], prevNum+1) - copy(v[4:], branchNodeUpdate) - - if err = w.tx.Put(kv.StateCommitment, prefix, v); err != nil { - return nil, err - } - if len(branchNodeUpdate) == 0 { - w.changes[Commitment].delete(prefix, original) - } else { - if prevV == nil && len(original) == 0 { - w.changes[Commitment].insert(prefix, branchNodeUpdate) - } else { - w.changes[Commitment].update(prefix, original, branchNodeUpdate) - } - } - } - - return rootHash, nil -} - -func (w *Writer) FinishTx(txNum uint64, trace bool) error { - if w.a.commitments { - w.captureCommitmentData(trace) - } - var err error - for fType := FirstType; fType < Commitment; fType++ { - if err = w.changes[fType].finish(txNum); err != nil { - return fmt.Errorf("finish %sChanges: %w", fType.String(), err) - } - } - return nil -} - -func (w *Writer) ComputeCommitment(trace bool) ([]byte, error) { - if !w.a.commitments { - return nil, fmt.Errorf("commitments turned off") - } - comm, err := w.computeCommitment(trace) - if err != nil { - return nil, fmt.Errorf("compute commitment: %w", err) - } - w.commTree.Clear(true) - if err = w.changes[Commitment].finish(w.blockNum); err != nil { - return nil, fmt.Errorf("finish commChanges: %w", err) - } - return comm, nil -} - -// Aggegate should be called to check if the aggregation is required, and -// if it is required, perform it -func (w *Writer) Aggregate(trace bool) 
error { - if w.blockNum < w.a.unwindLimit+w.a.aggregationStep-1 { - return nil - } - diff := w.blockNum - w.a.unwindLimit - if (diff+1)%w.a.aggregationStep != 0 { - return nil - } - if err := w.aggregateUpto(diff+1-w.a.aggregationStep, diff); err != nil { - return fmt.Errorf("aggregateUpto(%d, %d): %w", diff+1-w.a.aggregationStep, diff, err) - } - return nil -} - -func (w *Writer) UpdateAccountData(addr []byte, account []byte, trace bool) error { - var prevNum uint32 - prevV, err := w.tx.GetOne(kv.StateAccounts, addr) - if err != nil { - return err - } - if prevV != nil { - prevNum = binary.BigEndian.Uint32(prevV[:4]) - } - var original []byte - if prevV == nil { - original, _ = w.a.readFromFiles(Account, true /* lock */, w.blockNum, addr, trace) - } else { - original = prevV[4:] - } - if bytes.Equal(account, original) { - // No change - return nil - } - v := make([]byte, 4+len(account)) - binary.BigEndian.PutUint32(v[:4], prevNum+1) - copy(v[4:], account) - if err = w.tx.Put(kv.StateAccounts, addr, v); err != nil { - return err - } - if prevV == nil && len(original) == 0 { - w.changes[Account].insert(addr, account) - } else { - w.changes[Account].update(addr, original, account) - } - if trace { - w.a.trace = true - w.a.tracedKeys[string(addr)] = struct{}{} - } - return nil -} - -func (w *Writer) UpdateAccountCode(addr []byte, code []byte, trace bool) error { - var prevNum uint32 - prevV, err := w.tx.GetOne(kv.StateCode, addr) - if err != nil { - return err - } - if prevV != nil { - prevNum = binary.BigEndian.Uint32(prevV[:4]) - } - var original []byte - if prevV == nil { - original, _ = w.a.readFromFiles(Code, true /* lock */, w.blockNum, addr, trace) - } else { - original = prevV[4:] - } - v := make([]byte, 4+len(code)) - binary.BigEndian.PutUint32(v[:4], prevNum+1) - copy(v[4:], code) - if err = w.tx.Put(kv.StateCode, addr, v); err != nil { - return err - } - if prevV == nil && len(original) == 0 { - w.changes[Code].insert(addr, code) - } else { - w.changes[Code].update(addr, original, code) - } - if trace { - w.a.trace = true - w.a.tracedKeys[string(addr)] = struct{}{} - } - return nil -} - -type CursorType uint8 - -const ( - FILE_CURSOR CursorType = iota - DB_CURSOR - TREE_CURSOR -) - -// CursorItem is the item in the priority queue used to do merge interation -// over storage of a given account -type CursorItem struct { - c kv.Cursor - dg *compress.Getter - tree *btree.BTreeG[*AggregateItem] - key []byte - val []byte - endBlock uint64 - t CursorType // Whether this item represents state file or DB record, or tree -} - -type CursorHeap []*CursorItem - -func (ch CursorHeap) Len() int { - return len(ch) -} - -func (ch CursorHeap) Less(i, j int) bool { - cmp := bytes.Compare(ch[i].key, ch[j].key) - if cmp == 0 { - // when keys match, the items with later blocks are preferred - return ch[i].endBlock > ch[j].endBlock - } - return cmp < 0 -} - -func (ch *CursorHeap) Swap(i, j int) { - (*ch)[i], (*ch)[j] = (*ch)[j], (*ch)[i] -} - -func (ch *CursorHeap) Push(x interface{}) { - *ch = append(*ch, x.(*CursorItem)) -} - -func (ch *CursorHeap) Pop() interface{} { - old := *ch - n := len(old) - x := old[n-1] - old[n-1] = nil - *ch = old[0 : n-1] - return x -} - -func (w *Writer) deleteAccount(addr []byte, trace bool) (bool, error) { - prevV, err := w.tx.GetOne(kv.StateAccounts, addr) - if err != nil { - return false, err - } - var prevNum uint32 - if prevV != nil { - prevNum = binary.BigEndian.Uint32(prevV[:4]) - } - var original []byte - if prevV == nil { - original, _ = w.a.readFromFiles(Account, 
true /* lock */, w.blockNum, addr, trace) - if original == nil { - return false, nil - } - } else { - original = prevV[4:] - } - v := make([]byte, 4) - binary.BigEndian.PutUint32(v[:4], prevNum+1) - if err = w.tx.Put(kv.StateAccounts, addr, v); err != nil { - return false, err - } - w.changes[Account].delete(addr, original) - return true, nil -} - -func (w *Writer) deleteCode(addr []byte, trace bool) error { - prevV, err := w.tx.GetOne(kv.StateCode, addr) - if err != nil { - return err - } - var prevNum uint32 - if prevV != nil { - prevNum = binary.BigEndian.Uint32(prevV[:4]) - } - var original []byte - if prevV == nil { - original, _ = w.a.readFromFiles(Code, true /* lock */, w.blockNum, addr, trace) - if original == nil { - // Nothing to do - return nil - } - } else { - original = prevV[4:] - } - v := make([]byte, 4) - binary.BigEndian.PutUint32(v[:4], prevNum+1) - if err = w.tx.Put(kv.StateCode, addr, v); err != nil { - return err - } - w.changes[Code].delete(addr, original) - return nil -} - -func (w *Writer) DeleteAccount(addr []byte, trace bool) error { - deleted, err := w.deleteAccount(addr, trace) - if err != nil { - return err - } - if !deleted { - return nil - } - w.a.fileLocks[Storage].RLock() - defer w.a.fileLocks[Storage].RUnlock() - w.deleteCode(addr, trace) - // Find all storage items for this address - var cp CursorHeap - heap.Init(&cp) - var c kv.Cursor - if c, err = w.tx.Cursor(kv.StateStorage); err != nil { - return err - } - defer c.Close() - var k, v []byte - if k, v, err = c.Seek(addr); err != nil { - return err - } - if k != nil && bytes.HasPrefix(k, addr) { - heap.Push(&cp, &CursorItem{t: DB_CURSOR, key: common.Copy(k), val: common.Copy(v), c: c, endBlock: w.blockNum}) - } - w.a.files[Storage].Ascend(func(i btree.Item) bool { - item := i.(*byEndBlockItem) - if item.tree != nil { - item.tree.AscendGreaterOrEqual(&AggregateItem{k: addr}, func(aitem *AggregateItem) bool { - if !bytes.HasPrefix(aitem.k, addr) { - return false - } - if len(aitem.k) == len(addr) { - return true - } - heap.Push(&cp, &CursorItem{t: TREE_CURSOR, key: aitem.k, val: aitem.v, tree: item.tree, endBlock: item.endBlock}) - return false - }) - return true - } - if item.index.Empty() { - return true - } - offset := item.indexReader.Lookup(addr) - g := item.getter - g.Reset(offset) - if g.HasNext() { - if keyMatch, _ := g.Match(addr); !keyMatch { - //fmt.Printf("DeleteAccount %x - not found anchor in file [%d-%d]\n", addr, item.startBlock, item.endBlock) - return true - } - g.Skip() - } - if g.HasNext() { - key, _ := g.Next(nil) - if bytes.HasPrefix(key, addr) { - val, _ := g.Next(nil) - heap.Push(&cp, &CursorItem{t: FILE_CURSOR, key: key, val: val, dg: g, endBlock: item.endBlock}) - } - } - return true - }) - for cp.Len() > 0 { - lastKey := common.Copy(cp[0].key) - lastVal := common.Copy(cp[0].val) - // Advance all the items that have this key (including the top) - for cp.Len() > 0 && bytes.Equal(cp[0].key, lastKey) { - ci1 := cp[0] - switch ci1.t { - case FILE_CURSOR: - if ci1.dg.HasNext() { - ci1.key, _ = ci1.dg.Next(ci1.key[:0]) - if bytes.HasPrefix(ci1.key, addr) { - ci1.val, _ = ci1.dg.Next(ci1.val[:0]) - heap.Fix(&cp, 0) - } else { - heap.Pop(&cp) - } - } else { - heap.Pop(&cp) - } - case DB_CURSOR: - k, v, err = ci1.c.Next() - if err != nil { - return err - } - if k != nil && bytes.HasPrefix(k, addr) { - ci1.key = common.Copy(k) - ci1.val = common.Copy(v) - heap.Fix(&cp, 0) - } else { - heap.Pop(&cp) - } - case TREE_CURSOR: - skip := true - var aitem *AggregateItem - 
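The DeleteAccount logic around this point finds every storage slot of the contract purely by key prefix, in the database cursor, the in-memory trees and the state files alike: a storage key is the 20-byte address followed by the 32-byte location, so all slots of a contract sort directly after the bare address (which itself appears only as the special marker key with an empty value). A tiny sketch of that key layout and ownership test, with illustrative helper names:

package main

import (
	"bytes"
	"fmt"
)

// storageKey builds the composite storage key: 20-byte address ++ 32-byte location,
// exactly how ReadAccountStorage and WriteAccountStorage compose dbkey.
func storageKey(addr, loc []byte) []byte {
	k := make([]byte, len(addr)+len(loc))
	copy(k, addr)
	copy(k[len(addr):], loc)
	return k
}

// belongsTo reports whether key is a storage slot of addr: it must have the address as
// a prefix and be longer than the bare address, which is reserved for the marker entry.
func belongsTo(key, addr []byte) bool {
	return bytes.HasPrefix(key, addr) && len(key) > len(addr)
}

func main() {
	addr := bytes.Repeat([]byte{0xaa}, 20)
	loc := bytes.Repeat([]byte{0x01}, 32)
	k := storageKey(addr, loc)
	fmt.Println(len(k), belongsTo(k, addr), belongsTo(addr, addr)) // 52 true false
}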
ci1.tree.AscendGreaterOrEqual(&AggregateItem{k: ci1.key}, func(ai *AggregateItem) bool { - if skip { - skip = false - return true - } - aitem = ai - return false - }) - if aitem != nil && bytes.HasPrefix(aitem.k, addr) { - ci1.key = aitem.k - ci1.val = aitem.v - heap.Fix(&cp, 0) - } else { - heap.Pop(&cp) - } - } - } - var prevV []byte - prevV, err = w.tx.GetOne(kv.StateStorage, lastKey) - if err != nil { - return err - } - var prevNum uint32 - if prevV != nil { - prevNum = binary.BigEndian.Uint32(prevV[:4]) - } - v = make([]byte, 4) - binary.BigEndian.PutUint32(v[:4], prevNum+1) - if err = w.tx.Put(kv.StateStorage, lastKey, v); err != nil { - return err - } - w.changes[Storage].delete(lastKey, lastVal) - } - if trace { - w.a.trace = true - w.a.tracedKeys[string(addr)] = struct{}{} - } - return nil -} - -func (w *Writer) WriteAccountStorage(addr, loc []byte, value []byte, trace bool) error { - dbkey := make([]byte, len(addr)+len(loc)) - copy(dbkey[0:], addr) - copy(dbkey[len(addr):], loc) - prevV, err := w.tx.GetOne(kv.StateStorage, dbkey) - if err != nil { - return err - } - var prevNum uint32 - if prevV != nil { - prevNum = binary.BigEndian.Uint32(prevV[:4]) - } - var original []byte - if prevV == nil { - original, _ = w.a.readFromFiles(Storage, true /* lock */, w.blockNum, dbkey, trace) - } else { - original = prevV[4:] - } - if bytes.Equal(value, original) { - // No change - return nil - } - v := make([]byte, 4+len(value)) - binary.BigEndian.PutUint32(v[:4], prevNum+1) - copy(v[4:], value) - if err = w.tx.Put(kv.StateStorage, dbkey, v); err != nil { - return err - } - if prevV == nil && len(original) == 0 { - w.changes[Storage].insert(dbkey, value) - } else { - w.changes[Storage].update(dbkey, original, value) - } - if trace { - w.a.trace = true - w.a.tracedKeys[string(dbkey)] = struct{}{} - } - return nil -} - -// findLargestMerge looks through the state files of the speficied type and determines the largest merge that can be undertaken -// a state file block [a; b] is valid if its length is a divisor of its starting block, or `(b-a+1) = 0 mod a` -func (a *Aggregator) findLargestMerge(fType FileType, maxTo uint64, maxSpan uint64) (toAggregate []*byEndBlockItem, pre []*byEndBlockItem, post []*byEndBlockItem, aggFrom uint64, aggTo uint64) { - a.fileLocks[fType].RLock() - defer a.fileLocks[fType].RUnlock() - var maxEndBlock uint64 - a.files[fType].DescendLessOrEqual(&byEndBlockItem{endBlock: maxTo}, func(i btree.Item) bool { - item := i.(*byEndBlockItem) - if item.decompressor == nil { - return true - } - maxEndBlock = item.endBlock - return false - }) - if maxEndBlock == 0 { - return - } - a.files[fType].Ascend(func(i btree.Item) bool { - item := i.(*byEndBlockItem) - if item.decompressor == nil { - return true // Skip B-tree based items - } - pre = append(pre, item) - if aggTo == 0 { - var doubleEnd uint64 - nextDouble := item.endBlock - for nextDouble <= maxEndBlock && nextDouble-item.startBlock < maxSpan { - doubleEnd = nextDouble - nextDouble = doubleEnd + (doubleEnd - item.startBlock) + 1 - } - if doubleEnd != item.endBlock { - aggFrom = item.startBlock - aggTo = doubleEnd - } else { - post = append(post, item) - return true - } - } - toAggregate = append(toAggregate, item) - return item.endBlock < aggTo - }) - return -} - -func (a *Aggregator) computeAggregation(fType FileType, - toAggregate []*byEndBlockItem, aggFrom uint64, aggTo uint64, - valTransform func(val, transValBuf commitment.BranchData) ([]byte, error), - mergeFunc commitmentMerger, - valCompressed bool, - withIndex 
bool, prefixLen int) (*byEndBlockItem, error) { - var item2 = &byEndBlockItem{startBlock: aggFrom, endBlock: aggTo} - var cp CursorHeap - heap.Init(&cp) - for _, ag := range toAggregate { - g := ag.decompressor.MakeGetter() - g.Reset(0) - if g.HasNext() { - key, _ := g.Next(nil) - val, _ := g.Next(nil) - heap.Push(&cp, &CursorItem{t: FILE_CURSOR, dg: g, key: key, val: val, endBlock: ag.endBlock}) - } - } - var err error - var count int - if item2.decompressor, count, err = a.mergeIntoStateFile(&cp, prefixLen, fType, aggFrom, aggTo, a.diffDir, valTransform, mergeFunc, valCompressed); err != nil { - return nil, fmt.Errorf("mergeIntoStateFile %s [%d-%d]: %w", fType.String(), aggFrom, aggTo, err) - } - item2.getter = item2.decompressor.MakeGetter() - item2.getterMerge = item2.decompressor.MakeGetter() - if withIndex { - idxPath := filepath.Join(a.diffDir, fmt.Sprintf("%s.%d-%d.idx", fType.String(), aggFrom, aggTo)) - if item2.index, err = buildIndex(item2.decompressor, idxPath, a.diffDir, count); err != nil { - return nil, fmt.Errorf("mergeIntoStateFile buildIndex %s [%d-%d]: %w", fType.String(), aggFrom, aggTo, err) - } - item2.indexReader = recsplit.NewIndexReader(item2.index) - item2.readerMerge = recsplit.NewIndexReader(item2.index) - } - return item2, nil -} - -func createDatAndIndex(treeName string, diffDir string, bt *btree.BTreeG[*AggregateItem], blockFrom uint64, blockTo uint64) (*compress.Decompressor, *recsplit.Index, error) { - datPath := filepath.Join(diffDir, fmt.Sprintf("%s.%d-%d.dat", treeName, blockFrom, blockTo)) - idxPath := filepath.Join(diffDir, fmt.Sprintf("%s.%d-%d.idx", treeName, blockFrom, blockTo)) - count, err := btreeToFile(bt, datPath, diffDir, false /* trace */, 1 /* workers */) - if err != nil { - return nil, nil, fmt.Errorf("createDatAndIndex %s build btree: %w", treeName, err) - } - var d *compress.Decompressor - if d, err = compress.NewDecompressor(datPath); err != nil { - return nil, nil, fmt.Errorf("createDatAndIndex %s decompressor: %w", treeName, err) - } - var index *recsplit.Index - if index, err = buildIndex(d, idxPath, diffDir, count); err != nil { - return nil, nil, fmt.Errorf("createDatAndIndex %s buildIndex: %w", treeName, err) - } - return d, index, nil -} - -func (a *Aggregator) addLocked(fType FileType, item *byEndBlockItem) { - a.fileLocks[fType].Lock() - defer a.fileLocks[fType].Unlock() - a.files[fType].ReplaceOrInsert(item) -} - -func (w *Writer) aggregateUpto(blockFrom, blockTo uint64) error { - // React on any previous error of aggregation or merge - select { - case err := <-w.a.aggError: - return err - case err := <-w.a.mergeError: - return err - case err := <-w.a.historyError: - return err - default: - } - typesLimit := Commitment - if w.a.commitments { - typesLimit = AccountHistory - } - t0 := time.Now() - t := time.Now() - i := w.a.changesBtree.Get(&ChangesItem{startBlock: blockFrom, endBlock: blockTo}) - if i == nil { - return fmt.Errorf("did not find change files for [%d-%d], w.a.changesBtree.Len() = %d", blockFrom, blockTo, w.a.changesBtree.Len()) - } - item := i.(*ChangesItem) - if item.startBlock != blockFrom { - return fmt.Errorf("expected change files[%d-%d], got [%d-%d]", blockFrom, blockTo, item.startBlock, item.endBlock) - } - w.a.changesBtree.Delete(i) - var aggTask AggregationTask - for fType := FirstType; fType < typesLimit; fType++ { - aggTask.changes[fType].Init(fType.String(), w.a.aggregationStep, w.a.diffDir, w.a.changesets && fType != Commitment) - } - var err error - for fType := FirstType; fType < typesLimit; fType++ 
{ - var prefixLen int - if fType == Storage { - prefixLen = length.Addr - } - - var commitMerger commitmentMerger - if fType == Commitment { - commitMerger = mergeCommitments - } - - if aggTask.bt[fType], err = aggTask.changes[fType].aggregate(blockFrom, blockTo, prefixLen, w.tx, fType.Table(), commitMerger); err != nil { - return fmt.Errorf("aggregate %sChanges: %w", fType.String(), err) - } - } - aggTask.blockFrom = blockFrom - aggTask.blockTo = blockTo - aggTime := time.Since(t) - t = time.Now() - // At this point, all the changes are gathered in 4 B-trees (accounts, code, storage and commitment) and removed from the database - // What follows can be done in the 1st background goroutine - for fType := FirstType; fType < typesLimit; fType++ { - if fType < NumberOfStateTypes { - w.a.updateArch(aggTask.bt[fType], fType, uint32(aggTask.blockTo)) - } - } - updateArchTime := time.Since(t) - t = time.Now() - for fType := FirstType; fType < typesLimit; fType++ { - w.a.addLocked(fType, &byEndBlockItem{startBlock: aggTask.blockFrom, endBlock: aggTask.blockTo, tree: aggTask.bt[fType]}) - } - switchTime := time.Since(t) - w.a.aggChannel <- &aggTask - handoverTime := time.Since(t0) - if handoverTime > time.Second { - log.Info("Long handover to background aggregation", "from", blockFrom, "to", blockTo, "composition", aggTime, "arch update", updateArchTime, "switch", switchTime) - } - return nil -} - -// mergeIntoStateFile assumes that all entries in the cp heap have type FILE_CURSOR -func (a *Aggregator) mergeIntoStateFile(cp *CursorHeap, prefixLen int, - fType FileType, startBlock, endBlock uint64, dir string, - valTransform func(val, transValBuf commitment.BranchData) ([]byte, error), - mergeFunc commitmentMerger, - valCompressed bool, -) (*compress.Decompressor, int, error) { - datPath := filepath.Join(dir, fmt.Sprintf("%s.%d-%d.dat", fType.String(), startBlock, endBlock)) - comp, err := compress.NewCompressor(context.Background(), AggregatorPrefix, datPath, dir, compress.MinPatternScore, 1, log.LvlDebug) - if err != nil { - return nil, 0, fmt.Errorf("compressor %s: %w", datPath, err) - } - defer comp.Close() - count := 0 - // In the loop below, the pair `keyBuf=>valBuf` is always 1 item behind `lastKey=>lastVal`. - // `lastKey` and `lastVal` are taken from the top of the multi-way merge (assisted by the CursorHeap cp), but not processed right away - // instead, the pair from the previous iteration is processed first - `keyBuf=>valBuf`. After that, `keyBuf` and `valBuf` are assigned - // to `lastKey` and `lastVal` correspondingly, and the next step of multi-way merge happens. 
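As the surrounding comment explains, the merge keeps the pair `keyBuf=>valBuf` one step behind the pair just taken off the heap: whether the reserved pair is written can depend on the key that follows it (a contract-address marker with an empty value, for example, is only worth keeping if storage keys with that prefix come next), and the pair still held back when the heap empties has to be flushed separately. A minimal sketch of that lagged emission over an already merged stream, with illustrative names:

package main

import "fmt"

type pair struct{ k, v string }

// emitLagged writes pairs one step behind the read position: keepPrev may inspect the
// next pair before the reserved one is emitted, and the last reserved pair is flushed
// after the loop, mirroring the trailing keyBuf/valBuf handling described here.
func emitLagged(merged []pair, keepPrev func(prev, cur pair) bool, emit func(pair)) {
	var prev pair
	havePrev := false
	for _, cur := range merged {
		if havePrev && keepPrev(prev, cur) {
			emit(prev)
		}
		prev, havePrev = cur, true
	}
	if havePrev {
		emit(prev) // the pair that was still held back when the input ran out
	}
}

func main() {
	stream := []pair{{"addr1", ""}, {"addr1/slot1", "v1"}, {"addr2", ""}, {"addr3", "acc"}}
	// Keep an empty-valued marker only if the next key extends it, loosely following the storage rule.
	keep := func(prev, cur pair) bool {
		return prev.v != "" || (len(cur.k) > len(prev.k) && cur.k[:len(prev.k)] == prev.k)
	}
	emitLagged(stream, keep, func(p pair) { fmt.Println(p.k, "=>", p.v) })
	// addr2's bare marker is dropped because no key extends it; addr1's marker survives.
}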
Therefore, after the multi-way merge loop - // (when CursorHeap cp is empty), there is a need to process the last pair `keyBuf=>valBuf`, because it was one step behind - var keyBuf, valBuf, transValBuf []byte - for cp.Len() > 0 { - lastKey := common.Copy((*cp)[0].key) - lastVal := common.Copy((*cp)[0].val) - var mergedOnce bool - if a.trace { - if _, ok := a.tracedKeys[string(lastKey)]; ok { - fmt.Printf("looking at key %x val [%x] endBlock %d to merge into [%d-%d]\n", lastKey, lastVal, (*cp)[0].endBlock, startBlock, endBlock) - } - } - // Advance all the items that have this key (including the top) - for cp.Len() > 0 && bytes.Equal((*cp)[0].key, lastKey) { - ci1 := (*cp)[0] - if a.trace { - if _, ok := a.tracedKeys[string(ci1.key)]; ok { - fmt.Printf("skipping same key %x val [%x] endBlock %d to merge into [%d-%d]\n", ci1.key, ci1.val, ci1.endBlock, startBlock, endBlock) - } - } - if ci1.t != FILE_CURSOR { - return nil, 0, fmt.Errorf("mergeIntoStateFile: cursor of unexpected type: %d", ci1.t) - } - if mergedOnce { - //fmt.Printf("mergeIntoStateFile pre-merge prefix [%x], [%x]+[%x]\n", commitment.CompactToHex(lastKey), ci1.val, lastVal) - if lastVal, err = mergeFunc(ci1.val, lastVal, nil); err != nil { - return nil, 0, fmt.Errorf("mergeIntoStateFile: merge values: %w", err) - } - //fmt.Printf("mergeIntoStateFile post-merge prefix [%x], [%x]\n", commitment.CompactToHex(lastKey), lastVal) - } else { - mergedOnce = true - } - if ci1.dg.HasNext() { - ci1.key, _ = ci1.dg.Next(ci1.key[:0]) - if valCompressed { - ci1.val, _ = ci1.dg.Next(ci1.val[:0]) - } else { - ci1.val, _ = ci1.dg.NextUncompressed() - } - - heap.Fix(cp, 0) - } else { - heap.Pop(cp) - } - } - var skip bool - switch fType { - case Storage: - // Inside storage files, there is a special item with empty value, and the key equal to the contract's address - // This special item is inserted before the contract storage items, in order to find them using un-ordered index - // (for the purposes of SELF-DESTRUCT and some RPC methods that require enumeration of contract storage) - // We will only skip this special item if there are no more corresponding storage items left - // (this is checked further down with `bytes.HasPrefix(lastKey, keyBuf)`) - skip = startBlock == 0 && len(lastVal) == 0 && len(lastKey) != prefixLen - case Commitment: - // For commitments, the 3rd and 4th bytes of the value (zero-based 2 and 3) contain so-called `afterMap` - // Its bit are set for children that are present in the tree, and unset for those that are not (deleted, for example) - // If all bits are zero (check below), this branch can be skipped, since it is empty - skip = startBlock == 0 && len(lastVal) >= 4 && lastVal[2] == 0 && lastVal[3] == 0 - case AccountHistory, StorageHistory, CodeHistory: - skip = false - default: - // For the rest of types, empty value means deletion - skip = startBlock == 0 && len(lastVal) == 0 - } - if skip { // Deleted marker can be skipped if we merge into the first file, except for the storage addr marker - if _, ok := a.tracedKeys[string(keyBuf)]; ok { - fmt.Printf("skipped key %x for [%d-%d]\n", keyBuf, startBlock, endBlock) - } - } else { - // The check `bytes.HasPrefix(lastKey, keyBuf)` is checking whether the `lastKey` is the first item - // of some contract's storage, and `keyBuf` (the item just before that) is the special item with the - // key being contract's address. 
If so, the special item (keyBuf => []) needs to be preserved - if keyBuf != nil && (prefixLen == 0 || len(keyBuf) != prefixLen || bytes.HasPrefix(lastKey, keyBuf)) { - if err = comp.AddWord(keyBuf); err != nil { - return nil, 0, err - } - if a.trace { - if _, ok := a.tracedKeys[string(keyBuf)]; ok { - fmt.Printf("merge key %x val [%x] into [%d-%d]\n", keyBuf, valBuf, startBlock, endBlock) - } - } - count++ // Only counting keys, not values - if valTransform != nil { - if transValBuf, err = valTransform(valBuf, transValBuf[:0]); err != nil { - return nil, 0, fmt.Errorf("mergeIntoStateFile -valTransform [%x]: %w", valBuf, err) - } - - if err = comp.AddWord(transValBuf); err != nil { - return nil, 0, err - } - } else if valCompressed { - if err = comp.AddWord(valBuf); err != nil { - return nil, 0, err - } - } else { - if err = comp.AddUncompressedWord(valBuf); err != nil { - return nil, 0, err - } - } - //if fType == Storage { - // fmt.Printf("merge %s.%d-%d [%x]=>[%x]\n", fType.String(), startBlock, endBlock, keyBuf, valBuf) - //} - } - - keyBuf = append(keyBuf[:0], lastKey...) - valBuf = append(valBuf[:0], lastVal...) - } - } - if keyBuf != nil { - if err = comp.AddWord(keyBuf); err != nil { - return nil, 0, err - } - if a.trace { - if _, ok := a.tracedKeys[string(keyBuf)]; ok { - fmt.Printf("merge key %x val [%x] into [%d-%d]\n", keyBuf, valBuf, startBlock, endBlock) - } - } - count++ // Only counting keys, not values - if valTransform != nil { - if transValBuf, err = valTransform(valBuf, transValBuf[:0]); err != nil { - return nil, 0, fmt.Errorf("mergeIntoStateFile valTransform [%x]: %w", valBuf, err) - } - if err = comp.AddWord(transValBuf); err != nil { - return nil, 0, err - } - } else if valCompressed { - if err = comp.AddWord(valBuf); err != nil { - return nil, 0, err - } - } else { - if err = comp.AddUncompressedWord(valBuf); err != nil { - return nil, 0, err - } - } - //if fType == Storage { - // fmt.Printf("merge %s.%d-%d [%x]=>[%x]\n", fType.String(), startBlock, endBlock, keyBuf, valBuf) - //} - } - if err = comp.Compress(); err != nil { - return nil, 0, err - } - var d *compress.Decompressor - if d, err = compress.NewDecompressor(datPath); err != nil { - return nil, 0, fmt.Errorf("decompressor: %w", err) - } - return d, count, nil -} - -func (a *Aggregator) stats(fType FileType) (count int, datSize, idxSize int64) { - a.fileLocks[fType].RLock() - defer a.fileLocks[fType].RUnlock() - count = 0 - datSize = 0 - idxSize = 0 - a.files[fType].Ascend(func(i btree.Item) bool { - item := i.(*byEndBlockItem) - if item.decompressor != nil { - count++ - datSize += item.decompressor.Size() - count++ - idxSize += item.index.Size() - } - return true - }) - return -} - -type FilesStats struct { - AccountsCount int - AccountsDatSize int64 - AccountsIdxSize int64 - CodeCount int - CodeDatSize int64 - CodeIdxSize int64 - StorageCount int - StorageDatSize int64 - StorageIdxSize int64 - CommitmentCount int - CommitmentDatSize int64 - CommitmentIdxSize int64 - Hits uint64 - Misses uint64 -} - -func (a *Aggregator) Stats() FilesStats { - var fs FilesStats - fs.AccountsCount, fs.AccountsDatSize, fs.AccountsIdxSize = a.stats(Account) - fs.CodeCount, fs.CodeDatSize, fs.CodeIdxSize = a.stats(Code) - fs.StorageCount, fs.StorageDatSize, fs.StorageIdxSize = a.stats(Storage) - fs.CommitmentCount, fs.CommitmentDatSize, fs.CommitmentIdxSize = a.stats(Commitment) - fs.Hits = atomic.LoadUint64(&a.fileHits) - fs.Misses = atomic.LoadUint64(&a.fileMisses) - return fs -} diff --git a/aggregator/aggregator_test.go 
b/aggregator/aggregator_test.go deleted file mode 100644 index 6a7f396bf..000000000 --- a/aggregator/aggregator_test.go +++ /dev/null @@ -1,314 +0,0 @@ -/* - Copyright 2022 Erigon contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package aggregator - -import ( - "bytes" - "encoding/binary" - "testing" - - "github.com/holiman/uint256" - - "github.com/ledgerwatch/erigon-lib/commitment" - "github.com/ledgerwatch/erigon-lib/kv/memdb" -) - -func int160(i uint64) []byte { - b := make([]byte, 20) - binary.BigEndian.PutUint64(b[12:], i) - return b -} - -func int256(i uint64) []byte { - b := make([]byte, 32) - binary.BigEndian.PutUint64(b[24:], i) - return b -} - -func accountWithBalance(i uint64) []byte { - balance := uint256.NewInt(i) - var l int - l++ - l++ - if i > 0 { - l += balance.ByteLen() - } - l++ - l++ - value := make([]byte, l) - pos := 0 - value[pos] = 0 - pos++ - if balance.IsZero() { - value[pos] = 0 - pos++ - } else { - balanceBytes := balance.ByteLen() - value[pos] = byte(balanceBytes) - pos++ - balance.WriteToSlice(value[pos : pos+balanceBytes]) - pos += balanceBytes - } - value[pos] = 0 - pos++ - value[pos] = 0 - return value -} - -func TestSimpleAggregator(t *testing.T) { - _, rwTx := memdb.NewTestTx(t) - - tmpDir := t.TempDir() - trie := commitment.InitializeTrie(commitment.VariantHexPatriciaTrie) - a, err := NewAggregator(tmpDir, 16, 4, true, true, 1000, trie, rwTx) - if err != nil { - t.Fatal(err) - } - defer a.Close() - w := a.MakeStateWriter(true /* beforeOn */) - if err = w.Reset(0, rwTx); err != nil { - t.Fatal(err) - } - defer w.Close() - var account1 = accountWithBalance(1) - w.UpdateAccountData(int160(1), account1, false /* trace */) - if err = w.FinishTx(0, false); err != nil { - t.Fatal(err) - } - if err = w.Aggregate(false /* trace */); err != nil { - t.Fatal(err) - } - r := a.MakeStateReader(2, rwTx) - acc, err := r.ReadAccountData(int160(1), false /* trace */) - if err != nil { - t.Fatal(err) - } - if !bytes.Equal(acc, account1) { - t.Errorf("read account %x, expected account %x", acc, account1) - } - if err = rwTx.Commit(); err != nil { - t.Fatal(err) - } -} - -func TestLoopAggregator(t *testing.T) { - _, rwTx := memdb.NewTestTx(t) - - tmpDir := t.TempDir() - trie := commitment.InitializeTrie(commitment.VariantHexPatriciaTrie) - a, err := NewAggregator(tmpDir, 16, 4, true, true, 1000, trie, rwTx) - if err != nil { - t.Fatal(err) - } - defer a.Close() - var account1 = accountWithBalance(1) - w := a.MakeStateWriter(true /* beforeOn */) - defer w.Close() - for blockNum := uint64(0); blockNum < 1000; blockNum++ { - accountKey := int160(blockNum/10 + 1) - //fmt.Printf("blockNum = %d\n", blockNum) - if err = w.Reset(blockNum, rwTx); err != nil { - t.Fatal(err) - } - w.UpdateAccountData(accountKey, account1, false /* trace */) - if err = w.FinishTx(blockNum, false /* trace */); err != nil { - t.Fatal(err) - } - if err = w.Aggregate(false /* trace */); err != nil { - t.Fatal(err) - } - r := a.MakeStateReader(blockNum+1, rwTx) - acc, err := r.ReadAccountData(accountKey, 
false /* trace */) - if err != nil { - t.Fatal(err) - } - if !bytes.Equal(acc, account1) { - t.Errorf("read account %x, expected account %x for block %d", acc, account1, blockNum) - } - account1 = accountWithBalance(blockNum + 2) - } - if err = rwTx.Commit(); err != nil { - t.Fatal(err) - } -} - -func TestRecreateAccountWithStorage(t *testing.T) { - _, rwTx := memdb.NewTestTx(t) - tmpDir := t.TempDir() - - trie := commitment.InitializeTrie(commitment.VariantHexPatriciaTrie) - a, err := NewAggregator(tmpDir, 16, 4, true, true, 1000, trie, rwTx) - if err != nil { - t.Fatal(err) - } - defer a.Close() - accountKey := int160(1) - var account1 = accountWithBalance(1) - var account2 = accountWithBalance(2) - w := a.MakeStateWriter(true /* beforeOn */) - defer w.Close() - for blockNum := uint64(0); blockNum < 100; blockNum++ { - if err = w.Reset(blockNum, rwTx); err != nil { - t.Fatal(err) - } - switch blockNum { - case 1: - w.UpdateAccountData(accountKey, account1, false /* trace */) - for s := uint64(0); s < 100; s++ { - w.WriteAccountStorage(accountKey, int256(s), uint256.NewInt(s+1).Bytes(), false /* trace */) - } - case 22: - w.DeleteAccount(accountKey, false /* trace */) - case 45: - w.UpdateAccountData(accountKey, account2, false /* trace */) - for s := uint64(50); s < 150; s++ { - w.WriteAccountStorage(accountKey, int256(s), uint256.NewInt(2*s+1).Bytes(), false /* trace */) - } - } - if err = w.FinishTx(blockNum, false /* trace */); err != nil { - t.Fatal(err) - } - if err = w.Aggregate(false /* trace */); err != nil { - t.Fatal(err) - } - r := a.MakeStateReader(blockNum+1, rwTx) - switch blockNum { - case 1: - acc, err := r.ReadAccountData(accountKey, false /* trace */) - if err != nil { - t.Fatal(err) - } - if !bytes.Equal(account1, acc) { - t.Errorf("wrong account after block %d, expected %x, got %x", blockNum, account1, acc) - } - for s := uint64(0); s < 100; s++ { - v, err := r.ReadAccountStorage(accountKey, int256(s), false /* trace */) - if err != nil { - t.Fatal(err) - } - if !uint256.NewInt(s + 1).Eq(uint256.NewInt(0).SetBytes(v)) { - t.Errorf("wrong storage value after block %d, expected %d, got %d", blockNum, s+1, uint256.NewInt(0).SetBytes(v)) - } - } - case 22, 44: - acc, err := r.ReadAccountData(accountKey, false /* trace */) - if err != nil { - t.Fatal(err) - } - if len(acc) > 0 { - t.Errorf("wrong account after block %d, expected nil, got %x", blockNum, acc) - } - for s := uint64(0); s < 100; s++ { - v, err := r.ReadAccountStorage(accountKey, int256(s), false /* trace */) - if err != nil { - t.Fatal(err) - } - if v != nil { - t.Errorf("wrong storage value after block %d, expected nil, got %d", blockNum, uint256.NewInt(0).SetBytes(v)) - } - } - case 66: - acc, err := r.ReadAccountData(accountKey, false /* trace */) - if err != nil { - t.Fatal(err) - } - if !bytes.Equal(account2, acc) { - t.Errorf("wrong account after block %d, expected %x, got %x", blockNum, account1, acc) - } - for s := uint64(0); s < 150; s++ { - v, err := r.ReadAccountStorage(accountKey, int256(s), false /* trace */) - if err != nil { - t.Fatal(err) - } - if s < 50 { - if v != nil { - t.Errorf("wrong storage value after block %d, expected nil, got %d", blockNum, uint256.NewInt(0).SetBytes(v)) - } - } else if v == nil || !uint256.NewInt(2*s+1).Eq(uint256.NewInt(0).SetBytes(v)) { - t.Errorf("wrong storage value after block %d, expected %d, got %d", blockNum, 2*s+1, uint256.NewInt(0).SetBytes(v)) - } - } - } - } - if err = rwTx.Commit(); err != nil { - t.Fatal(err) - } -} - -func TestChangeCode(t *testing.T) 
{ - _, rwTx := memdb.NewTestTx(t) - - tmpDir := t.TempDir() - trie := commitment.InitializeTrie(commitment.VariantHexPatriciaTrie) - a, err := NewAggregator(tmpDir, 16, 4, true, true, 1000, trie, rwTx) - if err != nil { - t.Fatal(err) - } - defer a.Close() - accountKey := int160(1) - var account1 = accountWithBalance(1) - var code1 = []byte("This is the code number 1") - w := a.MakeStateWriter(true /* beforeOn */) - defer w.Close() - for blockNum := uint64(0); blockNum < 100; blockNum++ { - if err = w.Reset(blockNum, rwTx); err != nil { - t.Fatal(err) - } - switch blockNum { - case 1: - w.UpdateAccountData(accountKey, account1, false /* trace */) - w.UpdateAccountCode(accountKey, code1, false /* trace */) - case 25: - w.DeleteAccount(accountKey, false /* trace */) - } - if err = w.FinishTx(blockNum, false /* trace */); err != nil { - t.Fatal(err) - } - if err = w.Aggregate(false /* trace */); err != nil { - t.Fatal(err) - } - r := a.MakeStateReader(blockNum+1, rwTx) - switch blockNum { - case 22: - acc, err := r.ReadAccountData(accountKey, false /* trace */) - if err != nil { - t.Fatal(err) - } - if !bytes.Equal(account1, acc) { - t.Errorf("wrong account after block %d, expected %x, got %x", blockNum, account1, acc) - } - code, err := r.ReadAccountCode(accountKey, false /* trace */) - if err != nil { - t.Fatal(err) - } - if !bytes.Equal(code1, code) { - t.Errorf("wrong code after block %d, expected %x, got %x", blockNum, code1, code) - } - case 47: - code, err := r.ReadAccountCode(accountKey, false /* trace */) - if err != nil { - t.Fatal(err) - } - if code != nil { - t.Errorf("wrong code after block %d, expected nil, got %x", blockNum, code) - } - } - } - if err = rwTx.Commit(); err != nil { - t.Fatal(err) - } -} diff --git a/aggregator/history.go b/aggregator/history.go deleted file mode 100644 index b45c83462..000000000 --- a/aggregator/history.go +++ /dev/null @@ -1,354 +0,0 @@ -/* - Copyright 2022 Erigon contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package aggregator - -import ( - "encoding/binary" - "fmt" - "io/fs" - "os" - "path" - "regexp" - "strconv" - "strings" - - "github.com/google/btree" - "github.com/holiman/uint256" - "github.com/ledgerwatch/erigon-lib/compress" - "github.com/ledgerwatch/erigon-lib/recsplit" - "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" - "github.com/ledgerwatch/log/v3" -) - -// History is a utility class that allows reading history of state -// from state files, history files, and bitmap files produced by an Aggregator -type History struct { - files [NumberOfTypes]*btree.BTreeG[*byEndBlockItem] - diffDir string // Directory where the state diff files are stored - aggregationStep uint64 -} - -func NewHistory(diffDir string, blockTo uint64, aggregationStep uint64) (*History, error) { - h := &History{ - diffDir: diffDir, - aggregationStep: aggregationStep, - } - for fType := FirstType; fType < NumberOfTypes; fType++ { - h.files[fType] = btree.NewG(32, ByEndBlockItemLess) - } - var closeStateFiles = true // It will be set to false in case of success at the end of the function - defer func() { - // Clean up all decompressor and indices upon error - if closeStateFiles { - h.Close() - } - }() - // Scan the diff directory and create the mapping of end blocks to files - files, err := os.ReadDir(diffDir) - if err != nil { - return nil, err - } - h.scanStateFiles(files, blockTo) - for fType := FirstType; fType < NumberOfTypes; fType++ { - if err := h.openFiles(fType); err != nil { - return nil, fmt.Errorf("opening %s state files: %w", fType.String(), err) - } - } - closeStateFiles = false - return h, nil -} - -func (h *History) scanStateFiles(files []fs.DirEntry, blockTo uint64) { - typeStrings := make([]string, NumberOfTypes) - for fType := FileType(0); fType < NumberOfTypes; fType++ { - typeStrings[fType] = fType.String() - } - re := regexp.MustCompile("^(" + strings.Join(typeStrings, "|") + ").([0-9]+)-([0-9]+).(dat|idx)$") - var err error - for _, f := range files { - name := f.Name() - subs := re.FindStringSubmatch(name) - if len(subs) != 5 { - if len(subs) != 0 { - log.Warn("File ignored by history, more than 4 submatches", "name", name, "submatches", len(subs)) - } - continue - } - var startBlock, endBlock uint64 - if startBlock, err = strconv.ParseUint(subs[2], 10, 64); err != nil { - log.Warn("File ignored by history, parsing startBlock", "error", err, "name", name) - continue - } - if endBlock, err = strconv.ParseUint(subs[3], 10, 64); err != nil { - log.Warn("File ignored by history, parsing endBlock", "error", err, "name", name) - continue - } - if startBlock > endBlock { - log.Warn("File ignored by history, startBlock > endBlock", "name", name) - continue - } - if endBlock > blockTo { - // Only load files up to specified block - continue - } - fType, ok := ParseFileType(subs[1]) - if !ok { - log.Warn("File ignored by history, type unknown", "type", subs[1]) - } - var item = &byEndBlockItem{startBlock: startBlock, endBlock: endBlock} - var foundI *byEndBlockItem - h.files[fType].AscendGreaterOrEqual(&byEndBlockItem{startBlock: endBlock, endBlock: endBlock}, func(it *byEndBlockItem) bool { - if it.endBlock == endBlock { - foundI = it - } - return false - }) - if foundI == nil || foundI.startBlock > startBlock { - h.files[fType].ReplaceOrInsert(item) - log.Info("Load file", "name", name, "type", fType.String(), "endBlock", item.endBlock) - } - } -} - -func (h *History) openFiles(fType FileType) error { - var err error - h.files[fType].Ascend(func(item *byEndBlockItem) bool { - if 
item.decompressor, err = compress.NewDecompressor(path.Join(h.diffDir, fmt.Sprintf("%s.%d-%d.dat", fType.String(), item.startBlock, item.endBlock))); err != nil { - return false - } - if item.index, err = recsplit.OpenIndex(path.Join(h.diffDir, fmt.Sprintf("%s.%d-%d.idx", fType.String(), item.startBlock, item.endBlock))); err != nil { - return false - } - item.getter = item.decompressor.MakeGetter() - item.getterMerge = item.decompressor.MakeGetter() - item.indexReader = recsplit.NewIndexReader(item.index) - item.readerMerge = recsplit.NewIndexReader(item.index) - return true - }) - return err -} - -func (h *History) closeFiles(fType FileType) { - h.files[fType].Ascend(func(item *byEndBlockItem) bool { - if item.decompressor != nil { - item.decompressor.Close() - } - if item.index != nil { - item.index.Close() - } - return true - }) -} - -func (h *History) Close() { - // Closing state files only after background aggregation goroutine is finished - for fType := FirstType; fType < NumberOfTypes; fType++ { - h.closeFiles(fType) - } -} - -func (h *History) MakeHistoryReader() *HistoryReader { - r := &HistoryReader{ - h: h, - } - return r -} - -type HistoryReader struct { - h *History - search byEndBlockItem - blockNum uint64 - txNum uint64 - lastTx bool // Whether it is the last transaction in the block -} - -func (hr *HistoryReader) SetNums(blockNum, txNum uint64, lastTx bool) { - hr.blockNum = blockNum - hr.txNum = txNum - hr.lastTx = lastTx -} - -func (hr *HistoryReader) searchInHistory(bitmapType, historyType FileType, key []byte, trace bool) (bool, []byte, error) { - if trace { - fmt.Printf("searchInHistory %s %s [%x] blockNum %d, txNum %d\n", bitmapType.String(), historyType.String(), key, hr.blockNum, hr.txNum) - } - searchBlock := hr.blockNum - if hr.lastTx { - searchBlock++ - } - searchTx := hr.txNum - hr.search.endBlock = searchBlock - hr.search.startBlock = searchBlock - (searchBlock % 500_000) - var eliasVal []byte - var err error - var found bool - var foundTxNum uint64 - var foundEndBlock uint64 - hr.h.files[bitmapType].AscendGreaterOrEqual(&hr.search, func(item *byEndBlockItem) bool { - offset := item.indexReader.Lookup(key) - g := item.getter - g.Reset(offset) - if keyMatch, _ := g.Match(key); keyMatch { - if trace { - fmt.Printf("Found bitmap for [%x] in %s.[%d-%d]\n", key, bitmapType.String(), item.startBlock, item.endBlock) - } - eliasVal, _ = g.NextUncompressed() - ef, _ := eliasfano32.ReadEliasFano(eliasVal) - it := ef.Iterator() - if trace { - for it.HasNext() { - v, _ := it.Next() - fmt.Printf(" %d", v) - } - fmt.Printf("\n") - } - foundTxNum, found = ef.Search(searchTx) - if found { - foundEndBlock = item.endBlock - return false - } - } - // Not found, next - return true - }) - if err != nil { - return false, nil, err - } - if !found { - return false, nil, nil - } - if trace { - fmt.Printf("found in tx %d, endBlock %d\n", foundTxNum, foundEndBlock) - } - var lookupKey = make([]byte, len(key)+8) - binary.BigEndian.PutUint64(lookupKey, foundTxNum) - copy(lookupKey[8:], key) - var historyItem *byEndBlockItem - hr.search.endBlock = foundEndBlock - hr.search.startBlock = foundEndBlock - 499_999 - var ok bool - historyItem, ok = hr.h.files[historyType].Get(&hr.search) - if !ok || historyItem == nil { - return false, nil, fmt.Errorf("no %s file found for %d", historyType.String(), foundEndBlock) - } - offset := historyItem.indexReader.Lookup(lookupKey) - if trace { - fmt.Printf("Lookup [%x] in %s.[%d-%d].idx = %d\n", lookupKey, historyType.String(), historyItem.startBlock, 
historyItem.endBlock, offset) - } - historyItem.getter.Reset(offset) - v, _ := historyItem.getter.Next(nil) - return true, v, nil -} - -func (hr *HistoryReader) ReadAccountData(addr []byte, trace bool) ([]byte, error) { - // Look in the history first - hOk, v, err := hr.searchInHistory(AccountBitmap, AccountHistory, addr, trace) - if err != nil { - return nil, err - } - if hOk { - if trace { - fmt.Printf("ReadAccountData %x, found in history [%x]\n", addr, v) - } - return v, nil - } - if trace { - fmt.Printf("ReadAccountData %x, not found in history, get from the state\n", addr) - } - // Not found in history - look in the state files - return hr.h.readFromFiles(Account, addr, trace), nil -} - -func (hr *HistoryReader) ReadAccountStorage(addr []byte, loc []byte, trace bool) (*uint256.Int, error) { - // Look in the history first - dbkey := make([]byte, len(addr)+len(loc)) - copy(dbkey[0:], addr) - copy(dbkey[len(addr):], loc) - hOk, v, err := hr.searchInHistory(StorageBitmap, StorageHistory, dbkey, trace) - if err != nil { - return nil, err - } - if hOk { - return new(uint256.Int).SetBytes(v), nil - } - // Not found in history, look in the state files - v = hr.h.readFromFiles(Storage, dbkey, trace) - if v != nil { - return new(uint256.Int).SetBytes(v), nil - } - return nil, nil -} - -func (hr *HistoryReader) ReadAccountCode(addr []byte, trace bool) ([]byte, error) { - // Look in the history first - hOk, v, err := hr.searchInHistory(CodeBitmap, CodeHistory, addr, false) - if err != nil { - return nil, err - } - if hOk { - return v, err - } - // Not found in history, look in the history files - return hr.h.readFromFiles(Code, addr, trace), nil -} - -func (hr *HistoryReader) ReadAccountCodeSize(addr []byte, trace bool) (int, error) { - // Look in the history first - hOk, v, err := hr.searchInHistory(CodeBitmap, CodeHistory, addr, false) - if err != nil { - return 0, err - } - if hOk { - return len(v), err - } - // Not found in history, look in the history files - return len(hr.h.readFromFiles(Code, addr, trace)), nil -} - -func (h *History) readFromFiles(fType FileType, filekey []byte, trace bool) []byte { - var val []byte - h.files[fType].Descend(func(item *byEndBlockItem) bool { - if trace { - fmt.Printf("read %s %x: search in file [%d-%d]\n", fType.String(), filekey, item.startBlock, item.endBlock) - } - if item.tree != nil { - ai, ok := item.tree.Get(&AggregateItem{k: filekey}) - if !ok || ai == nil { - return true - } - val = ai.v - return false - } - if item.index.Empty() { - return true - } - offset := item.indexReader.Lookup(filekey) - g := item.getter - g.Reset(offset) - if g.HasNext() { - if keyMatch, _ := g.Match(filekey); keyMatch { - val, _ = g.Next(nil) - if trace { - fmt.Printf("read %s %x: found [%x] in file [%d-%d]\n", fType.String(), filekey, val, item.startBlock, item.endBlock) - } - return false - } - } - return true - }) - return val -} diff --git a/compress/decompress_test.go b/compress/decompress_test.go index 9d903abd8..209ad9992 100644 --- a/compress/decompress_test.go +++ b/compress/decompress_test.go @@ -98,6 +98,57 @@ func TestDecompressMatchOK(t *testing.T) { } } +func prepareStupidDict(t *testing.T, size int) *Decompressor { + t.Helper() + tmpDir := t.TempDir() + file := filepath.Join(tmpDir, "compressed2") + t.Name() + c, err := NewCompressor(context.Background(), t.Name(), file, tmpDir, 1, 2, log.LvlDebug) + if err != nil { + t.Fatal(err) + } + defer c.Close() + for i := 0; i < size; i++ { + if err = c.AddWord([]byte(fmt.Sprintf("word-%d", i))); err != nil { + 
t.Fatal(err) + } + } + if err = c.Compress(); err != nil { + t.Fatal(err) + } + var d *Decompressor + if d, err = NewDecompressor(file); err != nil { + t.Fatal(err) + } + return d +} + +func TestDecompressMatchOKCondensed(t *testing.T) { + condensePatternTableBitThreshold = 4 + d := prepareStupidDict(t, 10000) + defer func() { condensePatternTableBitThreshold = 9 }() + defer d.Close() + + g := d.MakeGetter() + i := 0 + for g.HasNext() { + if i%2 != 0 { + expected := fmt.Sprintf("word-%d", i) + ok, _ := g.Match([]byte(expected)) + if !ok { + t.Errorf("expexted match with %s", expected) + } + } else { + word, _ := g.Next(nil) + expected := fmt.Sprintf("word-%d", i) + if string(word) != expected { + t.Errorf("expected %s, got (hex) %s", expected, word) + } + } + i++ + } +} + func TestDecompressMatchNotOK(t *testing.T) { d := prepareLoremDict(t) defer d.Close() diff --git a/go.mod b/go.mod index 68fe6a847..7302fa421 100644 --- a/go.mod +++ b/go.mod @@ -95,7 +95,6 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rs/dnscache v0.0.0-20211102005908-e0241e321417 // indirect - github.com/stretchr/objx v0.5.0 // indirect github.com/valyala/fastrand v1.1.0 // indirect github.com/valyala/histogram v1.2.0 // indirect go.etcd.io/bbolt v1.3.6 // indirect diff --git a/go.sum b/go.sum index 1ae6a2b0d..f011bf50c 100644 --- a/go.sum +++ b/go.sum @@ -356,7 +356,6 @@ github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.2.1/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= @@ -367,7 +366,6 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= @@ -413,8 +411,6 @@ golang.org/x/crypto v0.5.0/go.mod h1:NK/OQwhpMQP3MwtdjgLlYHnH9ebylxKWv3e0fK+mkQU golang.org/x/crypto v0.6.0 h1:qfktjS5LUO+fFKeJXZ+ikTRijMmljikvG68fpMMruSc= golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20230213192124-5e25df0256eb h1:PaBZQdo+iSDyHT053FjUCgZQ/9uqVwPOcl7KSWhKn6w= -golang.org/x/exp v0.0.0-20230213192124-5e25df0256eb/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2 h1:Jvc7gsqn21cJHCmAWx0LiimpP18LZmUxkT5Mp7EZ1mI= golang.org/x/exp 
v0.0.0-20230224173230-c95f2b4c22f2/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= diff --git a/state/aggregator.go b/state/aggregator.go index 395fdab47..28951ef76 100644 --- a/state/aggregator.go +++ b/state/aggregator.go @@ -28,9 +28,10 @@ import ( "time" "github.com/holiman/uint256" - "github.com/ledgerwatch/erigon-lib/kv/order" "github.com/ledgerwatch/log/v3" + "github.com/ledgerwatch/erigon-lib/kv/order" + "github.com/ledgerwatch/erigon-lib/commitment" "github.com/ledgerwatch/erigon-lib/common/length" "github.com/ledgerwatch/erigon-lib/kv" @@ -40,8 +41,6 @@ import ( // files of smaller size are also immutable, but can be removed after merge to bigger files. const StepsInBiggestFile = 32 -// Reconstruction of the aggregator in another package, `aggregator` - type Aggregator struct { aggregationStep uint64 accounts *Domain @@ -55,19 +54,15 @@ type Aggregator struct { txNum uint64 seekTxNum uint64 blockNum uint64 - commitFn func(txNum uint64) error + stepDoneNotice chan [length.Hash]byte rwTx kv.RwTx stats FilesStats tmpdir string defaultCtx *AggregatorContext } -func NewAggregator( - dir, tmpdir string, - aggregationStep uint64, -) (*Aggregator, error) { - - a := &Aggregator{aggregationStep: aggregationStep, tmpdir: tmpdir} +func NewAggregator(dir, tmpdir string, aggregationStep uint64, commitmentMode CommitmentMode, commitTrieVariant commitment.TrieVariant) (*Aggregator, error) { + a := &Aggregator{aggregationStep: aggregationStep, tmpdir: tmpdir, stepDoneNotice: make(chan [length.Hash]byte, 1)} closeAgg := true defer func() { @@ -79,21 +74,21 @@ func NewAggregator( if err != nil { return nil, err } - if a.accounts, err = NewDomain(dir, tmpdir, aggregationStep, "accounts", kv.AccountKeys, kv.AccountVals, kv.AccountHistoryKeys, kv.AccountHistoryVals, kv.AccountSettings, kv.AccountIdx, 0 /* prefixLen */, false /* compressVals */); err != nil { + if a.accounts, err = NewDomain(dir, tmpdir, aggregationStep, "accounts", kv.AccountKeys, kv.AccountVals, kv.AccountHistoryKeys, kv.AccountHistoryVals, kv.AccountSettings, kv.AccountIdx, false /* compressVals */); err != nil { return nil, err } - if a.storage, err = NewDomain(dir, tmpdir, aggregationStep, "storage", kv.StorageKeys, kv.StorageVals, kv.StorageHistoryKeys, kv.StorageHistoryVals, kv.StorageSettings, kv.StorageIdx, 20 /* prefixLen */, false /* compressVals */); err != nil { + if a.storage, err = NewDomain(dir, tmpdir, aggregationStep, "storage", kv.StorageKeys, kv.StorageVals, kv.StorageHistoryKeys, kv.StorageHistoryVals, kv.StorageSettings, kv.StorageIdx, false /* compressVals */); err != nil { return nil, err } - if a.code, err = NewDomain(dir, tmpdir, aggregationStep, "code", kv.CodeKeys, kv.CodeVals, kv.CodeHistoryKeys, kv.CodeHistoryVals, kv.CodeSettings, kv.CodeIdx, 0 /* prefixLen */, true /* compressVals */); err != nil { + if a.code, err = NewDomain(dir, tmpdir, aggregationStep, "code", kv.CodeKeys, kv.CodeVals, kv.CodeHistoryKeys, kv.CodeHistoryVals, kv.CodeSettings, kv.CodeIdx, true /* compressVals */); err != nil { return nil, err } - commitd, err := NewDomain(dir, tmpdir, aggregationStep, "commitment", kv.CommitmentKeys, kv.CommitmentVals, kv.CommitmentHistoryKeys, kv.CommitmentHistoryVals, kv.CommitmentSettings, kv.CommitmentIdx, 0 /* prefixLen */, false /* compressVals */) + commitd, err := NewDomain(dir, tmpdir, aggregationStep, "commitment", kv.CommitmentKeys, kv.CommitmentVals, kv.CommitmentHistoryKeys, 
kv.CommitmentHistoryVals, kv.CommitmentSettings, kv.CommitmentIdx, false /* compressVals */) if err != nil { return nil, err } - a.commitment = NewCommittedDomain(commitd, CommitmentModeDirect) + a.commitment = NewCommittedDomain(commitd, commitmentMode, commitTrieVariant) if a.logAddrs, err = NewInvertedIndex(dir, tmpdir, aggregationStep, "logaddrs", kv.LogAddressKeys, kv.LogAddressIdx, false, nil); err != nil { return nil, err @@ -109,8 +104,7 @@ func NewAggregator( } closeAgg = false - a.defaultCtx = a.MakeContext() - a.commitment.patriciaTrie.ResetFns(a.defaultCtx.branchFn, a.defaultCtx.accountFn, a.defaultCtx.storageFn) + //a.defaultCtx = a.MakeContext() return a, nil } @@ -193,6 +187,12 @@ func (a *Aggregator) GetAndResetStats() DomainStats { } func (a *Aggregator) Close() { + if a.defaultCtx != nil { + a.defaultCtx.Close() + } + if a.stepDoneNotice != nil { + close(a.stepDoneNotice) + } if a.accounts != nil { a.accounts.Close() } @@ -244,9 +244,6 @@ func (a *Aggregator) SetTxNum(txNum uint64) { a.tracesTo.SetTxNum(txNum) } -// todo useless -func (a *Aggregator) SetBlockNum(bn uint64) { a.blockNum = bn } - func (a *Aggregator) SetWorkers(i int) { a.accounts.compressWorkers = i a.storage.compressWorkers = i @@ -302,25 +299,24 @@ func (a *Aggregator) SeekCommitment() (txNum uint64, err error) { } func (a *Aggregator) aggregate(ctx context.Context, step uint64) error { - defer func(t time.Time) { - log.Info("[snapshots] aggregation step is done", "step", step, "took", time.Since(t)) - }(time.Now()) - var ( logEvery = time.NewTicker(time.Second * 30) wg sync.WaitGroup errCh = make(chan error, 8) - //maxSpan = StepsInBiggestFile * a.aggregationStep - txFrom = step * a.aggregationStep - txTo = (step + 1) * a.aggregationStep - //workers = 1 + maxSpan = StepsInBiggestFile * a.aggregationStep + txFrom = step * a.aggregationStep + txTo = (step + 1) * a.aggregationStep + workers = 1 + + stepStartedAt = time.Now() ) + defer logEvery.Stop() - for _, d := range []*Domain{a.accounts, a.storage, a.code, a.commitment.Domain} { + for i, d := range []*Domain{a.accounts, a.storage, a.code, a.commitment.Domain} { wg.Add(1) - collation, err := d.collate(ctx, step, txFrom, txTo, d.tx, logEvery) + collation, err := d.collateStream(ctx, step, txFrom, txTo, d.tx, logEvery) if err != nil { collation.Close() return fmt.Errorf("domain collation %q has failed: %w", d.filenameBase, err) @@ -329,10 +325,7 @@ func (a *Aggregator) aggregate(ctx context.Context, step uint64) error { go func(wg *sync.WaitGroup, d *Domain, collation Collation) { defer wg.Done() - defer func(t time.Time) { - log.Info("[snapshots] domain collate-build is done", "took", time.Since(t), "domain", d.filenameBase) - }(time.Now()) - + start := time.Now() sf, err := d.buildFiles(ctx, step, collation) collation.Close() if err != nil { @@ -342,13 +335,20 @@ func (a *Aggregator) aggregate(ctx context.Context, step uint64) error { } d.integrateFiles(sf, step*a.aggregationStep, (step+1)*a.aggregationStep) + d.stats.LastFileBuildingTook = time.Since(start) }(&wg, d, collation) + if i != 3 { // do not warmup commitment domain + if err := d.warmup(ctx, txFrom, d.aggregationStep/10, d.tx); err != nil { + return fmt.Errorf("warmup %q domain failed: %w", d.filenameBase, err) + } + } if err := d.prune(ctx, step, txFrom, txTo, math.MaxUint64, logEvery); err != nil { return err } } + // indices are built concurrently for _, d := range []*InvertedIndex{a.logTopics, a.logAddrs, a.tracesFrom, a.tracesTo} { wg.Add(1) @@ -359,9 +359,6 @@ func (a *Aggregator) 
aggregate(ctx context.Context, step uint64) error { go func(wg *sync.WaitGroup, d *InvertedIndex, tx kv.Tx) { defer wg.Done() - defer func(t time.Time) { - log.Info("[snapshots] index collate-build is done", "took", time.Since(t), "domain", d.filenameBase) - }(time.Now()) sf, err := d.buildFiles(ctx, step, collation) if err != nil { @@ -370,6 +367,15 @@ func (a *Aggregator) aggregate(ctx context.Context, step uint64) error { return } d.integrateFiles(sf, step*a.aggregationStep, (step+1)*a.aggregationStep) + + icx := d.MakeContext() + defer icx.Close() + + mm := d.endTxNumMinimax() + if err := d.mergeRangesUpTo(ctx, mm, maxSpan, workers, icx); err != nil { + errCh <- err + return + } }(&wg, d, d.tx) if err := d.prune(ctx, txFrom, txTo, math.MaxUint64, logEvery); err != nil { @@ -387,11 +393,48 @@ func (a *Aggregator) aggregate(ctx context.Context, step uint64) error { return fmt.Errorf("domain collate-build failed: %w", err) } - ac := a.MakeContext() - defer ac.Close() + var clo, chi, plo, phi, blo, bhi time.Duration + clo, plo, blo = time.Hour*99, time.Hour*99, time.Hour*99 + for _, s := range []DomainStats{a.accounts.stats, a.code.stats, a.storage.stats} { + c := s.LastCollationTook + p := s.LastPruneTook + b := s.LastFileBuildingTook + + if c < clo { + clo = c + } + if c > chi { + chi = c + } + if p < plo { + plo = p + } + if p > phi { + phi = p + } + if b < blo { + blo = b + } + if b > bhi { + bhi = b + } + } + + stepTook := time.Since(stepStartedAt) + log.Info("[stat] finished aggregation, ready for mergeUpTo", + "range", fmt.Sprintf("%.2fM-%.2fM", float64(txFrom)/10e5, float64(txTo)/10e5), + "step_took", stepTook, + "collate_min", clo, "collate_max", chi, + "prune_min", plo, "prune_max", phi, + "files_build_min", blo, "files_build_max", bhi) + mergeStartedAt := time.Now() maxEndTxNum := a.EndTxNumMinimax() + + var upmerges int for { + a.defaultCtx.Close() + a.defaultCtx = a.MakeContext() somethingMerged, err := a.mergeLoopStep(ctx, maxEndTxNum, 1) if err != nil { return err @@ -399,22 +442,29 @@ func (a *Aggregator) aggregate(ctx context.Context, step uint64) error { if !somethingMerged { break } + upmerges++ } + + log.Info("[stat] aggregation merged", + "upto_tx", maxEndTxNum, + "aggregation_took", time.Since(stepStartedAt), + "step_took", stepTook, + "merge_took", time.Since(mergeStartedAt), + "merges_count", upmerges) return nil } func (a *Aggregator) mergeLoopStep(ctx context.Context, maxEndTxNum uint64, workers int) (somethingDone bool, err error) { closeAll := true + mergeStartedAt := time.Now() + maxSpan := a.aggregationStep * StepsInBiggestFile r := a.findMergeRange(maxEndTxNum, maxSpan) if !r.any() { return false, nil } - ac := a.MakeContext() // this need, to ensure we do all operations on files in "transaction-style", maybe we will ensure it on type-level in future - defer ac.Close() - - outs := a.staticFilesInRange(r, ac) + outs := a.staticFilesInRange(r, a.defaultCtx) defer func() { if closeAll { outs.Close() @@ -433,30 +483,39 @@ func (a *Aggregator) mergeLoopStep(ctx context.Context, maxEndTxNum uint64, work a.integrateMergedFiles(outs, in) a.cleanAfterFreeze(in) closeAll = false + + var blo, bhi time.Duration + blo = time.Hour * 99 + for _, s := range []DomainStats{a.accounts.stats, a.code.stats, a.storage.stats} { + b := s.LastFileBuildingTook + if b < blo { + blo = b + } + if b > bhi { + bhi = b + } + } + + log.Info("[stat] finished merge step", + "upto_tx", maxEndTxNum, "merge_step_took", time.Since(mergeStartedAt), + "merge_min", blo, "merge_max", bhi) + return 
true, nil } type Ranges struct { - accounts DomainRanges - storage DomainRanges - code DomainRanges - commitment DomainRanges - logTopicsEndTxNum uint64 - logAddrsEndTxNum uint64 - logTopicsStartTxNum uint64 - logAddrsStartTxNum uint64 - tracesFromStartTxNum uint64 - tracesFromEndTxNum uint64 - tracesToStartTxNum uint64 - tracesToEndTxNum uint64 - logAddrs bool - logTopics bool - tracesFrom bool - tracesTo bool + accounts DomainRanges + storage DomainRanges + code DomainRanges + commitment DomainRanges +} + +func (r Ranges) String() string { + return fmt.Sprintf("accounts=%s, storage=%s, code=%s, commitment=%s", r.accounts.String(), r.storage.String(), r.code.String(), r.commitment.String()) } func (r Ranges) any() bool { - return r.accounts.any() || r.storage.any() || r.code.any() || r.commitment.any() //|| r.logAddrs || r.logTopics || r.tracesFrom || r.tracesTo + return r.accounts.any() || r.storage.any() || r.code.any() || r.commitment.any() } func (a *Aggregator) findMergeRange(maxEndTxNum, maxSpan uint64) Ranges { @@ -465,11 +524,7 @@ func (a *Aggregator) findMergeRange(maxEndTxNum, maxSpan uint64) Ranges { r.storage = a.storage.findMergeRange(maxEndTxNum, maxSpan) r.code = a.code.findMergeRange(maxEndTxNum, maxSpan) r.commitment = a.commitment.findMergeRange(maxEndTxNum, maxSpan) - r.logAddrs, r.logAddrsStartTxNum, r.logAddrsEndTxNum = a.logAddrs.findMergeRange(maxEndTxNum, maxSpan) - r.logTopics, r.logTopicsStartTxNum, r.logTopicsEndTxNum = a.logTopics.findMergeRange(maxEndTxNum, maxSpan) - r.tracesFrom, r.tracesFromStartTxNum, r.tracesFromEndTxNum = a.tracesFrom.findMergeRange(maxEndTxNum, maxSpan) - r.tracesTo, r.tracesToStartTxNum, r.tracesToEndTxNum = a.tracesTo.findMergeRange(maxEndTxNum, maxSpan) - //log.Info(fmt.Sprintf("findMergeRange(%d, %d)=%+v\n", maxEndTxNum, maxSpan, r)) + log.Info(fmt.Sprintf("findMergeRange(%d, %d)=%+v\n", maxEndTxNum, maxSpan, r)) return r } @@ -486,18 +541,10 @@ type SelectedStaticFiles struct { commitment []*filesItem commitmentIdx []*filesItem commitmentHist []*filesItem - tracesTo []*filesItem - tracesFrom []*filesItem - logTopics []*filesItem - logAddrs []*filesItem codeI int storageI int accountsI int commitmentI int - logAddrsI int - tracesFromI int - logTopicsI int - tracesToI int } func (sf SelectedStaticFiles) Close() { @@ -506,7 +553,6 @@ func (sf SelectedStaticFiles) Close() { sf.storage, sf.storageIdx, sf.storageHist, sf.code, sf.codeIdx, sf.codeHist, sf.commitment, sf.commitmentIdx, sf.commitmentHist, - //sf.logAddrs, sf.logTopics, sf.tracesFrom, sf.tracesTo, } { for _, item := range group { if item != nil { @@ -516,6 +562,9 @@ func (sf SelectedStaticFiles) Close() { if item.index != nil { item.index.Close() } + if item.bindex != nil { + item.bindex.Close() + } } } } @@ -535,18 +584,6 @@ func (a *Aggregator) staticFilesInRange(r Ranges, ac *AggregatorContext) Selecte if r.commitment.any() { sf.commitment, sf.commitmentIdx, sf.commitmentHist, sf.commitmentI = a.commitment.staticFilesInRange(r.commitment, ac.commitment) } - if r.logAddrs { - sf.logAddrs, sf.logAddrsI = a.logAddrs.staticFilesInRange(r.logAddrsStartTxNum, r.logAddrsEndTxNum, ac.logAddrs) - } - if r.logTopics { - sf.logTopics, sf.logTopicsI = a.logTopics.staticFilesInRange(r.logTopicsStartTxNum, r.logTopicsEndTxNum, ac.logTopics) - } - if r.tracesFrom { - sf.tracesFrom, sf.tracesFromI = a.tracesFrom.staticFilesInRange(r.tracesFromStartTxNum, r.tracesFromEndTxNum, ac.tracesFrom) - } - if r.tracesTo { - sf.tracesTo, sf.tracesToI = 
a.tracesTo.staticFilesInRange(r.tracesToStartTxNum, r.tracesToEndTxNum, ac.tracesTo) - } return sf } @@ -559,10 +596,6 @@ type MergedFiles struct { codeIdx, codeHist *filesItem commitment *filesItem commitmentIdx, commitmentHist *filesItem - logAddrs *filesItem - logTopics *filesItem - tracesFrom *filesItem - tracesTo *filesItem } func (mf MergedFiles) Close() { @@ -580,12 +613,21 @@ func (mf MergedFiles) Close() { if item.decompressor != nil { item.index.Close() } + if item.bindex != nil { + item.bindex.Close() + } } } } func (a *Aggregator) mergeFiles(ctx context.Context, files SelectedStaticFiles, r Ranges, workers int) (MergedFiles, error) { - defer func(t time.Time) { log.Info("[snapshots] merge", "took", time.Since(t)) }(time.Now()) + started := time.Now() + defer func(t time.Time) { + log.Info("[snapshots] domain files has been merged", + "range", fmt.Sprintf("%d-%d", r.accounts.valuesStartTxNum/a.aggregationStep, r.accounts.valuesEndTxNum/a.aggregationStep), + "took", time.Since(t)) + }(started) + var mf MergedFiles closeFiles := true defer func() { @@ -595,15 +637,15 @@ func (a *Aggregator) mergeFiles(ctx context.Context, files SelectedStaticFiles, }() var ( - errCh = make(chan error, 8) + errCh = make(chan error, 4) wg sync.WaitGroup predicates sync.WaitGroup ) + wg.Add(4) predicates.Add(2) - wg.Add(8) - go func() { + go func(predicates *sync.WaitGroup) { defer wg.Done() defer predicates.Done() var err error @@ -612,8 +654,8 @@ func (a *Aggregator) mergeFiles(ctx context.Context, files SelectedStaticFiles, errCh <- err } } - }() - go func() { + }(&predicates) + go func(predicates *sync.WaitGroup) { defer wg.Done() defer predicates.Done() var err error @@ -622,9 +664,10 @@ func (a *Aggregator) mergeFiles(ctx context.Context, files SelectedStaticFiles, errCh <- err } } - }() + }(&predicates) go func() { defer wg.Done() + var err error if r.code.any() { if mf.code, mf.codeIdx, mf.codeHist, err = a.code.mergeFiles(ctx, files.code, files.codeIdx, files.codeHist, r.code, workers); err != nil { @@ -632,44 +675,8 @@ func (a *Aggregator) mergeFiles(ctx context.Context, files SelectedStaticFiles, } } }() - go func() { - defer wg.Done() - var err error - if r.logAddrs { - if mf.logAddrs, err = a.logAddrs.mergeFiles(ctx, files.logAddrs, r.logAddrsStartTxNum, r.logAddrsEndTxNum, workers); err != nil { - errCh <- err - } - } - }() - go func() { - defer wg.Done() - var err error - if r.logTopics { - if mf.logTopics, err = a.logTopics.mergeFiles(ctx, files.logTopics, r.logTopicsStartTxNum, r.logTopicsEndTxNum, workers); err != nil { - errCh <- err - } - } - }() - go func() { - defer wg.Done() - var err error - if r.tracesFrom { - if mf.tracesFrom, err = a.tracesFrom.mergeFiles(ctx, files.tracesFrom, r.tracesFromStartTxNum, r.tracesFromEndTxNum, workers); err != nil { - errCh <- err - } - } - }() - go func() { - defer wg.Done() - var err error - if r.tracesTo { - if mf.tracesTo, err = a.tracesTo.mergeFiles(ctx, files.tracesTo, r.tracesToStartTxNum, r.tracesToEndTxNum, workers); err != nil { - errCh <- err - } - } - }() - go func() { + go func(preidcates *sync.WaitGroup) { defer wg.Done() predicates.Wait() @@ -680,11 +687,11 @@ func (a *Aggregator) mergeFiles(ctx context.Context, files SelectedStaticFiles, errCh <- err } } - }() + + }(&predicates) go func() { wg.Wait() - close(errCh) }() @@ -703,21 +710,15 @@ func (a *Aggregator) integrateMergedFiles(outs SelectedStaticFiles, in MergedFil a.storage.integrateMergedFiles(outs.storage, outs.storageIdx, outs.storageHist, in.storage, in.storageIdx, 
in.storageHist) a.code.integrateMergedFiles(outs.code, outs.codeIdx, outs.codeHist, in.code, in.codeIdx, in.codeHist) a.commitment.integrateMergedFiles(outs.commitment, outs.commitmentIdx, outs.commitmentHist, in.commitment, in.commitmentIdx, in.commitmentHist) - a.logAddrs.integrateMergedFiles(outs.logAddrs, in.logAddrs) - a.logTopics.integrateMergedFiles(outs.logTopics, in.logTopics) - a.tracesFrom.integrateMergedFiles(outs.tracesFrom, in.tracesFrom) - a.tracesTo.integrateMergedFiles(outs.tracesTo, in.tracesTo) } + func (a *Aggregator) cleanAfterFreeze(in MergedFiles) { a.accounts.cleanAfterFreeze(in.accountsHist) a.storage.cleanAfterFreeze(in.storageHist) a.code.cleanAfterFreeze(in.codeHist) a.commitment.cleanAfterFreeze(in.commitment) - a.logAddrs.cleanFrozenParts(in.logAddrs) - a.logTopics.cleanFrozenParts(in.logTopics) - a.tracesFrom.cleanFrozenParts(in.tracesFrom) - a.tracesTo.cleanFrozenParts(in.tracesTo) } + func (ac *AggregatorContext) ReadAccountData(addr []byte, roTx kv.Tx) ([]byte, error) { return ac.accounts.Get(addr, nil, roTx) } @@ -885,27 +886,26 @@ func (a *Aggregator) ReadyToFinishTx() bool { return (a.txNum+1)%a.aggregationStep == 0 && a.seekTxNum < a.txNum } -func (a *Aggregator) SetCommitFn(fn func(txNum uint64) error) { - a.commitFn = fn +// AggregatedRoots returns a channel that receives the commitment root hash each time an aggregation step completes +func (a *Aggregator) AggregatedRoots() chan [length.Hash]byte { + return a.stepDoneNotice } -func (a *Aggregator) FinishTx() error { +func (a *Aggregator) FinishTx() (err error) { atomic.AddUint64(&a.stats.TxCount, 1) if !a.ReadyToFinishTx() { return nil } - _, err := a.ComputeCommitment(true, false) + + a.commitment.patriciaTrie.ResetFns(a.defaultCtx.branchFn, a.defaultCtx.accountFn, a.defaultCtx.storageFn) + rootHash, err := a.ComputeCommitment(true, false) if err != nil { return err } step := a.txNum / a.aggregationStep if step == 0 { - if a.commitFn != nil { - if err := a.commitFn(a.txNum); err != nil { - return fmt.Errorf("aggregator: db commit on finishTx failed, txNum=%d err=%w", a.txNum, err) - } - } + a.notifyAggregated(rootHash) return nil } step-- // Leave one step worth in the DB @@ -918,17 +918,22 @@ func (a *Aggregator) FinishTx() error { return err } - if a.commitFn != nil { - if err := a.commitFn(a.txNum); err != nil { - return err - } - } + a.notifyAggregated(rootHash) - //a.defaultCtx = a.MakeContext() + //a.FinishWrites() + //a.StartWrites() return nil } +func (a *Aggregator) notifyAggregated(rootHash []byte) { + rh := (*[length.Hash]byte)(rootHash[:]) + select { + case a.stepDoneNotice <- *rh: + default: + } +} + func (a *Aggregator) UpdateAccountData(addr []byte, account []byte) error { a.commitment.TouchPlainKey(addr, account, a.commitment.TouchPlainKeyAccount) return a.accounts.Put(addr, nil, account) @@ -1021,6 +1026,22 @@ func (a *Aggregator) StartWrites() *Aggregator { a.logTopics.StartWrites() a.tracesFrom.StartWrites() a.tracesTo.StartWrites() + + if a.defaultCtx != nil { + a.defaultCtx.Close() + } + a.defaultCtx = &AggregatorContext{ + a: a, + accounts: a.accounts.defaultDc, + storage: a.storage.defaultDc, + code: a.code.defaultDc, + commitment: a.commitment.defaultDc, + logAddrs: a.logAddrs.MakeContext(), + logTopics: a.logTopics.MakeContext(), + tracesFrom: a.tracesFrom.MakeContext(), + tracesTo: a.tracesTo.MakeContext(), + } + a.commitment.patriciaTrie.ResetFns(a.defaultCtx.branchFn, a.defaultCtx.accountFn, a.defaultCtx.storageFn) return a } func (a *Aggregator) FinishWrites() { @@ -1032,11 +1053,14 @@ func (a 
*Aggregator) FinishWrites() { a.logTopics.FinishWrites() a.tracesFrom.FinishWrites() a.tracesTo.FinishWrites() + //if a.defaultCtx != nil { + // a.defaultCtx.Close() + // a.defaultCtx = nil + //} } // Flush - must be called before Collate, if you did some writes func (a *Aggregator) Flush(ctx context.Context) error { - // TODO: Add support of commitment! flushers := []flusher{ a.accounts.Rotate(), a.storage.Rotate(), @@ -1057,10 +1081,13 @@ func (a *Aggregator) Flush(ctx context.Context) error { } type FilesStats struct { - TxCount uint64 - FilesCount uint64 - IdxSize uint64 - DataSize uint64 + HistoryReads uint64 + TotalReads uint64 + IdxAccess time.Duration + TxCount uint64 + FilesCount uint64 + IdxSize uint64 + DataSize uint64 } func (a *Aggregator) Stats() FilesStats { @@ -1069,6 +1096,9 @@ func (a *Aggregator) Stats() FilesStats { res.IdxSize = stat.IndexSize res.DataSize = stat.DataSize res.FilesCount = stat.FilesCount + res.HistoryReads = stat.HistoryQueries + res.TotalReads = stat.TotalQueries + res.IdxAccess = stat.EfSearchTime return res } @@ -1086,7 +1116,7 @@ type AggregatorContext struct { } func (a *Aggregator) MakeContext() *AggregatorContext { - return &AggregatorContext{ + ac := &AggregatorContext{ a: a, accounts: a.accounts.MakeContext(), storage: a.storage.MakeContext(), @@ -1097,7 +1127,9 @@ func (a *Aggregator) MakeContext() *AggregatorContext { tracesFrom: a.tracesFrom.MakeContext(), tracesTo: a.tracesTo.MakeContext(), } + return ac } + func (ac *AggregatorContext) Close() { ac.accounts.Close() ac.storage.Close() diff --git a/state/aggregator_bench_test.go b/state/aggregator_bench_test.go index 830392cd3..7fa0198da 100644 --- a/state/aggregator_bench_test.go +++ b/state/aggregator_bench_test.go @@ -2,19 +2,24 @@ package state import ( "context" + "fmt" "math/rand" "os" + "path" + "path/filepath" "testing" + "time" "github.com/ledgerwatch/log/v3" "github.com/stretchr/testify/require" + "github.com/ledgerwatch/erigon-lib/commitment" "github.com/ledgerwatch/erigon-lib/common/length" "github.com/ledgerwatch/erigon-lib/kv" "github.com/ledgerwatch/erigon-lib/kv/mdbx" ) -func testDbAndAggregatorBench(b *testing.B, prefixLen int, aggStep uint64) (string, kv.RwDB, *Aggregator) { +func testDbAndAggregatorBench(b *testing.B, aggStep uint64) (string, kv.RwDB, *Aggregator) { b.Helper() path := b.TempDir() b.Cleanup(func() { os.RemoveAll(path) }) @@ -23,7 +28,7 @@ func testDbAndAggregatorBench(b *testing.B, prefixLen int, aggStep uint64) (stri return kv.ChaindataTablesCfg }).MustOpen() b.Cleanup(db.Close) - agg, err := NewAggregator(path, path, aggStep) + agg, err := NewAggregator(path, path, aggStep, CommitmentModeDirect, commitment.VariantHexPatriciaTrie) require.NoError(b, err) b.Cleanup(agg.Close) return path, db, agg @@ -33,12 +38,11 @@ func BenchmarkAggregator_Processing(b *testing.B) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - //keys := queueKeys(ctx, 42, length.Addr) longKeys := queueKeys(ctx, 64, length.Addr+length.Hash) vals := queueKeys(ctx, 53, length.Hash) - aggStep := uint64(100_000) - _, db, agg := testDbAndAggregatorBench(b, length.Addr, aggStep) + aggStep := uint64(100_00) + _, db, agg := testDbAndAggregatorBench(b, aggStep) tx, err := db.BeginRw(ctx) require.NoError(b, err) @@ -46,49 +50,20 @@ func BenchmarkAggregator_Processing(b *testing.B) { if tx != nil { tx.Rollback() } - if agg != nil { - agg.Close() - } }() - commit := func(txN uint64) (err error) { - err = tx.Commit() - require.NoError(b, err) - if err != nil { - return err - 
} - - tx = nil - tx, err = db.BeginRw(ctx) - require.NoError(b, err) - if err != nil { - return err - } - agg.SetTx(tx) - return nil - } - agg.SetCommitFn(commit) agg.SetTx(tx) defer agg.StartWrites().FinishWrites() require.NoError(b, err) - agg.StartWrites() - defer agg.FinishWrites() b.ReportAllocs() b.ResetTimer() - //keyList := make([][]byte, 20000) + for i := 0; i < b.N; i++ { - //var key []byte - //if i >= len(keyList) { - // pi := i % (len(keyList)) - // key = keyList[pi] - //} else { - // key = <-longKeys - // keyList[i] = key - //} key := <-longKeys val := <-vals - agg.SetTxNum(uint64(i)) + txNum := uint64(i) + agg.SetTxNum(txNum) err := agg.WriteAccountStorage(key[:length.Addr], key[length.Addr:], val) require.NoError(b, err) err = agg.FinishTx() @@ -113,3 +88,54 @@ func queueKeys(ctx context.Context, seed, ofSize uint64) <-chan []byte { }() return keys } + +func Benchmark_BtreeIndex_Allocation(b *testing.B) { + rnd := rand.New(rand.NewSource(time.Now().UnixNano())) + for i := 0; i < b.N; i++ { + now := time.Now() + count := rnd.Intn(1000000000) + bt := newBtAlloc(uint64(count), uint64(1<<12), true) + bt.traverseDfs() + fmt.Printf("alloc %v\n", time.Since(now)) + } +} +func Benchmark_BtreeIndex_Search(b *testing.B) { + rnd := rand.New(rand.NewSource(time.Now().UnixNano())) + // max := 100000000 + // count := rnd.Intn(max) + // bt := newBtAlloc(uint64(count), uint64(1<<11)) + // bt.traverseDfs() + // fmt.Printf("alloc %v\n", time.Since(now)) + + tmp := b.TempDir() + + // dataPath := generateCompressedKV(b, tmp, 52, 10, keyCount) + defer os.RemoveAll(tmp) + dir, _ := os.Getwd() + fmt.Printf("path %s\n", dir) + dataPath := "../../data/storage.256-288.kv" + + indexPath := path.Join(tmp, filepath.Base(dataPath)+".bti") + err := BuildBtreeIndex(dataPath, indexPath) + require.NoError(b, err) + + M := 1024 + bt, err := OpenBtreeIndex(indexPath, dataPath, uint64(M)) + + require.NoError(b, err) + + idx := NewBtIndexReader(bt) + + keys, err := pivotKeysFromKV(dataPath) + require.NoError(b, err) + + for i := 0; i < b.N; i++ { + p := rnd.Intn(len(keys)) + cur, err := idx.Seek(keys[p]) + require.NoErrorf(b, err, "i=%d", i) + require.EqualValues(b, keys[p], cur.key) + require.NotEmptyf(b, cur.Value(), "i=%d", i) + } + + bt.Close() +} diff --git a/state/aggregator_test.go b/state/aggregator_test.go index a0ccd4c31..4bba5c18a 100644 --- a/state/aggregator_test.go +++ b/state/aggregator_test.go @@ -1,41 +1,89 @@ package state import ( + "bytes" "context" "encoding/binary" "fmt" "math/rand" "os" + "path" "path/filepath" "sync/atomic" "testing" + "time" "github.com/holiman/uint256" "github.com/ledgerwatch/log/v3" "github.com/stretchr/testify/require" + "github.com/ledgerwatch/erigon-lib/commitment" + "github.com/ledgerwatch/erigon-lib/common" "github.com/ledgerwatch/erigon-lib/common/length" + "github.com/ledgerwatch/erigon-lib/compress" "github.com/ledgerwatch/erigon-lib/kv" "github.com/ledgerwatch/erigon-lib/kv/mdbx" + "github.com/ledgerwatch/erigon-lib/recsplit" ) -func testDbAndAggregator(t *testing.T, prefixLen int, aggStep uint64) (string, kv.RwDB, *Aggregator) { +func testDbAndAggregator(t *testing.T, aggStep uint64) (string, kv.RwDB, *Aggregator) { t.Helper() path := t.TempDir() - t.Cleanup(func() { os.RemoveAll(path) }) + //t.Cleanup(func() { os.RemoveAll(path) }) logger := log.New() db := mdbx.NewMDBX(logger).InMem(filepath.Join(path, "db4")).WithTableCfg(func(defaultBuckets kv.TableCfg) kv.TableCfg { return kv.ChaindataTablesCfg }).MustOpen() t.Cleanup(db.Close) - agg, err := 
NewAggregator(path, path, aggStep) + agg, err := NewAggregator(filepath.Join(path, "e4"), filepath.Join(path, "e4tmp"), aggStep, CommitmentModeDirect, commitment.VariantHexPatriciaTrie) require.NoError(t, err) - t.Cleanup(agg.Close) return path, db, agg } +func TestAggregator_WinAccess(t *testing.T) { + _, db, agg := testDbAndAggregator(t, 100) + defer agg.Close() + + tx, err := db.BeginRwNosync(context.Background()) + require.NoError(t, err) + defer func() { + if tx != nil { + tx.Rollback() + } + }() + agg.SetTx(tx) + + agg.StartWrites() + + rnd := rand.New(rand.NewSource(time.Now().UnixNano())) + for txNum := uint64(1); txNum <= 100; txNum++ { + agg.SetTxNum(txNum) + + addr := make([]byte, length.Addr) + n, err := rnd.Read(addr) + require.NoError(t, err) + require.EqualValues(t, length.Addr, n) + + buf := EncodeAccountBytes(1, uint256.NewInt(uint64(rand.Intn(10e9))), nil, 0) + err = agg.UpdateAccountData(addr, buf) + require.NoError(t, err) + + var v [8]byte + binary.BigEndian.PutUint64(v[:], txNum) + require.NoError(t, err) + require.NoError(t, agg.FinishTx()) + } + agg.FinishWrites() + + require.NoError(t, err) + err = tx.Commit() + require.NoError(t, err) + tx = nil +} + func TestAggregator_Merge(t *testing.T) { - _, db, agg := testDbAndAggregator(t, 0, 100) + _, db, agg := testDbAndAggregator(t, 100) + defer agg.Close() tx, err := db.BeginRwNosync(context.Background()) require.NoError(t, err) @@ -46,17 +94,37 @@ func TestAggregator_Merge(t *testing.T) { }() agg.SetTx(tx) - defer agg.StartWrites().FinishWrites() + agg.StartWrites() + txs := uint64(10000) + rnd := rand.New(rand.NewSource(time.Now().UnixNano())) // keys are encodings of numbers 1..31 // each key changes value on every txNum which is multiple of the key var maxWrite, otherMaxWrite uint64 for txNum := uint64(1); txNum <= txs; txNum++ { agg.SetTxNum(txNum) + + addr, loc := make([]byte, length.Addr), make([]byte, length.Hash) + + n, err := rnd.Read(addr) + require.NoError(t, err) + require.EqualValues(t, length.Addr, n) + + n, err = rnd.Read(loc) + require.NoError(t, err) + require.EqualValues(t, length.Hash, n) + //keys[txNum-1] = append(addr, loc...) 
+ + buf := EncodeAccountBytes(1, uint256.NewInt(0), nil, 0) + err = agg.UpdateAccountData(addr, buf) + require.NoError(t, err) + + err = agg.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}) + require.NoError(t, err) + var v [8]byte binary.BigEndian.PutUint64(v[:], txNum) - var err error if txNum%135 == 0 { err = agg.UpdateCommitmentData([]byte("otherroothash"), v[:]) otherMaxWrite = txNum @@ -67,7 +135,7 @@ func TestAggregator_Merge(t *testing.T) { require.NoError(t, err) require.NoError(t, agg.FinishTx()) } - err = agg.Flush(context.Background()) + agg.FinishWrites() require.NoError(t, err) err = tx.Commit() require.NoError(t, err) @@ -79,7 +147,7 @@ func TestAggregator_Merge(t *testing.T) { defer roTx.Rollback() dc := agg.MakeContext() - defer dc.Close() + v, err := dc.ReadCommitment([]byte("roothash"), roTx) require.NoError(t, err) @@ -87,6 +155,7 @@ func TestAggregator_Merge(t *testing.T) { v, err = dc.ReadCommitment([]byte("otherroothash"), roTx) require.NoError(t, err) + dc.Close() require.EqualValues(t, otherMaxWrite, binary.BigEndian.Uint64(v[:])) } @@ -98,7 +167,7 @@ func TestAggregator_Merge(t *testing.T) { // - new aggregator SeekCommitment must return txNum equal to amount of total txns func TestAggregator_RestartOnDatadir(t *testing.T) { aggStep := uint64(50) - path, db, agg := testDbAndAggregator(t, 0, aggStep) + path, db, agg := testDbAndAggregator(t, aggStep) tx, err := db.BeginRw(context.Background()) require.NoError(t, err) @@ -106,28 +175,13 @@ func TestAggregator_RestartOnDatadir(t *testing.T) { if tx != nil { tx.Rollback() } - if agg != nil { - agg.Close() - } }() agg.SetTx(tx) - defer agg.StartWrites().FinishWrites() + agg.StartWrites() var latestCommitTxNum uint64 - commit := func(txn uint64) error { - err = agg.Flush(context.Background()) - require.NoError(t, err) - err = tx.Commit() - require.NoError(t, err) - tx, err = db.BeginRw(context.Background()) - require.NoError(t, err) - t.Logf("commit to db txn=%d", txn) - atomic.StoreUint64(&latestCommitTxNum, txn) - agg.SetTx(tx) - return nil - } - agg.SetCommitFn(commit) + rnd := rand.New(rand.NewSource(time.Now().Unix())) txs := (aggStep / 2) * 19 t.Logf("step=%d tx_count=%d", aggStep, txs) @@ -139,20 +193,38 @@ func TestAggregator_RestartOnDatadir(t *testing.T) { agg.SetTxNum(txNum) binary.BigEndian.PutUint64(aux[:], txNum) + addr, loc := make([]byte, length.Addr), make([]byte, length.Hash) + n, err := rnd.Read(addr) + require.NoError(t, err) + require.EqualValues(t, length.Addr, n) + + n, err = rnd.Read(loc) + require.NoError(t, err) + require.EqualValues(t, length.Hash, n) + //keys[txNum-1] = append(addr, loc...) 
+ + buf := EncodeAccountBytes(1, uint256.NewInt(0), nil, 0) + err = agg.UpdateAccountData(addr, buf) + require.NoError(t, err) + + err = agg.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}) + require.NoError(t, err) + err = agg.UpdateCommitmentData([]byte("key"), aux[:]) + require.NoError(t, err) maxWrite = txNum require.NoError(t, agg.FinishTx()) } - err = agg.Flush(context.Background()) - require.NoError(t, err) + agg.FinishWrites() + agg.Close() + err = tx.Commit() require.NoError(t, err) - agg.Close() - tx, agg = nil, nil + tx = nil // Start another aggregator on same datadir - anotherAgg, err := NewAggregator(path, path, aggStep) + anotherAgg, err := NewAggregator(filepath.Join(path, "e4"), filepath.Join(path, "e4tmp"), aggStep, CommitmentModeDirect, commitment.VariantHexPatriciaTrie) require.NoError(t, err) require.NoError(t, anotherAgg.ReopenFolder()) @@ -182,9 +254,9 @@ func TestAggregator_RestartOnDatadir(t *testing.T) { defer roTx.Rollback() dc := anotherAgg.MakeContext() - defer dc.Close() v, err := dc.ReadCommitment([]byte("key"), roTx) require.NoError(t, err) + dc.Close() require.EqualValues(t, maxWrite, binary.BigEndian.Uint64(v[:])) } @@ -192,9 +264,7 @@ func TestAggregator_RestartOnDatadir(t *testing.T) { func TestAggregator_RestartOnFiles(t *testing.T) { aggStep := uint64(100) - path, db, agg := testDbAndAggregator(t, 0, aggStep) - defer db.Close() - _ = path + path, db, agg := testDbAndAggregator(t, aggStep) tx, err := db.BeginRw(context.Background()) require.NoError(t, err) @@ -202,26 +272,9 @@ func TestAggregator_RestartOnFiles(t *testing.T) { if tx != nil { tx.Rollback() } - if agg != nil { - agg.Close() - } }() agg.SetTx(tx) - defer agg.StartWrites().FinishWrites() - - var latestCommitTxNum uint64 - commit := func(txn uint64) error { - err = tx.Commit() - require.NoError(t, err) - tx, err = db.BeginRw(context.Background()) - require.NoError(t, err) - t.Logf("commit to db txn=%d", txn) - - atomic.StoreUint64(&latestCommitTxNum, txn) - agg.SetTx(tx) - return nil - } - agg.SetCommitFn(commit) + agg.StartWrites() txs := aggStep * 5 t.Logf("step=%d tx_count=%d\n", aggStep, txs) @@ -253,12 +306,14 @@ func TestAggregator_RestartOnFiles(t *testing.T) { err = agg.FinishTx() require.NoError(t, err) } + agg.FinishWrites() err = tx.Commit() + require.NoError(t, err) tx = nil db.Close() - db = nil agg.Close() + db = nil agg = nil require.NoError(t, os.RemoveAll(filepath.Join(path, "db4"))) @@ -273,27 +328,26 @@ func TestAggregator_RestartOnFiles(t *testing.T) { require.NoError(t, err) defer newTx.Rollback() - newAgg, err := NewAggregator(path, path, aggStep) + newAgg, err := NewAggregator(path, path, aggStep, CommitmentModeDirect, commitment.VariantHexPatriciaTrie) require.NoError(t, err) require.NoError(t, newAgg.ReopenFolder()) - defer newAgg.Close() newAgg.SetTx(newTx) + newAgg.StartWrites() latestTx, err := newAgg.SeekCommitment() require.NoError(t, err) t.Logf("seek to latest_tx=%d", latestTx) - ctx := newAgg.MakeContext() - defer ctx.Close() + ctx := newAgg.defaultCtx miss := uint64(0) for i, key := range keys { + if uint64(i+1) >= txs-aggStep { + continue // finishtx always stores last agg step in db which we deleted, so missing values which were not aggregated is expected + } stored, err := ctx.ReadAccountData(key[:length.Addr], newTx) require.NoError(t, err) if len(stored) == 0 { - if uint64(i+1) >= txs-aggStep { - continue // finishtx always stores last agg step in db which we deleteelete, so miss is expected - } miss++ fmt.Printf("%x [%d/%d]", key, miss, i+1) // 
txnum starts from 1 continue @@ -307,16 +361,18 @@ func TestAggregator_RestartOnFiles(t *testing.T) { require.EqualValues(t, key[0], storedV[0]) require.EqualValues(t, key[length.Addr], storedV[1]) } - require.NoError(t, err) + newAgg.FinishWrites() + ctx.Close() + newAgg.Close() + require.NoError(t, err) } func TestAggregator_ReplaceCommittedKeys(t *testing.T) { - aggStep := uint64(1000) + aggStep := uint64(10000) - path, db, agg := testDbAndAggregator(t, 0, aggStep) - defer db.Close() - _ = path + _, db, agg := testDbAndAggregator(t, aggStep) + t.Cleanup(agg.Close) tx, err := db.BeginRw(context.Background()) require.NoError(t, err) @@ -324,9 +380,6 @@ func TestAggregator_ReplaceCommittedKeys(t *testing.T) { if tx != nil { tx.Rollback() } - if agg != nil { - agg.Close() - } }() agg.SetTx(tx) defer agg.StartWrites().FinishWrites() @@ -343,9 +396,9 @@ func TestAggregator_ReplaceCommittedKeys(t *testing.T) { agg.SetTx(tx) return nil } - agg.SetCommitFn(commit) - txs := aggStep / 2 * 20 + roots := agg.AggregatedRoots() + txs := aggStep / 2 * 50 t.Logf("step=%d tx_count=%d", aggStep, txs) rnd := rand.New(rand.NewSource(0)) @@ -373,6 +426,12 @@ func TestAggregator_ReplaceCommittedKeys(t *testing.T) { err = agg.FinishTx() require.NoError(t, err) + select { + case <-roots: + require.NoError(t, commit(txNum)) + default: + continue + } } half := txs / 2 @@ -394,18 +453,14 @@ func TestAggregator_ReplaceCommittedKeys(t *testing.T) { tx, err = db.BeginRw(context.Background()) require.NoError(t, err) - ctx := agg.storage.MakeContext() - defer ctx.Close() + ctx := agg.defaultCtx for _, key := range keys { - storedV, err := ctx.Get(key[:length.Addr], key[length.Addr:], tx) + storedV, err := ctx.ReadAccountStorage(key[:length.Addr], key[length.Addr:], tx) require.NoError(t, err) require.EqualValues(t, key[0], storedV[0]) require.EqualValues(t, key[length.Addr], storedV[1]) } require.NoError(t, err) - - agg.Close() - agg = nil } func Test_EncodeCommitmentState(t *testing.T) { @@ -427,3 +482,303 @@ func Test_EncodeCommitmentState(t *testing.T) { require.EqualValues(t, cs.txNum, dec.txNum) require.EqualValues(t, cs.trieState, dec.trieState) } + +func Test_BtreeIndex_Seek(t *testing.T) { + tmp := t.TempDir() + + keyCount, M := 120000, 1024 + dataPath := generateCompressedKV(t, tmp, 52, 180 /*val size*/, keyCount) + defer os.RemoveAll(tmp) + + indexPath := path.Join(tmp, filepath.Base(dataPath)+".bti") + err := BuildBtreeIndex(dataPath, indexPath) + require.NoError(t, err) + + bt, err := OpenBtreeIndex(indexPath, dataPath, uint64(M)) + require.NoError(t, err) + require.EqualValues(t, bt.KeyCount(), keyCount) + + keys, err := pivotKeysFromKV(dataPath) + require.NoError(t, err) + + for i := 0; i < len(keys); i++ { + cur, err := bt.Seek(keys[i]) + require.NoErrorf(t, err, "i=%d", i) + require.EqualValues(t, keys[i], cur.key) + require.NotEmptyf(t, cur.Value(), "i=%d", i) + // require.EqualValues(t, uint64(i), cur.Value()) + } + for i := 1; i < len(keys); i++ { + alt := common.Copy(keys[i]) + for j := len(alt) - 1; j >= 0; j-- { + if alt[j] > 0 { + alt[j] -= 1 + break + } + } + cur, err := bt.Seek(keys[i]) + require.NoError(t, err) + require.EqualValues(t, keys[i], cur.Key()) + } + + bt.Close() +} + +func pivotKeysFromKV(dataPath string) ([][]byte, error) { + decomp, err := compress.NewDecompressor(dataPath) + if err != nil { + return nil, err + } + + getter := decomp.MakeGetter() + getter.Reset(0) + + key := make([]byte, 0, 64) + + listing := make([][]byte, 0, 1000) + + for getter.HasNext() { + if len(listing) > 
100000 { + break + } + key, _ := getter.Next(key[:0]) + listing = append(listing, common.Copy(key)) + getter.Skip() + } + decomp.Close() + + return listing, nil +} + +func generateCompressedKV(tb testing.TB, tmp string, keySize, valueSize, keyCount int) string { + tb.Helper() + + args := BtIndexWriterArgs{ + IndexFile: path.Join(tmp, fmt.Sprintf("%dk.bt", keyCount/1000)), + TmpDir: tmp, + KeyCount: 12, + } + + iw, err := NewBtIndexWriter(args) + require.NoError(tb, err) + + defer iw.Close() + rnd := rand.New(rand.NewSource(0)) + values := make([]byte, valueSize) + + dataPath := path.Join(tmp, fmt.Sprintf("%dk.kv", keyCount/1000)) + comp, err := compress.NewCompressor(context.Background(), "cmp", dataPath, tmp, compress.MinPatternScore, 1, log.LvlDebug) + require.NoError(tb, err) + + for i := 0; i < keyCount; i++ { + key := make([]byte, keySize) + n, err := rnd.Read(key[:]) + require.EqualValues(tb, keySize, n) + binary.BigEndian.PutUint64(key[keySize-8:], uint64(i)) + require.NoError(tb, err) + err = comp.AddWord(key[:]) + require.NoError(tb, err) + + n, err = rnd.Read(values[:rnd.Intn(valueSize)+1]) + require.NoError(tb, err) + + err = comp.AddWord(values[:n]) + require.NoError(tb, err) + } + + err = comp.Compress() + require.NoError(tb, err) + comp.Close() + + decomp, err := compress.NewDecompressor(dataPath) + require.NoError(tb, err) + + getter := decomp.MakeGetter() + getter.Reset(0) + + var pos uint64 + key := make([]byte, keySize) + for i := 0; i < keyCount; i++ { + if !getter.HasNext() { + tb.Fatalf("not enough values at %d", i) + break + } + + keys, _ := getter.Next(key[:0]) + err = iw.AddKey(keys[:], pos) + + pos = getter.Skip() + require.NoError(tb, err) + } + decomp.Close() + + require.NoError(tb, iw.Build()) + iw.Close() + + return decomp.FilePath() +} + +func Test_InitBtreeIndex(t *testing.T) { + tmp := t.TempDir() + defer os.RemoveAll(tmp) + + keyCount, M := 100, uint64(4) + compPath := generateCompressedKV(t, tmp, 52, 300, keyCount) + decomp, err := compress.NewDecompressor(compPath) + require.NoError(t, err) + defer decomp.Close() + + err = BuildBtreeIndexWithDecompressor(tmp+".bt", decomp) + require.NoError(t, err) + + bt, err := OpenBtreeIndexWithDecompressor(tmp+".bt", M, decomp) + require.NoError(t, err) + require.EqualValues(t, bt.KeyCount(), keyCount) + bt.Close() +} + +func Test_BtreeIndex_Allocation(t *testing.T) { + rnd := rand.New(rand.NewSource(time.Now().UnixNano())) + m := 2 + for i := 5; i < 24; i++ { + t.Run(fmt.Sprintf("%d", m< (m<<1)*4 { + break + } + } + + bt := newBtAlloc(uint64(count), uint64(m)< 0 { + t.Fatalf("prev %s cur %s, next key should be greater", prevKey, nk) + } + prevKey = nk + } + if i%1000 == 0 { + fmt.Printf("%d searches, last took %v avg=%v next_access_last[of %d keys] %v\n", i, took, tsum/time.Duration(i), j, ntimer/time.Duration(j)) + } + + } + avg := tsum / (1000000 - 1) + fmt.Printf("avg seek time %v\n", avg) + + bt.Close() +} + +func Test_Recsplit_Find(t *testing.T) { + t.Skip() + + rnd := rand.New(rand.NewSource(time.Now().UnixNano())) + tmp := t.TempDir() + + defer os.RemoveAll(tmp) + dir, _ := os.Getwd() + fmt.Printf("path %s\n", dir) + dataPath := "../../data/storage.256-288.kv" + indexPath := dataPath + "i" + + idx, err := recsplit.OpenIndex(indexPath) + require.NoError(t, err) + idxr := recsplit.NewIndexReader(idx) + + decomp, err := compress.NewDecompressor(dataPath) + require.NoError(t, err) + defer decomp.Close() + + getter := decomp.MakeGetter() + + keys, err := pivotKeysFromKV(dataPath) + require.NoError(t, err) + + tsum 
:= time.Duration(0) + + var i int + for i = 1; i < 10000000; i++ { + p := rnd.Intn(len(keys)) + cl := time.Now() + offset := idxr.Lookup(keys[p]) + getter.Reset(offset) + + require.True(t, getter.HasNext()) + + key, pa := getter.Next(nil) + require.NotEmpty(t, key) + + value, pb := getter.Next(nil) + if pb-pa != 1 { + require.NotEmpty(t, value) + } + + took := time.Since(cl) + tsum += took + + require.NoErrorf(t, err, "i=%d", i) + require.EqualValues(t, keys[p], key) + + if i%1000 == 0 { + fmt.Printf("%d searches, last took %v avg=%v\n", i, took, tsum/time.Duration(i)) + } + + } + avg := tsum / (1000000 - 1) + fmt.Printf("avg seek time %v\n", avg) +} diff --git a/state/btree_index.go b/state/btree_index.go new file mode 100644 index 000000000..ec632cf07 --- /dev/null +++ b/state/btree_index.go @@ -0,0 +1,1074 @@ +package state + +import ( + "bufio" + "bytes" + "context" + "encoding/binary" + "fmt" + "math" + "math/bits" + "os" + "path" + "path/filepath" + "time" + + "github.com/c2h5oh/datasize" + "github.com/ledgerwatch/log/v3" + + "github.com/ledgerwatch/erigon-lib/common" + "github.com/ledgerwatch/erigon-lib/common/length" + "github.com/ledgerwatch/erigon-lib/compress" + "github.com/ledgerwatch/erigon-lib/etl" + "github.com/ledgerwatch/erigon-lib/mmap" +) + +func logBase(n, base uint64) uint64 { + return uint64(math.Ceil(math.Log(float64(n)) / math.Log(float64(base)))) +} + +func min64(a, b uint64) uint64 { + if a < b { + return a + } + return b +} + +type markupCursor struct { + l, p, di, si uint64 + //l - level + //p - pos inside level + //si - current, actual son index + //di - data array index +} + +type node struct { + p, d, s, fc uint64 + key []byte + val []byte +} + +type Cursor struct { + ctx context.Context + ix *btAlloc + + key []byte + value []byte + d uint64 +} + +func (a *btAlloc) newCursor(ctx context.Context, k, v []byte, d uint64) *Cursor { + return &Cursor{ + ctx: ctx, + key: common.Copy(k), + value: common.Copy(v), + d: d, + ix: a, + } +} + +func (c *Cursor) Key() []byte { + return c.key +} + +func (c *Cursor) Ordinal() uint64 { + return c.d +} + +func (c *Cursor) Value() []byte { + return c.value +} + +func (c *Cursor) Next() bool { + if c.d > c.ix.K-1 { + return false + } + k, v, err := c.ix.dataLookup(c.d + 1) + if err != nil { + return false + } + c.key = common.Copy(k) + c.value = common.Copy(v) + c.d++ + return true +} + +type btAlloc struct { + d uint64 // depth + M uint64 // child limit of any node + N uint64 + K uint64 + vx []uint64 // vertex count on level + sons [][]uint64 // i - level; 0 <= i < d; j_k - amount, j_k+1 - child count + cursors []markupCursor + nodes [][]node + naccess uint64 + trace bool + + dataLookup func(di uint64) ([]byte, []byte, error) +} + +func newBtAlloc(k, M uint64, trace bool) *btAlloc { + if k == 0 { + return nil + } + + d := logBase(k, M) + a := &btAlloc{ + vx: make([]uint64, d+1), + sons: make([][]uint64, d+1), + cursors: make([]markupCursor, d), + nodes: make([][]node, d), + M: M, + K: k, + d: d, + trace: trace, + } + if trace { + fmt.Printf("k=%d d=%d, M=%d\n", k, d, M) + } + a.vx[0], a.vx[d] = 1, k + + if k < M/2 { + a.N = k + a.nodes = make([][]node, 1) + return a + } + + //nnc := func(vx uint64) uint64 { + // return uint64(math.Ceil(float64(vx) / float64(M))) + //} + nvc := func(vx uint64) uint64 { + return uint64(math.Ceil(float64(vx) / float64(M>>1))) + } + + for i := a.d - 1; i > 0; i-- { + nnc := uint64(math.Ceil(float64(a.vx[i+1]) / float64(M))) + //nvc := uint64(math.Floor(float64(a.vx[i+1]) / float64(m))-1) + //nnc := 
a.vx[i+1] / M + //nvc := a.vx[i+1] / m + //bvc := a.vx[i+1] / (m + (m >> 1)) + a.vx[i] = min64(uint64(math.Pow(float64(M), float64(i))), nnc) + } + + ncount := uint64(0) + pnv := uint64(0) + for l := a.d - 1; l > 0; l-- { + //s := nnc(a.vx[l+1]) + sh := nvc(a.vx[l+1]) + + if sh&1 == 1 { + a.sons[l] = append(a.sons[l], sh>>1, M, 1, M>>1) + } else { + a.sons[l] = append(a.sons[l], sh>>1, M) + } + + for ik := 0; ik < len(a.sons[l]); ik += 2 { + ncount += a.sons[l][ik] * a.sons[l][ik+1] + if l == 1 { + pnv += a.sons[l][ik] + } + } + } + a.sons[0] = []uint64{1, pnv} + ncount += a.sons[0][0] * a.sons[0][1] // last one + a.N = ncount + + if trace { + for i, v := range a.sons { + fmt.Printf("L%d=%v\n", i, v) + } + } + + return a +} + +// nolint +// another implementation of traverseDfs supposed to be a bit cleaner but buggy yet +func (a *btAlloc) traverseTrick() { + for l := 0; l < len(a.sons)-1; l++ { + if len(a.sons[l]) < 2 { + panic("invalid btree allocation markup") + } + a.cursors[l] = markupCursor{uint64(l), 1, 0, 0} + a.nodes[l] = make([]node, 0) + } + + lf := a.cursors[len(a.cursors)-1] + c := a.cursors[(len(a.cursors) - 2)] + + var d uint64 + var fin bool + + lf.di = d + lf.si++ + d++ + a.cursors[len(a.cursors)-1] = lf + + moved := true + for int(c.p) <= len(a.sons[c.l]) { + if fin || d > a.K { + break + } + c, lf = a.cursors[c.l], a.cursors[lf.l] + + c.di = d + c.si++ + + sons := a.sons[lf.l][lf.p] + for i := uint64(1); i < sons; i++ { + lf.si++ + d++ + } + lf.di = d + d++ + + a.nodes[lf.l] = append(a.nodes[lf.l], node{p: lf.p, s: lf.si, d: lf.di}) + a.nodes[c.l] = append(a.nodes[c.l], node{p: c.p, s: c.si, d: c.di}) + a.cursors[lf.l] = lf + a.cursors[c.l] = c + + for l := lf.l; l >= 0; l-- { + sc := a.cursors[l] + sons, gsons := a.sons[sc.l][sc.p-1], a.sons[sc.l][sc.p] + if l < c.l && moved { + sc.di = d + a.nodes[sc.l] = append(a.nodes[sc.l], node{d: sc.di}) + sc.si++ + d++ + } + moved = (sc.si-1)/gsons != sc.si/gsons + if sc.si/gsons >= sons { + sz := uint64(len(a.sons[sc.l]) - 1) + if sc.p+2 > sz { + fin = l == lf.l + break + } else { + sc.p += 2 + sc.si, sc.di = 0, 0 + } + //moved = true + } + if l == lf.l { + sc.si++ + sc.di = d + d++ + } + a.cursors[l] = sc + if l == 0 { + break + } + } + moved = false + } +} + +func (a *btAlloc) traverseDfs() { + for l := 0; l < len(a.sons)-1; l++ { + a.cursors[l] = markupCursor{uint64(l), 1, 0, 0} + a.nodes[l] = make([]node, 0) + } + + if len(a.cursors) <= 1 { + if a.nodes[0] == nil { + a.nodes[0] = make([]node, 0) + } + a.nodes[0] = append(a.nodes[0], node{d: a.K}) + a.N = a.K + if a.trace { + fmt.Printf("ncount=%d ∂%.5f\n", a.N, float64(a.N-a.K)/float64(a.N)) + } + return + } + + c := a.cursors[len(a.cursors)-1] + pc := a.cursors[(len(a.cursors) - 2)] + root := new(node) + trace := false + + var di uint64 + for stop := false; !stop; { + // fill leaves, mark parent if needed (until all grandparents not marked up until root) + // check if eldest parent has brothers + // -- has bros -> fill their leaves from the bottom + // -- no bros -> shift cursor (tricky) + if di > a.K { + a.N = di - 1 // actually filled node count + if a.trace { + fmt.Printf("ncount=%d ∂%.5f\n", a.N, float64(a.N-a.K)/float64(a.N)) + } + break + } + + bros, parents := a.sons[c.l][c.p], a.sons[c.l][c.p-1] + for i := uint64(0); i < bros; i++ { + c.di = di + if trace { + fmt.Printf("L%d |%d| d %2d s %2d\n", c.l, c.p, c.di, c.si) + } + c.si++ + di++ + + if i == 0 { + pc.di = di + if trace { + fmt.Printf("P%d |%d| d %2d s %2d\n", pc.l, pc.p, pc.di, pc.si) + } + pc.si++ + di++ + } 
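+			// Each leaf in the current group of `bros` siblings takes the next data index di;
+			// right after the group's first leaf, the parent cursor pc also reserves a di, so the
+			// parent's key lands between its first and second child in data order. Once di passes
+			// the total key count K (checked just below), the markup is complete.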
+ if di > a.K { + a.N = di - 1 // actually filled node count + stop = true + break + } + } + + a.nodes[c.l] = append(a.nodes[c.l], node{p: c.p, d: c.di, s: c.si}) + a.nodes[pc.l] = append(a.nodes[pc.l], node{p: pc.p, d: pc.di, s: pc.si, fc: uint64(len(a.nodes[c.l]) - 1)}) + + pid := c.si / bros + if pid >= parents { + if c.p+2 >= uint64(len(a.sons[c.l])) { + stop = true // end of row + if trace { + fmt.Printf("F%d |%d| d %2d\n", c.l, c.p, c.di) + } + } else { + c.p += 2 + c.si = 0 + c.di = 0 + } + } + a.cursors[c.l] = c + a.cursors[pc.l] = pc + + //nolint + for l := pc.l; l >= 0; l-- { + pc := a.cursors[l] + uncles := a.sons[pc.l][pc.p] + grands := a.sons[pc.l][pc.p-1] + + pi1 := pc.si / uncles + pc.si++ + pc.di = 0 + + pi2 := pc.si / uncles + moved := pi2-pi1 != 0 + + switch { + case pc.l > 0: + gp := a.cursors[pc.l-1] + if gp.di == 0 { + gp.di = di + di++ + if trace { + fmt.Printf("P%d |%d| d %2d s %2d\n", gp.l, gp.p, gp.di, gp.si) + } + a.nodes[gp.l] = append(a.nodes[gp.l], node{p: gp.p, d: gp.di, s: gp.si, fc: uint64(len(a.nodes[l]) - 1)}) + a.cursors[gp.l] = gp + } + default: + if root.d == 0 { + root.d = di + //di++ + if trace { + fmt.Printf("ROOT | d %2d\n", root.d) + } + } + } + + //fmt.Printf("P%d |%d| d %2d s %2d pid %d\n", pc.l, pc.p, pc.di, pc.si-1) + if pi2 >= grands { // skip one step of si due to different parental filling order + if pc.p+2 >= uint64(len(a.sons[pc.l])) { + if trace { + fmt.Printf("EoRow %d |%d|\n", pc.l, pc.p) + } + break // end of row + } + //fmt.Printf("N %d d%d s%d\n", pc.l, pc.di, pc.si) + //fmt.Printf("P%d |%d| d %2d s %2d pid %d\n", pc.l, pc.p, pc.di, pc.si, pid) + pc.p += 2 + pc.si = 0 + pc.di = 0 + } + a.cursors[pc.l] = pc + + if !moved { + break + } + } + } + + if a.trace { + fmt.Printf("ncount=%d ∂%.5f\n", a.N, float64(a.N-a.K)/float64(a.N)) + } +} + +func (a *btAlloc) bsKey(x []byte, l, r uint64) (*Cursor, error) { + for l <= r { + di := (l + r) >> 1 + + mk, value, err := a.dataLookup(di) + a.naccess++ + + cmp := bytes.Compare(mk, x) + switch { + case err != nil: + return nil, err + case cmp == 0: + return a.newCursor(context.TODO(), mk, value, di), nil + case cmp == -1: + l = di + 1 + default: + r = di + } + if l == r { + break + } + } + k, v, err := a.dataLookup(l) + if err != nil { + return nil, fmt.Errorf("key >= %x was not found at pos %d", x, l) + } + return a.newCursor(context.TODO(), k, v, l), nil +} + +func (a *btAlloc) bsNode(i, l, r uint64, x []byte) (n node, lm int64, rm int64) { + n, lm, rm = node{}, -1, -1 + + for l < r { + m := (l + r) >> 1 + + n = a.nodes[i][m] + a.naccess++ + + cmp := bytes.Compare(n.key, x) + switch { + case cmp == 0: + return n, int64(m), int64(m) + case cmp > 0: + r = m + rm = int64(m) + case cmp < 0: + lm = int64(m) + l = m + 1 + default: + panic(fmt.Errorf("compare error %d, %x ? 
%x", cmp, n.key, x)) + } + } + return n, lm, rm +} + +// find position of key with node.di <= d at level lvl +func (a *btAlloc) seekLeast(lvl, d uint64) uint64 { + for i, node := range a.nodes[lvl] { + if node.d >= d { + return uint64(i) + } + } + return uint64(len(a.nodes[lvl])) +} + +func (a *btAlloc) Seek(ik []byte) (*Cursor, error) { + if a.trace { + fmt.Printf("seek key %x\n", ik) + } + + var ( + lm, rm int64 + L, R = uint64(0), uint64(len(a.nodes[0]) - 1) + minD, maxD = uint64(0), a.K + ln node + ) + + for l, level := range a.nodes { + if len(level) == 1 && l == 0 { + ln = a.nodes[0][0] + maxD = ln.d + break + } + ln, lm, rm = a.bsNode(uint64(l), L, R, ik) + if ln.key == nil { // should return node which is nearest to key from the left so never nil + if a.trace { + fmt.Printf("found nil key %x pos_range[%d-%d] naccess_ram=%d\n", l, lm, rm, a.naccess) + } + panic(fmt.Errorf("bt index nil node at level %d", l)) + } + + switch bytes.Compare(ln.key, ik) { + case 1: // key > ik + maxD = ln.d + case -1: // key < ik + minD = ln.d + case 0: + if a.trace { + fmt.Printf("found key %x v=%x naccess_ram=%d\n", ik, ln.val /*level[m].d,*/, a.naccess) + } + return a.newCursor(context.TODO(), common.Copy(ln.key), common.Copy(ln.val), ln.d), nil + } + + if rm-lm == 1 { + break + } + if lm >= 0 { + minD = a.nodes[l][lm].d + L = level[lm].fc + } else if l+1 != len(a.nodes) { + L = a.seekLeast(uint64(l+1), minD) + if L == uint64(len(a.nodes[l+1])) { + L-- + } + } + if rm >= 0 { + maxD = a.nodes[l][rm].d + R = level[rm].fc + } else if l+1 != len(a.nodes) { + R = a.seekLeast(uint64(l+1), maxD) + if R == uint64(len(a.nodes[l+1])) { + R-- + } + } + + if a.trace { + fmt.Printf("range={%x d=%d p=%d} (%d, %d) L=%d naccess_ram=%d\n", ln.key, ln.d, ln.p, minD, maxD, l, a.naccess) + } + } + + a.naccess = 0 // reset count before actually go to disk + cursor, err := a.bsKey(ik, minD, maxD) + if err != nil { + if a.trace { + fmt.Printf("key %x not found\n", ik) + } + return nil, err + } + + if a.trace { + fmt.Printf("finally found key %x v=%x naccess_disk=%d\n", cursor.key, cursor.value, a.naccess) + } + return cursor, nil +} + +func (a *btAlloc) fillSearchMx() { + for i, n := range a.nodes { + if a.trace { + fmt.Printf("D%d |%d| ", i, len(n)) + } + for j, s := range n { + if a.trace { + fmt.Printf("%d ", s.d) + } + if s.d >= a.K { + break + } + + kb, v, err := a.dataLookup(s.d) + if err != nil { + fmt.Printf("d %d not found %v\n", s.d, err) + } + a.nodes[i][j].key = common.Copy(kb) + a.nodes[i][j].val = common.Copy(v) + } + if a.trace { + fmt.Printf("\n") + } + } +} + +// deprecated +type BtIndexReader struct { + index *BtIndex +} + +func NewBtIndexReader(index *BtIndex) *BtIndexReader { + return &BtIndexReader{ + index: index, + } +} + +// Lookup wraps index Lookup +func (r *BtIndexReader) Lookup(key []byte) uint64 { + if r.index != nil { + return r.index.Lookup(key) + } + return 0 +} + +func (r *BtIndexReader) Lookup2(key1, key2 []byte) uint64 { + fk := make([]byte, 52) + copy(fk[:length.Addr], key1) + copy(fk[length.Addr:], key2) + + if r.index != nil { + return r.index.Lookup(fk) + } + return 0 +} + +func (r *BtIndexReader) Seek(x []byte) (*Cursor, error) { + if r.index != nil { + cursor, err := r.index.alloc.Seek(x) + if err != nil { + return nil, fmt.Errorf("seek key %x: %w", x, err) + } + return cursor, nil + } + return nil, fmt.Errorf("seek has been failed") +} + +func (r *BtIndexReader) Empty() bool { + return r.index.Empty() +} + +type BtIndexWriter struct { + built bool + lvl log.Lvl + maxOffset uint64 + 
prevOffset uint64 + minDelta uint64 + indexW *bufio.Writer + indexF *os.File + bucketCollector *etl.Collector // Collector that sorts by buckets + indexFileName string + indexFile string + tmpDir string + numBuf [8]byte + keyCount uint64 + etlBufLimit datasize.ByteSize + bytesPerRec int +} + +type BtIndexWriterArgs struct { + IndexFile string // File name where the index and the minimal perfect hash function will be written to + TmpDir string + KeyCount int + EtlBufLimit datasize.ByteSize +} + +const BtreeLogPrefix = "btree" + +// NewBtIndexWriter creates a new BtIndexWriter instance with given number of keys +// Typical bucket size is 100 - 2048, larger bucket sizes result in smaller representations of hash functions, at a cost of slower access +// salt parameters is used to randomise the hash function construction, to ensure that different Erigon instances (nodes) +// are likely to use different hash function, to collision attacks are unlikely to slow down any meaningful number of nodes at the same time +func NewBtIndexWriter(args BtIndexWriterArgs) (*BtIndexWriter, error) { + btw := &BtIndexWriter{lvl: log.LvlDebug} + btw.tmpDir = args.TmpDir + btw.indexFile = args.IndexFile + + _, fname := filepath.Split(btw.indexFile) + btw.indexFileName = fname + btw.etlBufLimit = args.EtlBufLimit + if btw.etlBufLimit == 0 { + btw.etlBufLimit = etl.BufferOptimalSize + } + + btw.bucketCollector = etl.NewCollector(BtreeLogPrefix+" "+fname, btw.tmpDir, etl.NewSortableBuffer(btw.etlBufLimit)) + btw.bucketCollector.LogLvl(log.LvlDebug) + + btw.maxOffset = 0 + return btw, nil +} + +// loadFuncBucket is required to satisfy the type etl.LoadFunc type, to use with collector.Load +func (btw *BtIndexWriter) loadFuncBucket(k, v []byte, _ etl.CurrentTableReader, _ etl.LoadNextFunc) error { + // k is the BigEndian encoding of the bucket number, and the v is the key that is assigned into that bucket + //if uint64(len(btw.vals)) >= btw.batchSizeLimit { + // if err := btw.drainBatch(); err != nil { + // return err + // } + //} + + // if _, err := btw.indexW.Write(k); err != nil { + // return err + // } + if _, err := btw.indexW.Write(v[8-btw.bytesPerRec:]); err != nil { + return err + } + + //btw.keys = append(btw.keys, binary.BigEndian.Uint64(k), binary.BigEndian.Uint64(k[8:])) + //btw.vals = append(btw.vals, binary.BigEndian.Uint64(v)) + return nil +} + +// Build has to be called after all the keys have been added, and it initiates the process +// of building the perfect hash function and writing index into a file +func (btw *BtIndexWriter) Build() error { + tmpIdxFilePath := btw.indexFile + ".tmp" + + if btw.built { + return fmt.Errorf("already built") + } + //if btw.keysAdded != btw.keyCount { + // return fmt.Errorf("expected keys %d, got %d", btw.keyCount, btw.keysAdded) + //} + var err error + if btw.indexF, err = os.Create(tmpIdxFilePath); err != nil { + return fmt.Errorf("create index file %s: %w", btw.indexFile, err) + } + defer btw.indexF.Sync() + defer btw.indexF.Close() + btw.indexW = bufio.NewWriterSize(btw.indexF, etl.BufIOSize) + defer btw.indexW.Flush() + + // Write number of keys + binary.BigEndian.PutUint64(btw.numBuf[:], btw.keyCount) + if _, err = btw.indexW.Write(btw.numBuf[:]); err != nil { + return fmt.Errorf("write number of keys: %w", err) + } + // Write number of bytes per index record + btw.bytesPerRec = (bits.Len64(btw.maxOffset) + 7) / 8 + if err = btw.indexW.WriteByte(byte(btw.bytesPerRec)); err != nil { + return fmt.Errorf("write bytes per record: %w", err) + } + + defer 
btw.bucketCollector.Close() + log.Log(btw.lvl, "[index] calculating", "file", btw.indexFileName) + if err := btw.bucketCollector.Load(nil, "", btw.loadFuncBucket, etl.TransformArgs{}); err != nil { + return err + } + + log.Log(btw.lvl, "[index] write", "file", btw.indexFileName) + btw.built = true + + _ = btw.indexW.Flush() + _ = btw.indexF.Sync() + _ = btw.indexF.Close() + _ = os.Rename(tmpIdxFilePath, btw.indexFile) + return nil +} + +func (btw *BtIndexWriter) Close() { + if btw.indexF != nil { + btw.indexF.Close() + } + if btw.bucketCollector != nil { + btw.bucketCollector.Close() + } + //if btw.offsetCollector != nil { + // btw.offsetCollector.Close() + //} +} + +func (btw *BtIndexWriter) AddKey(key []byte, offset uint64) error { + if btw.built { + return fmt.Errorf("cannot add keys after perfect hash function had been built") + } + + binary.BigEndian.PutUint64(btw.numBuf[:], offset) + if offset > btw.maxOffset { + btw.maxOffset = offset + } + if btw.keyCount > 0 { + delta := offset - btw.prevOffset + if btw.keyCount == 1 || delta < btw.minDelta { + btw.minDelta = delta + } + } + + if err := btw.bucketCollector.Collect(key[:], btw.numBuf[:]); err != nil { + return err + } + btw.keyCount++ + btw.prevOffset = offset + return nil +} + +type BtIndex struct { + alloc *btAlloc + mmapWin *[mmap.MaxMapSize]byte + mmapUnix []byte + data []byte + file *os.File + size int64 + modTime time.Time + filePath string + keyCount uint64 + bytesPerRec int + dataoffset uint64 + auxBuf []byte + decompressor *compress.Decompressor + getter *compress.Getter +} + +func CreateBtreeIndex(indexPath, dataPath string, M uint64) (*BtIndex, error) { + err := BuildBtreeIndex(dataPath, indexPath) + if err != nil { + return nil, err + } + return OpenBtreeIndex(indexPath, dataPath, M) +} + +func BuildBtreeIndexWithDecompressor(indexPath string, kv *compress.Decompressor) error { + args := BtIndexWriterArgs{ + IndexFile: indexPath, + TmpDir: filepath.Dir(indexPath), + } + + iw, err := NewBtIndexWriter(args) + if err != nil { + return err + } + + getter := kv.MakeGetter() + getter.Reset(0) + + key := make([]byte, 0, 64) + ks := make(map[int]int) + + var pos uint64 + emptys := 0 + for getter.HasNext() { + key, kp := getter.Next(key[:0]) + err = iw.AddKey(key[:], pos) + if err != nil { + return err + } + + pos = getter.Skip() + if pos-kp == 1 { + ks[len(key)]++ + emptys++ + } + } + //fmt.Printf("emptys %d %#+v\n", emptys, ks) + + if err := iw.Build(); err != nil { + return err + } + iw.Close() + return nil +} + +// Opens .kv at dataPath and generates index over it to file 'indexPath' +func BuildBtreeIndex(dataPath, indexPath string) error { + decomp, err := compress.NewDecompressor(dataPath) + if err != nil { + return err + } + + args := BtIndexWriterArgs{ + IndexFile: indexPath, + TmpDir: filepath.Dir(indexPath), + } + + iw, err := NewBtIndexWriter(args) + if err != nil { + return err + } + + getter := decomp.MakeGetter() + getter.Reset(0) + + key := make([]byte, 0, 64) + + var pos uint64 + for getter.HasNext() { + key, _ := getter.Next(key[:0]) + err = iw.AddKey(key[:], pos) + if err != nil { + return err + } + + pos = getter.Skip() + } + decomp.Close() + + if err := iw.Build(); err != nil { + return err + } + iw.Close() + return nil +} + +func OpenBtreeIndexWithDecompressor(indexPath string, M uint64, kv *compress.Decompressor) (*BtIndex, error) { + s, err := os.Stat(indexPath) + if err != nil { + return nil, err + } + + idx := &BtIndex{ + filePath: indexPath, + size: s.Size(), + modTime: s.ModTime(), + auxBuf: 
make([]byte, 64), + } + + idx.file, err = os.Open(indexPath) + if err != nil { + return nil, err + } + + if idx.mmapUnix, idx.mmapWin, err = mmap.Mmap(idx.file, int(idx.size)); err != nil { + return nil, err + } + idx.data = idx.mmapUnix[:idx.size] + + // Read number of keys and bytes per record + pos := 8 + idx.keyCount = binary.BigEndian.Uint64(idx.data[:pos]) + if idx.keyCount == 0 { + return idx, nil + } + idx.bytesPerRec = int(idx.data[pos]) + pos += 1 + + //p := (*[]byte)(unsafe.Pointer(&idx.data[pos])) + //l := int(idx.keyCount)*idx.bytesPerRec + (16 * int(idx.keyCount)) + + idx.getter = kv.MakeGetter() + + idx.alloc = newBtAlloc(idx.keyCount, M, false) + idx.alloc.dataLookup = idx.dataLookup + idx.dataoffset = uint64(pos) + idx.alloc.traverseDfs() + idx.alloc.fillSearchMx() + return idx, nil +} + +func OpenBtreeIndex(indexPath, dataPath string, M uint64) (*BtIndex, error) { + s, err := os.Stat(indexPath) + if err != nil { + return nil, err + } + + idx := &BtIndex{ + filePath: indexPath, + size: s.Size(), + modTime: s.ModTime(), + auxBuf: make([]byte, 64), + } + + idx.file, err = os.Open(indexPath) + if err != nil { + return nil, err + } + + if idx.mmapUnix, idx.mmapWin, err = mmap.Mmap(idx.file, int(idx.size)); err != nil { + return nil, err + } + idx.data = idx.mmapUnix[:idx.size] + + // Read number of keys and bytes per record + pos := 8 + idx.keyCount = binary.BigEndian.Uint64(idx.data[:pos]) + idx.bytesPerRec = int(idx.data[pos]) + pos += 1 + + // offset := int(idx.keyCount) * idx.bytesPerRec //+ (idx.keySize * int(idx.keyCount)) + // if offset < 0 { + // return nil, fmt.Errorf("offset is: %d which is below zero, the file: %s is broken", offset, indexPath) + // } + + //p := (*[]byte)(unsafe.Pointer(&idx.data[pos])) + //l := int(idx.keyCount)*idx.bytesPerRec + (16 * int(idx.keyCount)) + + idx.decompressor, err = compress.NewDecompressor(dataPath) + if err != nil { + idx.Close() + return nil, err + } + idx.getter = idx.decompressor.MakeGetter() + + idx.alloc = newBtAlloc(idx.keyCount, M, false) + idx.alloc.dataLookup = idx.dataLookup + idx.dataoffset = uint64(pos) + idx.alloc.traverseDfs() + idx.alloc.fillSearchMx() + return idx, nil +} + +func (b *BtIndex) dataLookup(di uint64) ([]byte, []byte, error) { + if b.keyCount < di { + return nil, nil, fmt.Errorf("ki is greater than key count in index") + } + + p := b.dataoffset + di*uint64(b.bytesPerRec) + if uint64(len(b.data)) < p+uint64(b.bytesPerRec) { + return nil, nil, fmt.Errorf("data lookup gone too far (%d after %d)", p+uint64(b.bytesPerRec)-uint64(len(b.data)), len(b.data)) + } + + offt := b.data[p : p+uint64(b.bytesPerRec)] + var aux [8]byte + copy(aux[8-len(offt):], offt) + + offset := binary.BigEndian.Uint64(aux[:]) + b.getter.Reset(offset) + if !b.getter.HasNext() { + return nil, nil, fmt.Errorf("pair %d not found", di) + } + + key, kp := b.getter.Next(nil) + + if !b.getter.HasNext() { + return nil, nil, fmt.Errorf("pair %d not found", di) + } + val, vp := b.getter.Next(nil) + _, _ = kp, vp + return key, val, nil +} + +func (b *BtIndex) Size() int64 { return b.size } + +func (b *BtIndex) ModTime() time.Time { return b.modTime } + +func (b *BtIndex) FilePath() string { return b.filePath } + +func (b *BtIndex) FileName() string { return path.Base(b.filePath) } + +func (b *BtIndex) Empty() bool { return b.keyCount == 0 } + +func (b *BtIndex) KeyCount() uint64 { return b.keyCount } + +func (b *BtIndex) Close() error { + if b == nil { + return nil + } + if err := mmap.Munmap(b.mmapUnix, b.mmapWin); err != nil { + return err + 
} + if err := b.file.Close(); err != nil { + return err + } + if b.decompressor != nil { + if err := b.decompressor.Close(); err != nil { + return err + } + } + return nil +} + +func (b *BtIndex) Seek(x []byte) (*Cursor, error) { + if b.alloc != nil { + cursor, err := b.alloc.Seek(x) + if err != nil { + return nil, fmt.Errorf("seek key %x: %w", x, err) + } + return cursor, nil + } + return nil, fmt.Errorf("seek has been failed") +} + +// deprecated +func (b *BtIndex) Lookup(key []byte) uint64 { + cursor, err := b.alloc.Seek(key) + if err != nil { + panic(err) + } + return binary.BigEndian.Uint64(cursor.value) +} + +func (b *BtIndex) OrdinalLookup(i uint64) *Cursor { + if i > b.alloc.K { + return nil + } + k, v, err := b.dataLookup(i) + if err != nil { + return nil + } + + return &Cursor{ + key: k, value: v, d: i, ix: b.alloc, + } +} diff --git a/state/domain.go b/state/domain.go index e4033f217..7a26907d8 100644 --- a/state/domain.go +++ b/state/domain.go @@ -27,10 +27,17 @@ import ( "path/filepath" "regexp" "strconv" + "strings" "sync/atomic" "time" "github.com/RoaringBitmap/roaring/roaring64" + btree2 "github.com/tidwall/btree" + atomic2 "go.uber.org/atomic" + "golang.org/x/sync/errgroup" + + "github.com/ledgerwatch/log/v3" + "github.com/ledgerwatch/erigon-lib/common" "github.com/ledgerwatch/erigon-lib/common/dir" "github.com/ledgerwatch/erigon-lib/compress" @@ -38,10 +45,6 @@ import ( "github.com/ledgerwatch/erigon-lib/kv/bitmapdb" "github.com/ledgerwatch/erigon-lib/recsplit" "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" - "github.com/ledgerwatch/log/v3" - btree2 "github.com/tidwall/btree" - atomic2 "go.uber.org/atomic" - "golang.org/x/sync/errgroup" ) var ( @@ -52,6 +55,7 @@ var ( type filesItem struct { decompressor *compress.Decompressor index *recsplit.Index + bindex *BtIndex startTxNum uint64 endTxNum uint64 @@ -95,11 +99,25 @@ func (i *filesItem) closeFilesAndRemove() { } i.index = nil } + if i.bindex != nil { + if err := i.bindex.Close(); err != nil { + log.Trace("close", "err", err, "file", i.bindex.FileName()) + } + if err := os.Remove(i.bindex.FilePath()); err != nil { + log.Trace("close", "err", err, "file", i.bindex.FileName()) + } + i.bindex = nil + } } type DomainStats struct { - MergesCount uint64 + MergesCount uint64 + LastCollationTook time.Duration + LastPruneTook time.Duration + LastFileBuildingTook time.Duration + HistoryQueries uint64 + TotalQueries uint64 EfSearchTime time.Duration DataSize uint64 IndexSize uint64 @@ -108,6 +126,7 @@ type DomainStats struct { func (ds *DomainStats) Accumulate(other DomainStats) { ds.HistoryQueries += other.HistoryQueries + ds.TotalQueries += other.TotalQueries ds.EfSearchTime += other.EfSearchTime ds.IndexSize += other.IndexSize ds.DataSize += other.DataSize @@ -126,7 +145,6 @@ type Domain struct { keysTable string // key -> invertedStep , invertedStep = ^(txNum / aggregationStep), Needs to be table with DupSort valsTable string // key + invertedStep -> values stats DomainStats - prefixLen int // Number of bytes in the keys that can be used for prefix iteration mergesCount uint64 } @@ -140,13 +158,11 @@ func NewDomain( historyValsTable string, settingsTable string, indexTable string, - prefixLen int, compressVals bool, ) (*Domain, error) { d := &Domain{ keysTable: keysTable, valsTable: valsTable, - prefixLen: prefixLen, files: btree2.NewBTreeGOptions[*filesItem](filesItemLess, btree2.Options{Degree: 128, NoLocks: false}), roFiles: *atomic2.NewPointer(&[]ctxItem{}), } @@ -158,10 +174,12 @@ func NewDomain( return d, nil } 
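+// Illustrative only (not part of this change): with prefixLen removed from NewDomain,
+// a domain is now constructed the way the updated tests do it, roughly:
+//
+//	d, err := NewDomain(dir, tmpdir, 16 /* aggregationStep */, "base", keysTable, valsTable,
+//		historyKeysTable, historyValsTable, settingsTable, indexTable, true /* compressVals */)
+//
+// (dir/tmpdir and the table names are placeholders; see testDbAndDomain in state/domain_test.go)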
+ func (d *Domain) StartWrites() { d.defaultDc = d.MakeContext() d.History.StartWrites() } + func (d *Domain) FinishWrites() { d.defaultDc.Close() d.History.FinishWrites() @@ -297,6 +315,14 @@ func (d *Domain) openFiles() (err error) { } totalKeys += item.index.KeyCount() } + if item.bindex == nil { + bidxPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.bt", d.filenameBase, fromStep, toStep)) + if item.bindex, err = OpenBtreeIndexWithDecompressor(bidxPath, 2048, item.decompressor); err != nil { + log.Debug("InvertedIndex.openFiles: %w, %s", err, bidxPath) + return false + } + //totalKeys += item.bindex.KeyCount() + } } return true }) @@ -328,7 +354,7 @@ func (d *Domain) closeWhatNotInList(fNames []string) { for _, item := range toDelete { if item.decompressor != nil { if err := item.decompressor.Close(); err != nil { - log.Trace("close", "err", err, "file", item.index.FileName()) + log.Trace("close", "err", err, "file", item.decompressor.FileName()) } item.decompressor = nil } @@ -338,6 +364,12 @@ func (d *Domain) closeWhatNotInList(fNames []string) { } item.index = nil } + if item.bindex != nil { + if err := item.bindex.Close(); err != nil { + log.Trace("close", "err", err, "file", item.bindex.FileName()) + } + item.bindex = nil + } d.files.Delete(item) } } @@ -391,6 +423,8 @@ func (d *Domain) Close() { func (dc *DomainContext) get(key []byte, fromTxNum uint64, roTx kv.Tx) ([]byte, bool, error) { //var invertedStep [8]byte + atomic.AddUint64(&dc.d.stats.TotalQueries, 1) + invertedStep := dc.numBuf binary.BigEndian.PutUint64(invertedStep[:], ^(fromTxNum / dc.d.aggregationStep)) keyCursor, err := roTx.CursorDupSort(dc.d.keysTable) @@ -575,7 +609,7 @@ type DomainContext struct { d *Domain files []ctxItem getters []*compress.Getter - readers []*recsplit.IndexReader + readers []*BtIndex hc *HistoryContext keyBuf [60]byte // 52b key and 8b for inverted step numBuf [8]byte @@ -592,17 +626,19 @@ func (dc *DomainContext) statelessGetter(i int) *compress.Getter { } return r } -func (dc *DomainContext) statelessIdxReader(i int) *recsplit.IndexReader { + +func (dc *DomainContext) statelessBtree(i int) *BtIndex { if dc.readers == nil { - dc.readers = make([]*recsplit.IndexReader, len(dc.files)) + dc.readers = make([]*BtIndex, len(dc.files)) } r := dc.readers[i] if r == nil { - r = recsplit.NewIndexReader(dc.files[i].src.index) + r = dc.files[i].src.bindex dc.readers[i] = r } return r } + func (d *Domain) collectFilesStats() (datsz, idxsz, files uint64) { d.History.files.Walk(func(items []*filesItem) bool { for _, item := range items { @@ -623,7 +659,8 @@ func (d *Domain) collectFilesStats() (datsz, idxsz, files uint64) { } datsz += uint64(item.decompressor.Size()) idxsz += uint64(item.index.Size()) - files += 2 + idxsz += uint64(item.bindex.Size()) + files += 3 } return true }) @@ -665,14 +702,12 @@ func (dc *DomainContext) Close() { } // IteratePrefix iterates over key-value pairs of the domain that start with given prefix -// The length of the prefix has to match the `prefixLen` parameter used to create the domain // Such iteration is not intended to be used in public API, therefore it uses read-write transaction // inside the domain. Another version of this for public API use needs to be created, that uses // roTx instead and supports ending the iterations before it reaches the end. 
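+// A minimal usage sketch (illustrative, not part of this change):
+//
+//	dc := d.MakeContext()
+//	defer dc.Close()
+//	err := dc.IteratePrefix([]byte("prefix"), func(k, v []byte) {
+//		fmt.Printf("%x => %x\n", k, v)
+//	})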
func (dc *DomainContext) IteratePrefix(prefix []byte, it func(k, v []byte)) error { - if len(prefix) != dc.d.prefixLen { - return fmt.Errorf("wrong prefix length, this %s domain supports prefixLen %d, given [%x]", dc.d.filenameBase, dc.d.prefixLen, prefix) - } + atomic.AddUint64(&dc.d.stats.HistoryQueries, 1) + var cp CursorHeap heap.Init(&cp) var k, v []byte @@ -697,26 +732,21 @@ func (dc *DomainContext) IteratePrefix(prefix []byte, it func(k, v []byte)) erro heap.Push(&cp, &CursorItem{t: DB_CURSOR, key: common.Copy(k), val: common.Copy(v), c: keysCursor, endTxNum: txNum, reverse: true}) } for i, item := range dc.files { - reader := dc.statelessIdxReader(i) - if reader.Empty() { + bg := dc.statelessBtree(i) + if bg.Empty() { continue } - offset := reader.Lookup(prefix) - // Creating dedicated getter because the one in the item may be used to delete storage, for example - g := dc.statelessGetter(i) - g.Reset(offset) - if g.HasNext() { - if keyMatch, _ := g.Match(prefix); !keyMatch { - continue - } - g.Skip() + + cursor, err := bg.Seek(prefix) + if err != nil { + continue } - if g.HasNext() { - key, _ := g.Next(nil) - if bytes.HasPrefix(key, prefix) { - val, _ := g.Next(nil) - heap.Push(&cp, &CursorItem{t: FILE_CURSOR, key: key, val: val, dg: g, endTxNum: item.endTxNum, reverse: true}) - } + + g := dc.statelessGetter(i) + key := cursor.Key() + if bytes.HasPrefix(key, prefix) { + val := cursor.Value() + heap.Push(&cp, &CursorItem{t: FILE_CURSOR, key: key, val: val, dg: g, endTxNum: item.endTxNum, reverse: true}) } } for cp.Len() > 0 { @@ -785,10 +815,143 @@ func (c Collation) Close() { } } +type kvpair struct { + k, v []byte +} + +func (d *Domain) collator(valuesComp *compress.Compressor, pairs chan kvpair) (count int, err error) { + for kv := range pairs { + if err = valuesComp.AddUncompressedWord(kv.k); err != nil { + return count, fmt.Errorf("add %s values key [%x]: %w", d.filenameBase, kv.k, err) + } + count++ // Only counting keys, not values + if err = valuesComp.AddUncompressedWord(kv.v); err != nil { + return count, fmt.Errorf("add %s values val [%x]=>[%x]: %w", d.filenameBase, kv.k, kv.v, err) + } + } + return count, nil +} + +// collate gathers domain changes over the specified step, using read-only transaction, +// and returns compressors, elias fano, and bitmaps +// [txFrom; txTo) +func (d *Domain) collateStream(ctx context.Context, step, txFrom, txTo uint64, roTx kv.Tx, logEvery *time.Ticker) (Collation, error) { + started := time.Now() + defer func() { + d.stats.LastCollationTook = time.Since(started) + }() + + hCollation, err := d.History.collate(step, txFrom, txTo, roTx, logEvery) + if err != nil { + return Collation{}, err + } + + var valuesComp *compress.Compressor + closeComp := true + defer func() { + if closeComp { + if valuesComp != nil { + valuesComp.Close() + } + } + }() + + valuesPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, step, step+1)) + if valuesComp, err = compress.NewCompressor(context.Background(), "collate values", valuesPath, d.tmpdir, compress.MinPatternScore, 1, log.LvlTrace); err != nil { + return Collation{}, fmt.Errorf("create %s values compressor: %w", d.filenameBase, err) + } + + keysCursor, err := roTx.CursorDupSort(d.keysTable) + if err != nil { + return Collation{}, fmt.Errorf("create %s keys cursor: %w", d.filenameBase, err) + } + defer keysCursor.Close() + + var ( + k, v []byte + pos uint64 + valCount uint + pairs = make(chan kvpair, 4) + ) + + totalKeys, err := keysCursor.Count() + if err != nil { + return Collation{}, 
fmt.Errorf("failed to obtain keys count for domain %q", d.filenameBase) + } + + eg, ctx := errgroup.WithContext(ctx) + eg.Go(func() error { + count, err := d.collator(valuesComp, pairs) + if err != nil { + return err + } + valCount = uint(count) + return nil + }) + + for k, _, err = keysCursor.First(); err == nil && k != nil; k, _, err = keysCursor.NextNoDup() { + pos++ + + select { + case <-logEvery.C: + log.Info("[snapshots] collate domain", "name", d.filenameBase, + "range", fmt.Sprintf("%.2f-%.2f", float64(txFrom)/float64(d.aggregationStep), float64(txTo)/float64(d.aggregationStep)), + "progress", fmt.Sprintf("%.2f%%", float64(pos)/float64(totalKeys)*100)) + case <-ctx.Done(): + log.Warn("[snapshots] collate domain cancelled", "name", d.filenameBase, "err", ctx.Err()) + close(pairs) + + return Collation{}, err + default: + } + + if v, err = keysCursor.LastDup(); err != nil { + return Collation{}, fmt.Errorf("find last %s key for aggregation step k=[%x]: %w", d.filenameBase, k, err) + } + s := ^binary.BigEndian.Uint64(v) + if s == step { + keySuffix := make([]byte, len(k)+8) + copy(keySuffix, k) + copy(keySuffix[len(k):], v) + + v, err := roTx.GetOne(d.valsTable, keySuffix) + if err != nil { + return Collation{}, fmt.Errorf("find last %s value for aggregation step k=[%x]: %w", d.filenameBase, k, err) + } + + pairs <- kvpair{k: k, v: v} + } + } + close(pairs) + if err != nil { + return Collation{}, fmt.Errorf("iterate over %s keys cursor: %w", d.filenameBase, err) + } + + if err := eg.Wait(); err != nil { + return Collation{}, fmt.Errorf("collate over %s keys cursor: %w", d.filenameBase, err) + } + + closeComp = false + return Collation{ + valuesPath: valuesPath, + valuesComp: valuesComp, + valuesCount: int(valCount), + historyPath: hCollation.historyPath, + historyComp: hCollation.historyComp, + historyCount: hCollation.historyCount, + indexBitmaps: hCollation.indexBitmaps, + }, nil +} + // collate gathers domain changes over the specified step, using read-only transaction, // and returns compressors, elias fano, and bitmaps // [txFrom; txTo) func (d *Domain) collate(ctx context.Context, step, txFrom, txTo uint64, roTx kv.Tx, logEvery *time.Ticker) (Collation, error) { + started := time.Now() + defer func() { + d.stats.LastCollationTook = time.Since(started) + }() + hCollation, err := d.History.collate(step, txFrom, txTo, roTx, logEvery) if err != nil { return Collation{}, err @@ -814,7 +977,6 @@ func (d *Domain) collate(ctx context.Context, step, txFrom, txTo uint64, roTx kv defer keysCursor.Close() var ( - prefix []byte // Track prefix to insert it before entries k, v []byte pos uint64 valuesCount uint @@ -849,16 +1011,6 @@ func (d *Domain) collate(ctx context.Context, step, txFrom, txTo uint64, roTx kv if err != nil { return Collation{}, fmt.Errorf("find last %s value for aggregation step k=[%x]: %w", d.filenameBase, k, err) } - if d.prefixLen > 0 && (prefix == nil || !bytes.HasPrefix(k, prefix)) { - prefix = append(prefix[:0], k[:d.prefixLen]...) 
- if err = valuesComp.AddUncompressedWord(prefix); err != nil { - return Collation{}, fmt.Errorf("add %s values prefix [%x]: %w", d.filenameBase, prefix, err) - } - if err = valuesComp.AddUncompressedWord(nil); err != nil { - return Collation{}, fmt.Errorf("add %s values prefix val [%x]: %w", d.filenameBase, prefix, err) - } - valuesCount++ - } if err = valuesComp.AddUncompressedWord(k); err != nil { return Collation{}, fmt.Errorf("add %s values key [%x]: %w", d.filenameBase, k, err) } @@ -886,6 +1038,7 @@ func (d *Domain) collate(ctx context.Context, step, txFrom, txTo uint64, roTx kv type StaticFiles struct { valuesDecomp *compress.Decompressor valuesIdx *recsplit.Index + valuesBt *BtIndex historyDecomp *compress.Decompressor historyIdx *recsplit.Index efHistoryDecomp *compress.Decompressor @@ -899,6 +1052,9 @@ func (sf StaticFiles) Close() { if sf.valuesIdx != nil { sf.valuesIdx.Close() } + if sf.valuesBt != nil { + sf.valuesBt.Close() + } if sf.historyDecomp != nil { sf.historyDecomp.Close() } @@ -949,16 +1105,28 @@ func (d *Domain) buildFiles(ctx context.Context, step uint64, collation Collatio } valuesComp.Close() valuesComp = nil + if valuesDecomp, err = compress.NewDecompressor(collation.valuesPath); err != nil { return StaticFiles{}, fmt.Errorf("open %s values decompressor: %w", d.filenameBase, err) } if valuesIdx, err = buildIndexThenOpen(ctx, valuesDecomp, valuesIdxPath, d.tmpdir, collation.valuesCount, false); err != nil { return StaticFiles{}, fmt.Errorf("build %s values idx: %w", d.filenameBase, err) } + + btPath := strings.TrimSuffix(valuesIdxPath, "kvi") + "bt" + if err := BuildBtreeIndexWithDecompressor(btPath, valuesDecomp); err != nil { + return StaticFiles{}, fmt.Errorf("build %s values bt idx: %w", d.filenameBase, err) + } + bt, err := OpenBtreeIndexWithDecompressor(btPath, 2048, valuesDecomp) + if err != nil { + return StaticFiles{}, fmt.Errorf("open %s values bt idx: %w", d.filenameBase, err) + } + + closeComp = false return StaticFiles{ valuesDecomp: valuesDecomp, valuesIdx: valuesIdx, + valuesBt: bt, historyDecomp: hStaticFiles.historyDecomp, historyIdx: hStaticFiles.historyIdx, efHistoryDecomp: hStaticFiles.efHistoryDecomp, @@ -1063,20 +1231,26 @@ func (d *Domain) integrateFiles(sf StaticFiles, txNumFrom, txNumTo uint64) { endTxNum: txNumTo, decompressor: sf.valuesDecomp, index: sf.valuesIdx, + bindex: sf.valuesBt, }) + //d.defaultDc.Close() + //d.defaultDc = d.MakeContext() d.reCalcRoFiles() } // [txFrom; txTo) func (d *Domain) prune(ctx context.Context, step uint64, txFrom, txTo, limit uint64, logEvery *time.Ticker) error { - // It is important to clean up tables in a specific order - // First keysTable, because it is the first one access in the `get` function, i.e.
if the record is canDelete from there, other tables will not be accessed + start := time.Now() + defer func() { + d.stats.LastPruneTook = time.Since(start) + }() + keysCursor, err := d.tx.RwCursorDupSort(d.keysTable) if err != nil { return fmt.Errorf("%s keys cursor: %w", d.filenameBase, err) } defer keysCursor.Close() - var k, v []byte + var k, v, stepBytes []byte keyMaxSteps := make(map[string]uint64) for k, v, err = keysCursor.First(); err == nil && k != nil; k, v, err = keysCursor.Next() { @@ -1087,50 +1261,56 @@ func (d *Domain) prune(ctx context.Context, step uint64, txFrom, txTo, limit uin log.Warn("[snapshots] prune domain cancelled", "name", d.filenameBase, "err", ctx.Err()) return err default: - } - - s := ^binary.BigEndian.Uint64(v) - if maxS, seen := keyMaxSteps[string(k)]; !seen || s > maxS { - keyMaxSteps[string(k)] = s - } - } - if err != nil { - return fmt.Errorf("iterate of %s keys: %w", d.filenameBase, err) - } - - for k, v, err = keysCursor.First(); err == nil && k != nil; k, v, err = keysCursor.Next() { - select { - case <-logEvery.C: - log.Info("[snapshots] prune domain", "name", d.filenameBase, "stage", "prune keys", "range", fmt.Sprintf("%.2f-%.2f", float64(txFrom)/float64(d.aggregationStep), float64(txTo)/float64(d.aggregationStep))) - case <-ctx.Done(): - log.Warn("[snapshots] prune domain cancelled", "name", d.filenameBase, "err", ctx.Err()) - return err - default: - } - - s := ^binary.BigEndian.Uint64(v) - if s == step { - if maxS := keyMaxSteps[string(k)]; maxS <= step { - continue - } - if err = keysCursor.DeleteCurrent(); err != nil { - return fmt.Errorf("clean up %s for [%x]=>[%x]: %w", d.filenameBase, k, v, err) + s := ^binary.BigEndian.Uint64(v) + if maxS, seen := keyMaxSteps[string(k)]; !seen || s > maxS { + keyMaxSteps[string(k)] = s } - - if bytes.HasPrefix(k, keyCommitmentState) { - fmt.Printf("domain prune key %x [s%d] txn=%d\n", string(k), s, ^binary.BigEndian.Uint64(v)) + if len(stepBytes) == 0 && step == s { + stepBytes = common.Copy(v) } } } if err != nil { return fmt.Errorf("iterate of %s keys: %w", d.filenameBase, err) } + + // It is important to clean up tables in a specific order + // First keysTable, because it is the first one access in the `get` function, i.e. 
if the record is deleted from there, other tables will not be accessed + //for k, v := range keyMaxSteps { + // + //} + //for k, v, err = keysCursor.First(); err == nil && k != nil; k, v, err = keysCursor.Next() { + // select { + // case <-logEvery.C: + // log.Info("[snapshots] prune domain", "name", d.filenameBase, "stage", "prune keys", "range", fmt.Sprintf("%.2f-%.2f", float64(txFrom)/float64(d.aggregationStep), float64(txTo)/float64(d.aggregationStep))) + // case <-ctx.Done(): + // log.Warn("[snapshots] prune domain cancelled", "name", d.filenameBase, "err", ctx.Err()) + // return err + // default: + // if bytes.Equal(stepBytes, v) { + // if maxS := keyMaxSteps[string(k)]; maxS <= step { + // continue + // } + // if err = keysCursor.DeleteCurrent(); err != nil { + // return fmt.Errorf("clean up %s for [%x]=>[%x]: %w", d.filenameBase, k, v, err) + // } + // } + // } + //} + //if err != nil { + // return fmt.Errorf("iterate of %s keys: %w", d.filenameBase, err) + //} var valsCursor kv.RwCursor if valsCursor, err = d.tx.RwCursor(d.valsTable); err != nil { return fmt.Errorf("%s vals cursor: %w", d.filenameBase, err) } defer valsCursor.Close() - for k, _, err = valsCursor.First(); err == nil && k != nil; k, _, err = valsCursor.Next() { + //for k, _, err = valsCursor.First(); err == nil && k != nil; k, _, err = valsCursor.Next() { + for k, s := range keyMaxSteps { + if s <= step { + continue + } + select { case <-logEvery.C: log.Info("[snapshots] prune domain", "name", d.filenameBase, "stage", "prune values", "range", fmt.Sprintf("%.2f-%.2f", float64(txFrom)/float64(d.aggregationStep), float64(txTo)/float64(d.aggregationStep))) @@ -1138,16 +1318,17 @@ func (d *Domain) prune(ctx context.Context, step uint64, txFrom, txTo, limit uin log.Warn("[snapshots] prune domain cancelled", "name", d.filenameBase, "err", ctx.Err()) return err default: - } - s := ^binary.BigEndian.Uint64(k[len(k)-8:]) - if s == step { - if maxS := keyMaxSteps[string(k[:len(k)-8])]; maxS <= step { - continue + //if bytes.Equal(stepBytes, k[len(k)-8:]) { + //if maxS := keyMaxSteps[string(k[len(k)-8:])]; maxS <= step { + // continue + //} + if err = keysCursor.DeleteExact([]byte(k), stepBytes); err != nil { + return fmt.Errorf("clean up key %s for [%x]: %w", d.filenameBase, k, err) } - if err = valsCursor.DeleteCurrent(); err != nil { + if err = valsCursor.Delete([]byte(k)); err != nil { return fmt.Errorf("clean up %s for [%x]: %w", d.filenameBase, k, err) } - //fmt.Printf("domain prune value for %x (invs %x) [s%d]\n", string(k),k[len(k)-8):], s) + //} } } if err != nil { @@ -1214,19 +1395,20 @@ func (dc *DomainContext) readFromFiles(filekey []byte, fromTxNum uint64) ([]byte if dc.files[i].endTxNum < fromTxNum { break } - reader := dc.statelessIdxReader(i) + reader := dc.statelessBtree(i) if reader.Empty() { continue } - offset := reader.Lookup(filekey) - g := dc.statelessGetter(i) - g.Reset(offset) - if g.HasNext() { - if keyMatch, _ := g.Match(filekey); keyMatch { - val, _ = g.Next(nil) - found = true - break - } + cur, err := reader.Seek(filekey) + if err != nil { + log.Warn("failed to read from file", "file", reader.FileName(), "err", err) + continue + } + + if bytes.Equal(cur.Key(), filekey) { + val = cur.Value() + found = true + break } } return val, found @@ -1235,6 +1417,8 @@ func (dc *DomainContext) readFromFiles(filekey []byte, fromTxNum uint64) ([]byte // historyBeforeTxNum searches history for a value of specified key before txNum // second return value is true if the value is found in the history (even if it is 
nil) func (dc *DomainContext) historyBeforeTxNum(key []byte, txNum uint64, roTx kv.Tx) ([]byte, bool, error) { + atomic.AddUint64(&dc.d.stats.HistoryQueries, 1) + var foundTxNum uint64 var foundEndTxNum uint64 var foundStartTxNum uint64 @@ -1282,22 +1466,19 @@ func (dc *DomainContext) historyBeforeTxNum(key []byte, txNum uint64, roTx kv.Tx if dc.files[i].startTxNum > topState.startTxNum { continue } - reader := dc.statelessIdxReader(i) + reader := dc.statelessBtree(i) if reader.Empty() { continue } - offset := reader.Lookup(key) - g := dc.statelessGetter(i) - g.Reset(offset) - if g.HasNext() { - if k, _ := g.NextUncompressed(); bytes.Equal(k, key) { - if dc.d.compressVals { - val, _ = g.Next(nil) - } else { - val, _ = g.NextUncompressed() - } - break - } + cur, err := reader.Seek(key) + if err != nil { + log.Warn("failed to read history before from file", "key", key, "err", err) + continue + } + + if bytes.Equal(cur.Key(), key) { + val = cur.Value() + break } } return val, true, nil diff --git a/state/domain_committed.go b/state/domain_committed.go index 1045293a8..7f44fa907 100644 --- a/state/domain_committed.go +++ b/state/domain_committed.go @@ -24,6 +24,7 @@ import ( "fmt" "hash" "path/filepath" + "strings" "github.com/google/btree" "github.com/ledgerwatch/log/v3" @@ -33,7 +34,6 @@ import ( "github.com/ledgerwatch/erigon-lib/common" "github.com/ledgerwatch/erigon-lib/common/length" "github.com/ledgerwatch/erigon-lib/compress" - "github.com/ledgerwatch/erigon-lib/recsplit" ) // Defines how to evaluate commitments @@ -45,6 +45,19 @@ const ( CommitmentModeUpdate CommitmentMode = 2 ) +func (m CommitmentMode) String() string { + switch m { + case CommitmentModeDisabled: + return "disabled" + case CommitmentModeDirect: + return "direct" + case CommitmentModeUpdate: + return "update" + default: + return "unknown" + } +} + type ValueMerger func(prev, current []byte) (merged []byte, err error) type DomainCommitted struct { @@ -53,15 +66,14 @@ type DomainCommitted struct { trace bool commTree *btree.BTreeG[*CommitmentItem] keccak hash.Hash - patriciaTrie *commitment.HexPatriciaHashed - keyReplaceFn ValueMerger // defines logic performed with stored values during files merge + patriciaTrie commitment.Trie branchMerger *commitment.BranchMerger } -func NewCommittedDomain(d *Domain, mode CommitmentMode) *DomainCommitted { +func NewCommittedDomain(d *Domain, mode CommitmentMode, trieVariant commitment.TrieVariant) *DomainCommitted { return &DomainCommitted{ Domain: d, - patriciaTrie: commitment.NewHexPatriciaHashed(length.Addr, nil, nil, nil), + patriciaTrie: commitment.InitializeTrie(trieVariant), commTree: btree.NewG[*CommitmentItem](32, commitmentItemLess), keccak: sha3.NewLegacyKeccak256(), mode: mode, @@ -69,8 +81,6 @@ func NewCommittedDomain(d *Domain, mode CommitmentMode) *DomainCommitted { } } -func (d *DomainCommitted) SetKeyReplacer(vm ValueMerger) { d.keyReplaceFn = vm } - func (d *DomainCommitted) SetCommitmentMode(m CommitmentMode) { d.mode = m } // TouchPlainKey marks plainKey as updated and applies different fn for different key types @@ -192,9 +202,17 @@ func (d *DomainCommitted) hashAndNibblizeKey(key []byte) []byte { } func (d *DomainCommitted) storeCommitmentState(blockNum, txNum uint64) error { - state, err := d.patriciaTrie.EncodeCurrentState(nil) - if err != nil { - return err + var state []byte + var err error + + switch trie := (d.patriciaTrie).(type) { + case *commitment.HexPatriciaHashed: + state, err = trie.EncodeCurrentState(nil) + if err != nil { + return err + } + 
default: + return fmt.Errorf("unsupported state storing for patricia trie type: %T", d.patriciaTrie) } cs := &commitmentState{txNum: txNum, trieState: state, blockNum: blockNum} encoded, err := cs.Encode() @@ -216,54 +234,44 @@ func (d *DomainCommitted) replaceKeyWithReference(fullKey, shortKey []byte, type numBuf := [2]byte{} var found bool for _, item := range list { - g := item.decompressor.MakeGetter() - index := recsplit.NewIndexReader(item.index) + //g := item.decompressor.MakeGetter() + //index := recsplit.NewIndexReader(item.index) - offset := index.Lookup(fullKey) - g.Reset(offset) - if !g.HasNext() { + cur, err := item.bindex.Seek(fullKey) + if err != nil { continue } - if keyMatch, _ := g.Match(fullKey); keyMatch { - step := uint16(item.endTxNum / d.aggregationStep) - binary.BigEndian.PutUint16(numBuf[:], step) + step := uint16(item.endTxNum / d.aggregationStep) + binary.BigEndian.PutUint16(numBuf[:], step) - shortKey = encodeU64(offset, numBuf[:]) + shortKey = encodeU64(cur.Ordinal(), numBuf[:]) - if d.trace { - fmt.Printf("replacing %s [%x] => {%x} [step=%d, offset=%d, file=%s.%d-%d]\n", typeAS, fullKey, shortKey, step, offset, typeAS, item.startTxNum, item.endTxNum) - } - found = true - break + if d.trace { + fmt.Printf("replacing %s [%x] => {%x} [step=%d, offset=%d, file=%s.%d-%d]\n", typeAS, fullKey, shortKey, step, cur.Ordinal(), typeAS, item.startTxNum, item.endTxNum) } + found = true + break } + //if !found { + // log.Warn("bt index key replacement seek failed", "key", fmt.Sprintf("%x", fullKey)) + //} return found } +// nolint func (d *DomainCommitted) lookupShortenedKey(shortKey, fullKey []byte, typAS string, list []*filesItem) bool { fileStep, offset := shortenedKey(shortKey) expected := uint64(fileStep) * d.aggregationStep - var size uint64 - switch typAS { - case "account": - size = length.Addr - case "storage": - size = length.Addr + length.Hash - default: - return false - } var found bool for _, item := range list { if item.startTxNum > expected || item.endTxNum < expected { continue } - g := item.decompressor.MakeGetter() - if uint64(g.Size()) <= offset+size { - continue - } - g.Reset(offset) - fullKey, _ = g.Next(fullKey[:0]) + + cur := item.bindex.OrdinalLookup(offset) + //nolint + fullKey = cur.Key() if d.trace { fmt.Printf("offsetToKey %s [%x]=>{%x} step=%d offset=%d, file=%s.%d-%d.kv\n", typAS, fullKey, shortKey, fileStep, offset, typAS, item.startTxNum, item.endTxNum) } @@ -348,6 +356,9 @@ func (d *DomainCommitted) mergeFiles(ctx context.Context, oldFiles SelectedStati if indexIn.index != nil { indexIn.index.Close() } + if indexIn.bindex != nil { + indexIn.bindex.Close() + } } if historyIn != nil { if historyIn.decompressor != nil { @@ -356,6 +367,9 @@ func (d *DomainCommitted) mergeFiles(ctx context.Context, oldFiles SelectedStati if historyIn.index != nil { historyIn.index.Close() } + if historyIn.bindex != nil { + historyIn.bindex.Close() + } } if valuesIn != nil { if valuesIn.decompressor != nil { @@ -364,6 +378,9 @@ func (d *DomainCommitted) mergeFiles(ctx context.Context, oldFiles SelectedStati if valuesIn.index != nil { valuesIn.index.Close() } + if valuesIn.bindex != nil { + valuesIn.bindex.Close() + } } } }() @@ -433,38 +450,10 @@ func (d *DomainCommitted) mergeFiles(ctx context.Context, oldFiles SelectedStati heap.Pop(&cp) } } - var skip bool - if d.prefixLen > 0 { - skip = r.valuesStartTxNum == 0 && len(lastVal) == 0 && len(lastKey) != d.prefixLen - } else { - // For the rest of types, empty value means deletion - skip = r.valuesStartTxNum == 0 
&& len(lastVal) == 0 - } + // For the rest of types, empty value means deletion + skip := r.valuesStartTxNum == 0 && len(lastVal) == 0 + //} if !skip { - if keyBuf != nil && (d.prefixLen == 0 || len(keyBuf) != d.prefixLen || bytes.HasPrefix(lastKey, keyBuf)) { - if err = comp.AddUncompressedWord(keyBuf); err != nil { - return nil, nil, nil, err - } - keyCount++ // Only counting keys, not values - - if d.trace { - fmt.Printf("merge: multi-way key %x, total keys %d\n", keyBuf, keyCount) - } - - valBuf, err = d.commitmentValTransform(&oldFiles, &mergedFiles, valBuf) - if err != nil { - return nil, nil, nil, fmt.Errorf("merge: valTransform [%x] %w", valBuf, err) - } - if d.compressVals { - if err = comp.AddWord(valBuf); err != nil { - return nil, nil, nil, err - } - } else { - if err = comp.AddUncompressedWord(valBuf); err != nil { - return nil, nil, nil, err - } - } - } keyBuf = append(keyBuf[:0], lastKey...) valBuf = append(valBuf[:0], lastVal...) } @@ -502,6 +491,18 @@ func (d *DomainCommitted) mergeFiles(ctx context.Context, oldFiles SelectedStati if valuesIn.index, err = buildIndexThenOpen(ctx, valuesIn.decompressor, idxPath, d.dir, keyCount, false /* values */); err != nil { return nil, nil, nil, fmt.Errorf("merge %s buildIndex [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) } + + btPath := strings.TrimSuffix(idxPath, "kvi") + "bt" + err = BuildBtreeIndexWithDecompressor(btPath, valuesIn.decompressor) + if err != nil { + return nil, nil, nil, fmt.Errorf("merge %s btindex [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) + } + + bt, err := OpenBtreeIndexWithDecompressor(btPath, 2048, valuesIn.decompressor) + if err != nil { + return nil, nil, nil, fmt.Errorf("merge %s btindex2 [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) + } + valuesIn.bindex = bt } closeItem = false d.stats.MergesCount++ @@ -543,14 +544,21 @@ var keyCommitmentState = []byte("state") // SeekCommitment searches for last encoded state from DomainCommitted // and if state found, sets it up to current domain func (d *DomainCommitted) SeekCommitment(aggStep, sinceTx uint64) (uint64, error) { + if d.patriciaTrie.Variant() != commitment.VariantHexPatriciaTrie { + return 0, fmt.Errorf("state storing is only supported hex patricia trie") + } + // todo add support of bin state dumping + var ( latestState []byte stepbuf [2]byte - step uint16 = uint16(sinceTx/aggStep) - 1 + step = uint16(sinceTx/aggStep) - 1 latestTxNum uint64 = sinceTx - 1 ) + d.SetTxNum(latestTxNum) ctx := d.MakeContext() + defer ctx.Close() for { binary.BigEndian.PutUint16(stepbuf[:], step) @@ -567,7 +575,7 @@ func (d *DomainCommitted) SeekCommitment(aggStep, sinceTx uint64) (uint64, error break } latestTxNum, latestState = v, s - lookupTxN := latestTxNum + aggStep // - 1 + lookupTxN := latestTxNum + aggStep step = uint16(latestTxNum/aggStep) + 1 d.SetTxNum(lookupTxN) } @@ -577,9 +585,14 @@ func (d *DomainCommitted) SeekCommitment(aggStep, sinceTx uint64) (uint64, error return 0, nil } - if err := d.patriciaTrie.SetState(latest.trieState); err != nil { - return 0, err + if hext, ok := d.patriciaTrie.(*commitment.HexPatriciaHashed); ok { + if err := hext.SetState(latest.trieState); err != nil { + return 0, err + } + } else { + return 0, fmt.Errorf("state storing is only supported hex patricia trie") } + return latest.txNum, nil } diff --git a/state/domain_test.go b/state/domain_test.go index efd58128f..e14c34fc2 100644 --- a/state/domain_test.go +++ b/state/domain_test.go @@ -35,10 +35,9 @@ 
 import (
 	"github.com/ledgerwatch/erigon-lib/recsplit"
 )
 
-func testDbAndDomain(t *testing.T, prefixLen int) (string, kv.RwDB, *Domain) {
+func testDbAndDomain(t *testing.T) (string, kv.RwDB, *Domain) {
 	t.Helper()
 	path := t.TempDir()
-	t.Cleanup(func() { os.RemoveAll(path) })
 	logger := log.New()
 	keysTable := "Keys"
 	valsTable := "Vals"
@@ -57,17 +56,19 @@ func testDbAndDomain(t *testing.T, prefixLen int) (string, kv.RwDB, *Domain) {
 		}
 	}).MustOpen()
 	t.Cleanup(db.Close)
-	d, err := NewDomain(path, path, 16 /* aggregationStep */, "base" /* filenameBase */, keysTable, valsTable, historyKeysTable, historyValsTable, settingsTable, indexTable, prefixLen, true /* compressVals */)
+	d, err := NewDomain(path, path, 16 /* aggregationStep */, "base" /* filenameBase */, keysTable, valsTable, historyKeysTable, historyValsTable, settingsTable, indexTable, true /* compressVals */)
 	require.NoError(t, err)
 	t.Cleanup(d.Close)
 	return path, db, d
 }
 
+// btree index should work correctly if K < m
 func TestCollationBuild(t *testing.T) {
 	logEvery := time.NewTicker(30 * time.Second)
 	defer logEvery.Stop()
-	_, db, d := testDbAndDomain(t, 0 /* prefixLen */)
+	_, db, d := testDbAndDomain(t)
 	ctx := context.Background()
+	defer d.Close()
 
 	tx, err := db.BeginRw(ctx)
 	require.NoError(t, err)
@@ -92,6 +93,7 @@ func TestCollationBuild(t *testing.T) {
 	require.NoError(t, err)
 
 	c, err := d.collate(ctx, 0, 0, 7, tx, logEvery)
+	require.NoError(t, err)
 	require.True(t, strings.HasSuffix(c.valuesPath, "base.0-1.kv"))
 	require.Equal(t, 2, c.valuesCount)
@@ -104,6 +106,8 @@ func TestCollationBuild(t *testing.T) {
 	sf, err := d.buildFiles(ctx, 0, c)
 	require.NoError(t, err)
 	defer sf.Close()
+	c.Close()
+
 	g := sf.valuesDecomp.MakeGetter()
 	g.Reset(0)
 	var words []string
@@ -114,7 +118,9 @@ func TestCollationBuild(t *testing.T) {
 	require.Equal(t, []string{"key1", "value1.2", "key2", "value2.1"}, words)
 	// Check index
 	require.Equal(t, 2, int(sf.valuesIdx.KeyCount()))
+
 	r := recsplit.NewIndexReader(sf.valuesIdx)
+	defer r.Close()
 	for i := 0; i < len(words); i += 2 {
 		offset := r.Lookup([]byte(words[i]))
 		g.Reset(offset)
@@ -126,7 +132,7 @@ func TestCollationBuild(t *testing.T) {
 }
 
 func TestIterationBasic(t *testing.T) {
-	_, db, d := testDbAndDomain(t, 5 /* prefixLen */)
+	_, db, d := testDbAndDomain(t)
 	ctx := context.Background()
 	tx, err := db.BeginRw(ctx)
 	require.NoError(t, err)
@@ -166,7 +172,7 @@ func TestIterationBasic(t *testing.T) {
 
 func TestAfterPrune(t *testing.T) {
 	logEvery := time.NewTicker(30 * time.Second)
 	defer logEvery.Stop()
-	_, db, d := testDbAndDomain(t, 0 /* prefixLen */)
+	_, db, d := testDbAndDomain(t)
 	ctx := context.Background()
 
 	tx, err := db.BeginRw(ctx)
@@ -240,7 +246,7 @@ func TestAfterPrune(t *testing.T) {
 
 func filledDomain(t *testing.T) (string, kv.RwDB, *Domain, uint64) {
 	t.Helper()
-	path, db, d := testDbAndDomain(t, 0 /* prefixLen */)
+	path, db, d := testDbAndDomain(t)
 	ctx := context.Background()
 	tx, err := db.BeginRw(ctx)
 	require.NoError(t, err)
@@ -347,7 +353,7 @@ func TestHistory(t *testing.T) {
 
 func TestIterationMultistep(t *testing.T) {
 	logEvery := time.NewTicker(30 * time.Second)
 	defer logEvery.Stop()
-	_, db, d := testDbAndDomain(t, 5 /* prefixLen */)
+	_, db, d := testDbAndDomain(t)
 	ctx := context.Background()
 	tx, err := db.BeginRw(ctx)
 	require.NoError(t, err)
@@ -503,20 +509,14 @@ func TestScanFiles(t *testing.T) {
 	txNum := d.txNum
 	d.closeWhatNotInList([]string{})
 	d.OpenFolder()
-	//d.Close()
-	//
-	//var err error
-	//d, err = NewDomain(path, path, d.aggregationStep, d.filenameBase, d.keysTable, d.valsTable, d.indexKeysTable, d.historyValsTable, d.settingsTable, d.indexTable, d.prefixLen, d.compressVals)
-	//require.NoError(t, err)
-	//require.NoError(t, d.OpenFolder())
-	//defer d.Close()
+
 	d.SetTxNum(txNum)
 	// Check the history
 	checkHistory(t, db, d, txs)
 }
 
 func TestDelete(t *testing.T) {
-	_, db, d := testDbAndDomain(t, 0 /* prefixLen */)
+	_, db, d := testDbAndDomain(t)
 	ctx := context.Background()
 	tx, err := db.BeginRw(ctx)
 	require.NoError(t, err)
@@ -558,7 +558,7 @@ func TestDelete(t *testing.T) {
 
 func filledDomainFixedSize(t *testing.T, keysCount, txCount uint64) (string, kv.RwDB, *Domain, map[string][]bool) {
 	t.Helper()
-	path, db, d := testDbAndDomain(t, 0 /* prefixLen */)
+	path, db, d := testDbAndDomain(t)
 	ctx := context.Background()
 	tx, err := db.BeginRw(ctx)
 	require.NoError(t, err)
@@ -657,7 +657,7 @@ func TestDomain_Prune_AfterAllWrites(t *testing.T) {
 
 func TestDomain_PruneOnWrite(t *testing.T) {
 	keysCount, txCount := uint64(16), uint64(64)
-	path, db, d := testDbAndDomain(t, 0 /* prefixLen */)
+	path, db, d := testDbAndDomain(t)
 	ctx := context.Background()
 	defer os.Remove(path)
diff --git a/state/history.go b/state/history.go
index 8ea1b1604..e21d480cd 100644
--- a/state/history.go
+++ b/state/history.go
@@ -30,6 +30,12 @@ import (
 	"time"
 
 	"github.com/RoaringBitmap/roaring/roaring64"
+	"github.com/ledgerwatch/log/v3"
+	btree2 "github.com/tidwall/btree"
+	atomic2 "go.uber.org/atomic"
+	"golang.org/x/exp/slices"
+	"golang.org/x/sync/errgroup"
+
 	"github.com/ledgerwatch/erigon-lib/common"
 	"github.com/ledgerwatch/erigon-lib/common/cmp"
 	"github.com/ledgerwatch/erigon-lib/common/dir"
@@ -40,11 +46,6 @@ import (
 	"github.com/ledgerwatch/erigon-lib/kv/order"
 	"github.com/ledgerwatch/erigon-lib/recsplit"
 	"github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32"
-	"github.com/ledgerwatch/log/v3"
-	btree2 "github.com/tidwall/btree"
-	atomic2 "go.uber.org/atomic"
-	"golang.org/x/exp/slices"
-	"golang.org/x/sync/errgroup"
 )
 
 type History struct {
diff --git a/state/locality_index.go b/state/locality_index.go
index 62be22dc0..892c2b717 100644
--- a/state/locality_index.go
+++ b/state/locality_index.go
@@ -27,11 +27,13 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/ledgerwatch/log/v3"
+
 	"github.com/ledgerwatch/erigon-lib/common/assert"
 	"github.com/ledgerwatch/erigon-lib/common/dir"
 	"github.com/ledgerwatch/erigon-lib/kv/bitmapdb"
 	"github.com/ledgerwatch/erigon-lib/recsplit"
-	"github.com/ledgerwatch/log/v3"
+
 	atomic2 "go.uber.org/atomic"
 )
diff --git a/state/merge.go b/state/merge.go
index 7d8c2b19b..9c448514a 100644
--- a/state/merge.go
+++ b/state/merge.go
@@ -25,12 +25,13 @@ import (
 	"path/filepath"
 	"strings"
 
+	"github.com/ledgerwatch/log/v3"
+
 	"github.com/ledgerwatch/erigon-lib/common"
 	"github.com/ledgerwatch/erigon-lib/common/cmp"
 	"github.com/ledgerwatch/erigon-lib/compress"
 	"github.com/ledgerwatch/erigon-lib/recsplit"
 	"github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32"
-	"github.com/ledgerwatch/log/v3"
 )
 
 func (d *Domain) endTxNumMinimax() uint64 {
@@ -104,6 +105,26 @@ type DomainRanges struct {
 	index bool
 }
 
+func (r DomainRanges) String() string {
+	var b strings.Builder
+	if r.values {
+		b.WriteString(fmt.Sprintf("Values: [%d, %d)", r.valuesStartTxNum, r.valuesEndTxNum))
+	}
+	if r.history {
+		if b.Len() > 0 {
+			b.WriteString(", ")
+		}
+		b.WriteString(fmt.Sprintf("History: [%d, %d)", r.historyStartTxNum, r.historyEndTxNum))
+	}
+	if r.index {
+		if b.Len() > 0 {
+			b.WriteString(", ")
+		}
+		b.WriteString(fmt.Sprintf("Index: [%d, %d)", r.indexStartTxNum, r.indexEndTxNum))
+	}
+	return b.String()
+}
+
 func (r DomainRanges) any() bool {
 	return r.values || r.history || r.index
 }
@@ -142,93 +163,6 @@ func (d *Domain) findMergeRange(maxEndTxNum, maxSpan uint64) DomainRanges {
 	return r
 }
 
-// nolint
-type mergedDomainFiles struct {
-	values *filesItem
-	index *filesItem
-	history *filesItem
-}
-
-// nolint
-func (m *mergedDomainFiles) Close() {
-	for _, item := range []*filesItem{
-		m.values, m.index, m.history,
-	} {
-		if item != nil {
-			if item.decompressor != nil {
-				item.decompressor.Close()
-			}
-			if item.decompressor != nil {
-				item.index.Close()
-			}
-		}
-	}
-}
-
-// nolint
-type staticFilesInRange struct {
-	valuesFiles []*filesItem
-	indexFiles []*filesItem
-	historyFiles []*filesItem
-	startJ int
-}
-
-// nolint
-func (s *staticFilesInRange) Close() {
-	for _, group := range [][]*filesItem{
-		s.valuesFiles, s.indexFiles, s.historyFiles,
-	} {
-		for _, item := range group {
-			if item != nil {
-				if item.decompressor != nil {
-					item.decompressor.Close()
-				}
-				if item.index != nil {
-					item.index.Close()
-				}
-			}
-		}
-	}
-}
-
-/*
-// nolint
-
-	func (d *Domain) mergeRangesUpTo(ctx context.Context, maxTxNum, maxSpan uint64, workers int) (err error) {
-		closeAll := true
-		for rng := d.findMergeRange(maxSpan, maxTxNum); rng.any(); rng = d.findMergeRange(maxTxNum, maxSpan) {
-			var sfr staticFilesInRange
-			sfr.valuesFiles, sfr.indexFiles, sfr.historyFiles, sfr.startJ = d.staticFilesInRange(rng)
-			defer func() {
-				if closeAll {
-					sfr.Close()
-				}
-			}()
-
-			var mf mergedDomainFiles
-			if mf.values, mf.index, mf.history, err = d.mergeFiles(ctx, sfr.valuesFiles, sfr.indexFiles, sfr.historyFiles, rng, workers); err != nil {
-				return err
-			}
-			defer func() {
-				if closeAll {
-					mf.Close()
-				}
-			}()
-
-			//defer func(t time.Time) { log.Info("[snapshots] merge", "took", time.Since(t)) }(time.Now())
-			d.integrateMergedFiles(sfr.valuesFiles, sfr.indexFiles, sfr.historyFiles, mf.values, mf.index, mf.history)
-
-			if err := d.deleteFiles(sfr.valuesFiles, sfr.indexFiles, sfr.historyFiles); err != nil {
-				return err
-			}
-
-			log.Info(fmt.Sprintf("domain files mergedRange[%d, %d) name=%s span=%d \n", rng.valuesStartTxNum, rng.valuesEndTxNum, d.filenameBase, maxSpan))
-		}
-		closeAll = false
-		return nil
-	}
-*/
-
 // 0-1,1-2,2-3,3-4: allow merge 0-1
 // 0-2,2-3,3-4: allow merge 0-4
 // 0-2,2-4: allow merge 0-4
@@ -266,12 +200,10 @@ func (ii *InvertedIndex) findMergeRange(maxEndTxNum, maxSpan uint64) (bool, uint
 	return minFound, startTxNum, endTxNum
 }
 
-/*
-// nolint
-func (ii *InvertedIndex) mergeRangesUpTo(ctx context.Context, maxTxNum, maxSpan uint64, workers int) (err error) {
+func (ii *InvertedIndex) mergeRangesUpTo(ctx context.Context, maxTxNum, maxSpan uint64, workers int, ictx *InvertedIndexContext) (err error) {
 	closeAll := true
 	for updated, startTx, endTx := ii.findMergeRange(maxSpan, maxTxNum); updated; updated, startTx, endTx = ii.findMergeRange(maxTxNum, maxSpan) {
-		staticFiles, startJ := ii.staticFilesInRange(startTx, endTx)
+		staticFiles, _ := ii.staticFilesInRange(startTx, endTx, ictx)
 		defer func() {
 			if closeAll {
 				for _, i := range staticFiles {
@@ -280,7 +212,6 @@ func (ii *InvertedIndex) mergeRangesUpTo(ctx context.Context, maxTxNum, maxSpan
 				}
 			}
 		}()
-		_ = startJ
 
 		mergedIndex, err := ii.mergeFiles(ctx, staticFiles, startTx, endTx, workers)
 		if err != nil {
@@ -293,19 +224,12 @@ func (ii *InvertedIndex) mergeRangesUpTo(ctx context.Context, maxTxNum, maxSpan
 			}
 		}()
 
-		//defer func(t time.Time) { log.Info("[snapshots] merge", "took", time.Since(t)) }(time.Now())
 		ii.integrateMergedFiles(staticFiles, mergedIndex)
-
-		if err := ii.deleteFiles(staticFiles); err != nil {
-			return err
-		}
-
-		log.Info(fmt.Sprintf("domain files mergedRange[%d, %d) name=%s span=%d \n", startTx, endTx, ii.filenameBase, maxSpan))
+		ii.cleanFrozenParts(mergedIndex)
 	}
 	closeAll = false
 	return nil
 }
-*/
 
 type HistoryRanges struct {
 	historyStartTxNum uint64
@@ -552,16 +476,13 @@ func (d *Domain) mergeFiles(ctx context.Context, valuesFiles, indexFiles, histor
 		return
 	}
 	var comp *compress.Compressor
-	//var decomp *compress.Decompressor
-	var closeItem = true
+	closeItem := true
+
 	defer func() {
 		if closeItem {
 			if comp != nil {
 				comp.Close()
 			}
-			//if decomp != nil {
-			//	decomp.Close()
-			//}
 			if indexIn != nil {
 				if indexIn.decompressor != nil {
 					indexIn.decompressor.Close()
@@ -569,6 +490,9 @@ func (d *Domain) mergeFiles(ctx context.Context, valuesFiles, indexFiles, histor
 				if indexIn.index != nil {
 					indexIn.index.Close()
 				}
+				if indexIn.bindex != nil {
+					indexIn.bindex.Close()
+				}
 			}
 			if historyIn != nil {
 				if historyIn.decompressor != nil {
@@ -577,6 +501,9 @@ func (d *Domain) mergeFiles(ctx context.Context, valuesFiles, indexFiles, histor
 				if historyIn.index != nil {
 					historyIn.index.Close()
 				}
+				if historyIn.bindex != nil {
+					historyIn.bindex.Close()
+				}
 			}
 			if valuesIn != nil {
 				if valuesIn.decompressor != nil {
@@ -585,6 +512,9 @@ func (d *Domain) mergeFiles(ctx context.Context, valuesFiles, indexFiles, histor
 				if valuesIn.index != nil {
 					valuesIn.index.Close()
 				}
+				if valuesIn.bindex != nil {
+					valuesIn.bindex.Close()
+				}
 			}
 		}
 	}()
@@ -656,24 +586,21 @@ func (d *Domain) mergeFiles(ctx context.Context, valuesFiles, indexFiles, histor
 				heap.Pop(&cp)
 			}
 		}
-		var skip bool
-		if d.prefixLen > 0 {
-			skip = r.valuesStartTxNum == 0 && len(lastVal) == 0 && len(lastKey) != d.prefixLen
-		} else {
-			// For the rest of types, empty value means deletion
-			skip = r.valuesStartTxNum == 0 && len(lastVal) == 0
-		}
-		if !skip {
-			if keyBuf != nil && (d.prefixLen == 0 || len(keyBuf) != d.prefixLen || bytes.HasPrefix(lastKey, keyBuf)) {
+
+		// empty value means deletion
+		deleted := r.valuesStartTxNum == 0 && len(lastVal) == 0
+		if !deleted {
+			if keyBuf != nil {
 				if err = comp.AddUncompressedWord(keyBuf); err != nil {
 					return nil, nil, nil, err
 				}
 				keyCount++ // Only counting keys, not values
-				if d.compressVals {
+				switch d.compressVals {
+				case true:
 					if err = comp.AddWord(valBuf); err != nil {
 						return nil, nil, nil, err
 					}
-				} else {
+				default:
 					if err = comp.AddUncompressedWord(valBuf); err != nil {
 						return nil, nil, nil, err
 					}
@@ -713,6 +640,18 @@ func (d *Domain) mergeFiles(ctx context.Context, valuesFiles, indexFiles, histor
 		if valuesIn.index, err = buildIndexThenOpen(ctx, valuesIn.decompressor, idxPath, d.tmpdir, keyCount, false /* values */); err != nil {
 			return nil, nil, nil, fmt.Errorf("merge %s buildIndex [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err)
 		}
+
+		btPath := strings.TrimSuffix(idxPath, "kvi") + "bt"
+		err = BuildBtreeIndexWithDecompressor(btPath, valuesIn.decompressor)
+		if err != nil {
+			return nil, nil, nil, fmt.Errorf("merge %s btindex [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err)
+		}
+
+		bt, err := OpenBtreeIndexWithDecompressor(btPath, 2048, valuesIn.decompressor)
+		if err != nil {
+			return nil, nil, nil, fmt.Errorf("merge %s btindex2 [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err)
+		}
+		valuesIn.bindex = bt
 	}
 	closeItem = false
 	d.stats.MergesCount++
@@ -1057,7 +996,7 @@ func (d *Domain) integrateMergedFiles(valuesOuts, indexOuts, historyOuts []*file
 	// `kill -9` may leave some garbage
 	// but it still may be useful for merges, until we finish merge frozen file
-	if historyIn.frozen {
+	if historyIn != nil && historyIn.frozen {
 		d.files.Walk(func(items []*filesItem) bool {
 			for _, item := range items {
 				if item.frozen || item.endTxNum > valuesIn.endTxNum {