Skip to content

Commit

Permalink
Add an additional limit on the effective dictionary size (EffectiveDictLimit)
Browse files Browse the repository at this point in the history
  • Loading branch information
stevemilk committed Sep 25, 2024
1 parent b6e0822 commit c02a1ab
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 7 deletions.
5 changes: 4 additions & 1 deletion erigon-lib/seg/compress.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ type Cfg struct {
SamplingFactor uint64

Workers int

EffectiveDictLimit int
}

var DefaultCfg = Cfg{
Expand All @@ -82,7 +84,8 @@ var DefaultCfg = Cfg{
SamplingFactor: 4,
MaxDictPatterns: 64 * 1024,

DictReducerSoftLimit: 1_000_000,
DictReducerSoftLimit: 2_000_000,
EffectiveDictLimit: 1_000_000,

Workers: 1,
}
Expand Down
18 changes: 13 additions & 5 deletions erigon-lib/seg/parallel_compress.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ import (
"github.com/erigontech/erigon-lib/seg/sais"
)

func coverWordByPatterns(trace bool, input []byte, mf2 *patricia.MatchFinder2, output []byte, uncovered []int, patterns []int, cellRing *Ring, posMap map[uint64]uint64) ([]byte, []int, []int) {
func coverWordByPatterns(trace bool, input []byte, mf2 *patricia.MatchFinder2, output []byte, uncovered []int, patterns []int, cellRing *Ring, posMap map[uint64]uint64, usedPatterns map[uint64]bool, effectiveDictLimit int) ([]byte, []int, []int) {
matches := mf2.FindLongestMatches(input)

if len(matches) == 0 {
Expand Down Expand Up @@ -68,6 +68,11 @@ func coverWordByPatterns(trace bool, input []byte, mf2 *patricia.MatchFinder2, o
for i := len(matches); i > 0; i-- {
f := matches[i-1]
p := f.Val.(*Pattern)

if !usedPatterns[p.code] && len(usedPatterns) >= effectiveDictLimit {
continue
}

firstCell := cellRing.Get(0)
maxCompression := firstCell.compression
maxScore := firstCell.score
Expand Down Expand Up @@ -125,6 +130,7 @@ func coverWordByPatterns(trace bool, input []byte, mf2 *patricia.MatchFinder2, o
d.coverStart = maxCell.coverStart
d.patternIdx = maxCell.patternIdx
}
usedPatterns[p.code] = true
}
optimCell := cellRing.Get(0)
if trace {
Expand Down Expand Up @@ -178,7 +184,7 @@ func coverWordByPatterns(trace bool, input []byte, mf2 *patricia.MatchFinder2, o
return output, patterns, uncovered
}

func coverWordsByPatternsWorker(trace bool, inputCh chan *CompressionWord, outCh chan *CompressionWord, completion *sync.WaitGroup, trie *patricia.PatriciaTree, inputSize, outputSize *atomic.Uint64, posMap map[uint64]uint64) {
func coverWordsByPatternsWorker(trace bool, inputCh chan *CompressionWord, outCh chan *CompressionWord, completion *sync.WaitGroup, trie *patricia.PatriciaTree, inputSize, outputSize *atomic.Uint64, posMap map[uint64]uint64, usedPatterns map[uint64]bool, effectiveDictLimit int) {
defer completion.Done()
var output = make([]byte, 0, 256)
var uncovered = make([]int, 256)
Expand All @@ -190,7 +196,7 @@ func coverWordsByPatternsWorker(trace bool, inputCh chan *CompressionWord, outCh
wordLen := uint64(len(compW.word))
n := binary.PutUvarint(numBuf[:], wordLen)
output = append(output[:0], numBuf[:n]...) // Prepend with the encoding of length
output, patterns, uncovered = coverWordByPatterns(trace, compW.word, mf2, output, uncovered, patterns, cellRing, posMap)
output, patterns, uncovered = coverWordByPatterns(trace, compW.word, mf2, output, uncovered, patterns, cellRing, posMap, usedPatterns, effectiveDictLimit)
compW.word = append(compW.word[:0], output...)
outCh <- compW
inputSize.Add(1 + wordLen)
Expand Down Expand Up @@ -271,6 +277,8 @@ func compressWithPatternCandidates(ctx context.Context, trace bool, cfg Cfg, log
heap.Init(&compressionQueue)
queueLimit := 128 * 1024

var usedPatterns = make(map[uint64]bool)

// For the case of workers == 1
var output = make([]byte, 0, 256)
var uncovered = make([]int, 256)
Expand All @@ -287,7 +295,7 @@ func compressWithPatternCandidates(ctx context.Context, trace bool, cfg Cfg, log
posMap := make(map[uint64]uint64)
posMaps = append(posMaps, posMap)
wg.Add(1)
go coverWordsByPatternsWorker(trace, ch, out, &wg, &pt, inputSize, outputSize, posMap)
go coverWordsByPatternsWorker(trace, ch, out, &wg, &pt, inputSize, outputSize, posMap, usedPatterns, cfg.EffectiveDictLimit)
}
}
t := time.Now()
Expand Down Expand Up @@ -373,7 +381,7 @@ func compressWithPatternCandidates(ctx context.Context, trace bool, cfg Cfg, log
}
if wordLen > 0 {
if compression {
output, patterns, uncovered = coverWordByPatterns(trace, v, mf2, output[:0], uncovered, patterns, cellRing, uncompPosMap)
output, patterns, uncovered = coverWordByPatterns(trace, v, mf2, output[:0], uncovered, patterns, cellRing, uncompPosMap, usedPatterns, cfg.EffectiveDictLimit)
if _, e := intermediateW.Write(output); e != nil {
return e
}
Expand Down
3 changes: 2 additions & 1 deletion erigon-lib/state/domain.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ type domainVisible struct {

var DomainCompressCfg = seg.Cfg{
MinPatternScore: 1000,
DictReducerSoftLimit: 2000000,
DictReducerSoftLimit: 4_000_000,
EffectiveDictLimit: 2_000_000,
MinPatternLen: 20,
MaxPatternLen: 128,
SamplingFactor: 4,
Expand Down

0 comments on commit c02a1ab

Please sign in to comment.