lizt is a flexible list/file manager. you can create stream iterators or slice iterators as the base. there is a persistent storage wrapper and a seeder wrapper.
Based on the benchmarks, it seems that the SliceIterator is faster in every case. Which is pretty much true, once the slice is in memory.
The major differences between the two iterators are:
- Slice requires holding the entire list in memory and is therefore limited by the amount of memory available. Stream does not have this limitation.
- Slice is faster when the list is small. Stream is faster when the list is large.
- Slices will break eventually. For really large lists (10M+), streams are always the better choice.
- Streams should not be used at all if the list is small. The overhead of re-creating a stream, when round-robin is being used, is not worth it. Instead,
lizt.ReadFromFile(path)
into a SliceIterator.
You can use the MustBuild
and MustBuildWithSeeds
functions to create iterators. These functions will panic if there is an error.
stream, _ := lizt.NewBuilder().Stream("test/50000000.txt").Build() // round-robin = false
fmt.Println(stream.Next(5)) // "a", "b", "c", "e", "f"
stream, _ := lizt.NewBuilder().StreamRR("test/50000000.txt").Build() // round-robin = true
fmt.Println(stream.Next(5)) // "a", "b", "c", "e", "f"
// creates a random string for it's name for ease of use
slice, _ := lizt.B().Slice([]string{"a", "b", "c", "d", "e"}).Build() // round-robin = false
fmt.Println(slice.Next(3)) // "a", "b", "c"
slice, _ := lizt.B().SliceRR([]string{"a", "b", "c", "d", "e"}).Build() // round-robin = true
fmt.Println(slice.Next(3)) // "a", "b", "c"
slice, _ := lizt.B().SliceNamed("name", []string{"a", "b", "c", "d", "e"}).Build() // round-robin = false
fmt.Println(slice.Next(3)) // "a", "b", "c"
slice, _ := lizt.B().SliceNamedRR("name", []string{"a", "b", "c", "d", "e"}).Build() // round-robin = true
fmt.Println(slice.Next(3)) // "a", "b", "c"
plantEvery := 2
seedStream, _ = lizt.B().
Stream("test/10000000.txt").
BuildWithSeeds(plantEvery, []string{"seed1", "seed2"}) // round-robin = false
seedStream, _ = lizt.B().
StreamRR("test/50000000.txt").
BuildWithSeeds(plantEvery, []string{"seed1", "seed2"}) // round-robin = true
fmt.Println(slice.Next(4)) // "seed1", "a", "seed2", "b"
plantEvery := 2
seedStream, _ = lizt.B().
Stream("test/10000000.txt").
BuildWithSeeds(plantEvery, "data/seeds.txt") // round-robin = false
seedStream, _ = lizt.B().
StreamRR("test/50000000.txt").
BuildWithSeeds(plantEvery, "data/seeds.txt") // round-robin = true
fmt.Println(slice.Next(4))// "seed1", "a", "seed2", "b"
mem := NewInMemoryPersister()
stream, _ := lizt.B().StreamRR("test/50000000.txt").PersistTo(mem).Build() // round-robin = false
fmt.Println(stream.Next(5)) // "a", "b", "c", "e", "f"
// Persister Value => mem["50000000"] = 5
mem := NewInMemoryPersister()
stream, _ := lizt.B().Slice([]{"test","this","here"}).PersistTo(mem).Build() // round-robin = false
fmt.Println(stream.Next(3)) // "test", "this", "here"
// Persister Value => mem["10"] = 3
mem := NewInMemoryPersister()
stream, _ := lizt.B().
StreamRR("test/50000000.txt").
PersistTo(mem).
BuildWithSeeds(2, []string{"seed1", "seed2"}) // round-robin = false
fmt.Println(stream.Next(5)) // "seed1", "a", "seed2", "b", "seed1"
// Persister Value => mem["10"] = 2
// creates a random string for it's name for ease of use
bl := map[string]struct{}{"a": {}, "c": {}}
blm := lizt.NewBlacklistManager(bl)
slice, _ := lizt.B().Slice([]string{"a", "b", "c", "d", "e"}).Blacklist(blm).Build() // round-robin = false
fmt.Println(slice.Next(3)) // "b", "d", "e"
// this is good to do so you can reuse the memory
bl := lizt.FileToMap("test/blacklist.txt")
err = lizt.ScrubFileWithBlacklist(bl, "test/10.txt", "test/10.txt.scrubbed")
if err != nil {
t.Errorf("Expected no error, got %v", err)
}
When using the manager, you must use SliceNamed
and SliceNamedRR
. The manager requires a name to properly use it's Get
and MustGet
functions.
When you're using streams, the Get
look up key will always be the filename of the path. I.e. test/50000000.txt
will be found at mgr.Get("50000000")
package main
import "git.faze.center/netr/lizt"
import "git.faze.center/netr/lizt/persist"
type IterKey string
const (
IterKeyFiftyMillion IterKey = "50000000"
IterKeyNumbers IterKey = "numbers"
)
func main() {
ip, err := persist.NewIniPersister("data/persist.ini")
if err != nil {
panic(err)
}
// B() => NewBuilder(), StreamRR() => Stream with Round Robin, PersistTo() => Persist to Persister, BuildWithSeeds() => Build the Iterator with Seeding
fiftyStream := lizt.B().StreamRR("test/50000000.txt").PersistTo(ip).MustBuildWithSeeds(2, []string{"seeder1", "seeder2"}) // round-robin = false
// B() => NewBuilder(), SliceNamedRR() => Named slice with Round Robin, PersistTo() => Persist to Persister, Build() => Build the Iterator
numbers := []string{"1", "2", "3", "4", "5"}
numbersSlice := lizt.B().SliceNamedRR(string(IterKeyNumbers), numbers).PersistTo(ip).MustBuild() // round-robin = false
// initialize and add iterators to the manager
mgr := lizt.NewManager().AddIters(fiftyStream, numbersSlice)
_, err = mgr.MustGet(string(IterKeyFiftyMillion)).Next(4)
if err != nil {
panic(err)
}
// results in ["seeder1", "1", "seeder2", "2"]
_, err = mgr.MustGet(string(IterKeyNumbers)).Next(4)
if err != nil {
panic(err)
}
// results in ["1", "2", "3", "4"]
}
All files in the directory will be added to the manager. The key will be the filename of the file. I.e. test/50000000.txt
will be found at mgr.Get("50000000")
These will always be created as SliceIterators.
package main
import "git.faze.center/netr/lizt"
func main() {
mgr := lizt.NewManager()
// (path, round-robin)
err := mgr.AddDirIter("data/", true)
if err != nil {
panic(err)
}
_, err = mgr.Get("filename")
if err != nil {
panic(err)
}
}
All files in the directory will be added to the manager.
These will be SliceIterators
if the file is less than 250,000 lines.
Otherwise, they will be StreamIterators.
This decision is based around the benchmarks. There's a considerable difference in speed when it comes to smaller lists. The streams perform consistently, regardless of the volume of items they have, after a certain amount of lines.
TODO: Make MaxLinesForSliceIter
configurable.
package main
import "git.faze.center/netr/lizt"
func main() {
mgr := lizt.NewManager()
// (path, round-robin)
err := mgr.SmartAddDirIter("data/", true)
if err != nil {
panic(err)
}
_, err = mgr.Get("filename")
if err != nil {
panic(err)
}
}
Install https://pre-commit.com/#install
run pre-commit install
make test
make bench