From d17efaa1146bf0ec9a52b004e6d79f971253b1a2 Mon Sep 17 00:00:00 2001 From: Lunny Xiao Date: Mon, 7 Sep 2020 06:51:14 +0800 Subject: [PATCH] Upgrade bleve to v1.0.10 (#12737) * Fix bug on migration 111 * Upgrade bleve to 1.0.10 Co-authored-by: zeripath Co-authored-by: techknowlogick --- go.mod | 3 +- go.sum | 26 +- vendor/github.com/blevesearch/bleve/README.md | 7 +- .../github.com/blevesearch/bleve/builder.go | 94 ++ vendor/github.com/blevesearch/bleve/go.mod | 13 +- vendor/github.com/blevesearch/bleve/index.go | 14 + .../blevesearch/bleve/index/index.go | 7 + .../blevesearch/bleve/index/scorch/builder.go | 334 +++++++ .../blevesearch/bleve/index/scorch/event.go | 8 + .../bleve/index/scorch/introducer.go | 18 +- .../blevesearch/bleve/index/scorch/merge.go | 168 +++- .../index/scorch/mergeplan/merge_plan.go | 13 +- .../bleve/index/scorch/optimize.go | 60 +- .../bleve/index/scorch/persister.go | 85 +- .../{snapshot_rollback.go => rollback.go} | 0 .../blevesearch/bleve/index/scorch/scorch.go | 62 +- .../bleve/index/scorch/segment/unadorned.go | 22 +- .../bleve/index/scorch/segment_plugin.go | 26 +- .../bleve/index/scorch/snapshot_index.go | 21 +- .../bleve/index/scorch/snapshot_index_tfr.go | 3 + .../blevesearch/bleve/index/scorch/stats.go | 13 +- .../blevesearch/bleve/index_alias_impl.go | 16 +- .../blevesearch/bleve/index_impl.go | 3 +- .../blevesearch/bleve/mapping/document.go | 3 +- .../blevesearch/bleve/mapping/index.go | 16 +- vendor/github.com/blevesearch/bleve/search.go | 23 + .../search/searcher/search_disjunction.go | 14 +- .../searcher/search_disjunction_heap.go | 4 +- .../searcher/search_disjunction_slice.go | 4 +- .../bleve/search/searcher/search_fuzzy.go | 4 +- .../search/searcher/search_geoboundingbox.go | 184 ++-- .../search/searcher/search_multi_term.go | 180 +++- .../search/searcher/search_numeric_range.go | 7 +- .../bleve/search/searcher/search_regexp.go | 2 +- .../bleve/search/searcher/search_term.go | 2 +- .../search/searcher/search_term_prefix.go | 2 +- .../blevesearch/bleve/search/sort.go | 9 +- vendor/github.com/blevesearch/zap/v11/go.mod | 6 +- vendor/github.com/blevesearch/zap/v12/go.mod | 6 +- .../github.com/blevesearch/zap/v13/.gitignore | 12 + vendor/github.com/blevesearch/zap/v13/LICENSE | 202 ++++ .../github.com/blevesearch/zap/v13/README.md | 158 ++++ .../github.com/blevesearch/zap/v13/build.go | 156 ++++ .../github.com/blevesearch/zap/v13/chunk.go | 54 ++ .../blevesearch/zap/v13/contentcoder.go | 243 +++++ .../github.com/blevesearch/zap/v13/count.go | 61 ++ vendor/github.com/blevesearch/zap/v13/dict.go | 263 ++++++ .../blevesearch/zap/v13/docvalues.go | 312 +++++++ .../blevesearch/zap/v13/enumerator.go | 138 +++ vendor/github.com/blevesearch/zap/v13/go.mod | 12 + .../blevesearch/zap/v13/intDecoder.go | 111 +++ .../blevesearch/zap/v13/intcoder.go | 206 +++++ .../github.com/blevesearch/zap/v13/merge.go | 847 +++++++++++++++++ vendor/github.com/blevesearch/zap/v13/new.go | 860 ++++++++++++++++++ .../github.com/blevesearch/zap/v13/plugin.go | 37 + .../github.com/blevesearch/zap/v13/posting.go | 798 ++++++++++++++++ vendor/github.com/blevesearch/zap/v13/read.go | 43 + .../github.com/blevesearch/zap/v13/segment.go | 572 ++++++++++++ .../github.com/blevesearch/zap/v13/write.go | 145 +++ vendor/github.com/blevesearch/zap/v13/zap.md | 177 ++++ .../github.com/blevesearch/zap/v14/.gitignore | 12 + vendor/github.com/blevesearch/zap/v14/LICENSE | 202 ++++ .../github.com/blevesearch/zap/v14/README.md | 158 ++++ .../github.com/blevesearch/zap/v14/build.go | 156 ++++ .../github.com/blevesearch/zap/v14/chunk.go | 67 ++ .../blevesearch/zap/v14/contentcoder.go | 243 +++++ .../github.com/blevesearch/zap/v14/count.go | 61 ++ vendor/github.com/blevesearch/zap/v14/dict.go | 263 ++++++ .../blevesearch/zap/v14/docvalues.go | 312 +++++++ .../blevesearch/zap/v14/enumerator.go | 138 +++ vendor/github.com/blevesearch/zap/v14/go.mod | 12 + .../blevesearch/zap/v14/intDecoder.go | 118 +++ .../blevesearch/zap/v14/intcoder.go | 206 +++++ .../github.com/blevesearch/zap/v14/merge.go | 847 +++++++++++++++++ vendor/github.com/blevesearch/zap/v14/new.go | 860 ++++++++++++++++++ .../github.com/blevesearch/zap/v14/plugin.go | 37 + .../github.com/blevesearch/zap/v14/posting.go | 796 ++++++++++++++++ vendor/github.com/blevesearch/zap/v14/read.go | 43 + .../github.com/blevesearch/zap/v14/segment.go | 572 ++++++++++++ .../github.com/blevesearch/zap/v14/write.go | 145 +++ vendor/github.com/blevesearch/zap/v14/zap.md | 177 ++++ vendor/github.com/couchbase/vellum/README.md | 2 +- .../couchbase/vellum/fst_iterator.go | 2 +- vendor/go.etcd.io/bbolt/README.md | 11 +- vendor/go.etcd.io/bbolt/freelist.go | 57 +- vendor/go.etcd.io/bbolt/node.go | 25 +- vendor/go.etcd.io/bbolt/page.go | 57 +- vendor/go.etcd.io/bbolt/tx.go | 27 +- vendor/go.etcd.io/bbolt/unsafe.go | 39 + vendor/modules.txt | 15 +- 90 files changed, 12116 insertions(+), 433 deletions(-) create mode 100644 vendor/github.com/blevesearch/bleve/builder.go create mode 100644 vendor/github.com/blevesearch/bleve/index/scorch/builder.go rename vendor/github.com/blevesearch/bleve/index/scorch/{snapshot_rollback.go => rollback.go} (100%) create mode 100644 vendor/github.com/blevesearch/zap/v13/.gitignore create mode 100644 vendor/github.com/blevesearch/zap/v13/LICENSE create mode 100644 vendor/github.com/blevesearch/zap/v13/README.md create mode 100644 vendor/github.com/blevesearch/zap/v13/build.go create mode 100644 vendor/github.com/blevesearch/zap/v13/chunk.go create mode 100644 vendor/github.com/blevesearch/zap/v13/contentcoder.go create mode 100644 vendor/github.com/blevesearch/zap/v13/count.go create mode 100644 vendor/github.com/blevesearch/zap/v13/dict.go create mode 100644 vendor/github.com/blevesearch/zap/v13/docvalues.go create mode 100644 vendor/github.com/blevesearch/zap/v13/enumerator.go create mode 100644 vendor/github.com/blevesearch/zap/v13/go.mod create mode 100644 vendor/github.com/blevesearch/zap/v13/intDecoder.go create mode 100644 vendor/github.com/blevesearch/zap/v13/intcoder.go create mode 100644 vendor/github.com/blevesearch/zap/v13/merge.go create mode 100644 vendor/github.com/blevesearch/zap/v13/new.go create mode 100644 vendor/github.com/blevesearch/zap/v13/plugin.go create mode 100644 vendor/github.com/blevesearch/zap/v13/posting.go create mode 100644 vendor/github.com/blevesearch/zap/v13/read.go create mode 100644 vendor/github.com/blevesearch/zap/v13/segment.go create mode 100644 vendor/github.com/blevesearch/zap/v13/write.go create mode 100644 vendor/github.com/blevesearch/zap/v13/zap.md create mode 100644 vendor/github.com/blevesearch/zap/v14/.gitignore create mode 100644 vendor/github.com/blevesearch/zap/v14/LICENSE create mode 100644 vendor/github.com/blevesearch/zap/v14/README.md create mode 100644 vendor/github.com/blevesearch/zap/v14/build.go create mode 100644 vendor/github.com/blevesearch/zap/v14/chunk.go create mode 100644 vendor/github.com/blevesearch/zap/v14/contentcoder.go create mode 100644 vendor/github.com/blevesearch/zap/v14/count.go create mode 100644 vendor/github.com/blevesearch/zap/v14/dict.go create mode 100644 vendor/github.com/blevesearch/zap/v14/docvalues.go create mode 100644 vendor/github.com/blevesearch/zap/v14/enumerator.go create mode 100644 vendor/github.com/blevesearch/zap/v14/go.mod create mode 100644 vendor/github.com/blevesearch/zap/v14/intDecoder.go create mode 100644 vendor/github.com/blevesearch/zap/v14/intcoder.go create mode 100644 vendor/github.com/blevesearch/zap/v14/merge.go create mode 100644 vendor/github.com/blevesearch/zap/v14/new.go create mode 100644 vendor/github.com/blevesearch/zap/v14/plugin.go create mode 100644 vendor/github.com/blevesearch/zap/v14/posting.go create mode 100644 vendor/github.com/blevesearch/zap/v14/read.go create mode 100644 vendor/github.com/blevesearch/zap/v14/segment.go create mode 100644 vendor/github.com/blevesearch/zap/v14/write.go create mode 100644 vendor/github.com/blevesearch/zap/v14/zap.md create mode 100644 vendor/go.etcd.io/bbolt/unsafe.go diff --git a/go.mod b/go.mod index ad1105eb3..2a63fb5d0 100644 --- a/go.mod +++ b/go.mod @@ -18,9 +18,8 @@ require ( gitea.com/macaron/toolbox v0.0.0-20190822013122-05ff0fc766b7 github.com/BurntSushi/toml v0.3.1 github.com/PuerkitoBio/goquery v1.5.1 - github.com/RoaringBitmap/roaring v0.4.23 // indirect github.com/alecthomas/chroma v0.8.0 - github.com/blevesearch/bleve v1.0.7 + github.com/blevesearch/bleve v1.0.10 github.com/couchbase/gomemcached v0.0.0-20191004160342-7b5da2ec40b2 // indirect github.com/cznic/b v0.0.0-20181122101859-a26611c4d92d // indirect github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548 // indirect diff --git a/go.sum b/go.sum index 1699adeef..04c55ce7d 100644 --- a/go.sum +++ b/go.sum @@ -63,8 +63,6 @@ github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tN github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= -github.com/RoaringBitmap/roaring v0.4.21 h1:WJ/zIlNX4wQZ9x8Ey33O1UaD9TCTakYsdLFSBcTwH+8= -github.com/RoaringBitmap/roaring v0.4.21/go.mod h1:D0gp8kJQgE1A4LQ5wFLggQEyvDi06Mq5mKs52e1TwOo= github.com/RoaringBitmap/roaring v0.4.23 h1:gpyfd12QohbqhFO4NVDUdoPOCXsyahYRQhINmlHxKeo= github.com/RoaringBitmap/roaring v0.4.23/go.mod h1:D0gp8kJQgE1A4LQ5wFLggQEyvDi06Mq5mKs52e1TwOo= github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo= @@ -117,8 +115,8 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQkY= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84= -github.com/blevesearch/bleve v1.0.7 h1:4PspZE7XABMSKcVpzAKp0E05Yer1PIYmTWk+1ngNr/c= -github.com/blevesearch/bleve v1.0.7/go.mod h1:3xvmBtaw12Y4C9iA1RTzwWCof5j5HjydjCTiDE2TeE0= +github.com/blevesearch/bleve v1.0.10 h1:DxFXeC+faL+5LVTlljUDpP9eXj3mleiQem3DuSjepqQ= +github.com/blevesearch/bleve v1.0.10/go.mod h1:KHAOH5HuVGn9fo+dN5TkqcA1HcuOQ89goLWVWXZDl8w= github.com/blevesearch/blevex v0.0.0-20190916190636-152f0fe5c040 h1:SjYVcfJVZoCfBlg+fkaq2eoZHTf5HaJfaTeTkOtyfHQ= github.com/blevesearch/blevex v0.0.0-20190916190636-152f0fe5c040/go.mod h1:WH+MU2F4T0VmSdaPX+Wu5GYoZBrYWdOZWSjzvYcDmqQ= github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= @@ -129,10 +127,14 @@ github.com/blevesearch/segment v0.9.0 h1:5lG7yBCx98or7gK2cHMKPukPZ/31Kag7nONpoBt github.com/blevesearch/segment v0.9.0/go.mod h1:9PfHYUdQCgHktBgvtUOF4x+pc4/l8rdH0u5spnW85UQ= github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s= github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs= -github.com/blevesearch/zap/v11 v11.0.7 h1:nnmAOP6eXBkqEa1Srq1eqA5Wmn4w+BZjLdjynNxvd+M= -github.com/blevesearch/zap/v11 v11.0.7/go.mod h1:bJoY56fdU2m/IP4LLz/1h4jY2thBoREvoqbuJ8zhm9k= -github.com/blevesearch/zap/v12 v12.0.7 h1:y8FWSAYkdc4p1dn4YLxNNr1dxXlSUsakJh2Fc/r6cj4= -github.com/blevesearch/zap/v12 v12.0.7/go.mod h1:70DNK4ZN4tb42LubeDbfpp6xnm8g3ROYVvvZ6pEoXD8= +github.com/blevesearch/zap/v11 v11.0.10 h1:zJdl+cnxT0Yt2hA6meG+OIat3oSA4rERfrNX2CSchII= +github.com/blevesearch/zap/v11 v11.0.10/go.mod h1:BdqdgKy6u0Jgw/CqrMfP2Gue/EldcfvB/3eFzrzhIfw= +github.com/blevesearch/zap/v12 v12.0.10 h1:T1/GXNBxC9eetfuMwCM5RLWXeharSMyAdNEdXVtBuHA= +github.com/blevesearch/zap/v12 v12.0.10/go.mod h1:QtKkjpmV/sVFEnKSaIWPXZJAaekL97TrTV3ImhNx+nw= +github.com/blevesearch/zap/v13 v13.0.2 h1:quhI5OVFX33dhPpUW+nLyXGpu7QT8qTgzu6qA/fRRXM= +github.com/blevesearch/zap/v13 v13.0.2/go.mod h1:/9QLKla8/8mloJvQQutPhB+tw6y35urvKeAFeun2JGA= +github.com/blevesearch/zap/v14 v14.0.1 h1:s8KeqX53Vc4eRaziHsnY2bYUE+8IktWqRL9W5H5VDMY= +github.com/blevesearch/zap/v14 v14.0.1/go.mod h1:Y+tUL9TypMca5+96m7iJb2lpcntETXSeDoI5BBX2tvY= github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc h1:biVzkmvwrH8WK8raXaxBx6fRVTlJILwEwQGL1I/ByEI= github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/bradfitz/gomemcache v0.0.0-20190329173943-551aad21a668 h1:U/lr3Dgy4WK+hNk4tyD+nuGjpVLPEHuJSFXMw11/HPA= @@ -164,8 +166,8 @@ github.com/couchbase/goutils v0.0.0-20190315194238-f9d42b11473b/go.mod h1:BQwMFl github.com/couchbase/goutils v0.0.0-20191018232750-b49639060d85 h1:0WMIDtuXCKEm4wtAJgAAXa/qtM5O9MariLwgHaRlYmk= github.com/couchbase/goutils v0.0.0-20191018232750-b49639060d85/go.mod h1:BQwMFlJzDjFDG3DJUdU0KORxn88UlsOULuxLExMh3Hs= github.com/couchbase/moss v0.1.0/go.mod h1:9MaHIaRuy9pvLPUJxB8sh8OrLfyDczECVL37grCIubs= -github.com/couchbase/vellum v1.0.1 h1:qrj9ohvZedvc51S5KzPfJ6P6z0Vqzv7Lx7k3mVc2WOk= -github.com/couchbase/vellum v1.0.1/go.mod h1:FcwrEivFpNi24R3jLOs3n+fs5RnuQnQqCLBJ1uAg1W4= +github.com/couchbase/vellum v1.0.2 h1:BrbP0NKiyDdndMPec8Jjhy0U47CZ0Lgx3xUC2r9rZqw= +github.com/couchbase/vellum v1.0.2/go.mod h1:FcwrEivFpNi24R3jLOs3n+fs5RnuQnQqCLBJ1uAg1W4= github.com/couchbaselabs/go-couchbase v0.0.0-20190708161019-23e7ca2ce2b7 h1:1XjEY/gnjQ+AfXef2U6dxCquhiRzkEpxZuWqs+QxTL8= github.com/couchbaselabs/go-couchbase v0.0.0-20190708161019-23e7ca2ce2b7/go.mod h1:mby/05p8HE5yHEAKiIH/555NoblMs7PtW6NrYshDruc= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= @@ -914,8 +916,8 @@ github.com/zenazn/goji v0.9.0/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxt github.com/ziutek/mymysql v1.5.4 h1:GB0qdRGsTwQSBVYuVShFBKaXSnSnYYC2d9knnE1LHFs= github.com/ziutek/mymysql v1.5.4/go.mod h1:LMSpPZ6DbqWFxNCHW77HeMg9I646SAhApZ/wKdgO/C0= go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= -go.etcd.io/bbolt v1.3.4 h1:hi1bXHMVrlQh6WwxAy+qZCV/SYIlqo+Ushwdpa4tAKg= -go.etcd.io/bbolt v1.3.4/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= +go.etcd.io/bbolt v1.3.5 h1:XAzx9gjCb0Rxj7EoqcClPD1d5ZBxZJk0jbuoPHenBt0= +go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= go.mongodb.org/mongo-driver v1.0.3/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= go.mongodb.org/mongo-driver v1.1.1 h1:Sq1fR+0c58RME5EoqKdjkiQAmPjmfHlZOoRI6fTUOcs= go.mongodb.org/mongo-driver v1.1.1/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= diff --git a/vendor/github.com/blevesearch/bleve/README.md b/vendor/github.com/blevesearch/bleve/README.md index 7c1a7c7c4..eff0be97e 100644 --- a/vendor/github.com/blevesearch/bleve/README.md +++ b/vendor/github.com/blevesearch/bleve/README.md @@ -1,10 +1,13 @@ # ![bleve](docs/bleve.png) bleve -[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/github/blevesearch/bleve/badge.svg?branch=master)](https://coveralls.io/github/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) +[![Tests](https://github.com/blevesearch/bleve/workflows/Tests/badge.svg?branch=master&event=push)](https://github.com/blevesearch/bleve/actions?query=workflow%3ATests+event%3Apush+branch%3Amaster) +[![Coverage Status](https://coveralls.io/repos/github/blevesearch/bleve/badge.svg?branch=master)](https://coveralls.io/github/blevesearch/bleve?branch=master) +[![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) [![Join the chat at https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![codebeat](https://codebeat.co/badges/38a7cbc9-9cf5-41c0-a315-0746178230f4)](https://codebeat.co/projects/github-com-blevesearch-bleve) [![Go Report Card](https://goreportcard.com/badge/blevesearch/bleve)](https://goreportcard.com/report/blevesearch/bleve) -[![Sourcegraph](https://sourcegraph.com/github.com/blevesearch/bleve/-/badge.svg)](https://sourcegraph.com/github.com/blevesearch/bleve?badge) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![Sourcegraph](https://sourcegraph.com/github.com/blevesearch/bleve/-/badge.svg)](https://sourcegraph.com/github.com/blevesearch/bleve?badge) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) modern text indexing in go - [blevesearch.com](http://www.blevesearch.com/) diff --git a/vendor/github.com/blevesearch/bleve/builder.go b/vendor/github.com/blevesearch/bleve/builder.go new file mode 100644 index 000000000..de00c97b6 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/builder.go @@ -0,0 +1,94 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bleve + +import ( + "encoding/json" + "fmt" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch" + "github.com/blevesearch/bleve/mapping" +) + +type builderImpl struct { + b index.IndexBuilder + m mapping.IndexMapping +} + +func (b *builderImpl) Index(id string, data interface{}) error { + if id == "" { + return ErrorEmptyID + } + + doc := document.NewDocument(id) + err := b.m.MapDocument(doc, data) + if err != nil { + return err + } + err = b.b.Index(doc) + return err +} + +func (b *builderImpl) Close() error { + return b.b.Close() +} + +func newBuilder(path string, mapping mapping.IndexMapping, config map[string]interface{}) (Builder, error) { + if path == "" { + return nil, fmt.Errorf("builder requires path") + } + + err := mapping.Validate() + if err != nil { + return nil, err + } + + if config == nil { + config = map[string]interface{}{} + } + + // the builder does not have an API to interact with internal storage + // however we can pass k/v pairs through the config + mappingBytes, err := json.Marshal(mapping) + if err != nil { + return nil, err + } + config["internal"] = map[string][]byte{ + string(mappingInternalKey): mappingBytes, + } + + // do not use real config, as these are options for the builder, + // not the resulting index + meta := newIndexMeta(scorch.Name, scorch.Name, map[string]interface{}{}) + err = meta.Save(path) + if err != nil { + return nil, err + } + + config["path"] = indexStorePath(path) + + b, err := scorch.NewBuilder(config) + if err != nil { + return nil, err + } + rv := &builderImpl{ + b: b, + m: mapping, + } + + return rv, nil +} diff --git a/vendor/github.com/blevesearch/bleve/go.mod b/vendor/github.com/blevesearch/bleve/go.mod index d38cf8f92..9a12454fd 100644 --- a/vendor/github.com/blevesearch/bleve/go.mod +++ b/vendor/github.com/blevesearch/bleve/go.mod @@ -3,16 +3,17 @@ module github.com/blevesearch/bleve go 1.13 require ( - github.com/RoaringBitmap/roaring v0.4.21 + github.com/RoaringBitmap/roaring v0.4.23 github.com/blevesearch/blevex v0.0.0-20190916190636-152f0fe5c040 github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.7 - github.com/blevesearch/zap/v12 v12.0.7 - github.com/couchbase/ghistogram v0.1.0 // indirect + github.com/blevesearch/zap/v11 v11.0.10 + github.com/blevesearch/zap/v12 v12.0.10 + github.com/blevesearch/zap/v13 v13.0.2 + github.com/blevesearch/zap/v14 v14.0.1 github.com/couchbase/moss v0.1.0 - github.com/couchbase/vellum v1.0.1 + github.com/couchbase/vellum v1.0.2 github.com/golang/protobuf v1.3.2 github.com/kljensen/snowball v0.6.0 github.com/rcrowley/go-metrics v0.0.0-20190826022208-cac0b30c2563 @@ -20,6 +21,6 @@ require ( github.com/steveyen/gtreap v0.1.0 github.com/syndtr/goleveldb v1.0.0 github.com/willf/bitset v1.1.10 - go.etcd.io/bbolt v1.3.4 + go.etcd.io/bbolt v1.3.5 golang.org/x/text v0.3.0 ) diff --git a/vendor/github.com/blevesearch/bleve/index.go b/vendor/github.com/blevesearch/bleve/index.go index ef6ede934..974358b81 100644 --- a/vendor/github.com/blevesearch/bleve/index.go +++ b/vendor/github.com/blevesearch/bleve/index.go @@ -293,3 +293,17 @@ func Open(path string) (Index, error) { func OpenUsing(path string, runtimeConfig map[string]interface{}) (Index, error) { return openIndexUsing(path, runtimeConfig) } + +// Builder is a limited interface, used to build indexes in an offline mode. +// Items cannot be updated or deleted, and the caller MUST ensure a document is +// indexed only once. +type Builder interface { + Index(id string, data interface{}) error + Close() error +} + +// NewBuilder creates a builder, which will build an index at the specified path, +// using the specified mapping and options. +func NewBuilder(path string, mapping mapping.IndexMapping, config map[string]interface{}) (Builder, error) { + return newBuilder(path, mapping, config) +} diff --git a/vendor/github.com/blevesearch/bleve/index/index.go b/vendor/github.com/blevesearch/bleve/index/index.go index 3e866f3aa..551f8de84 100644 --- a/vendor/github.com/blevesearch/bleve/index/index.go +++ b/vendor/github.com/blevesearch/bleve/index/index.go @@ -367,3 +367,10 @@ type OptimizableContext interface { type DocValueReader interface { VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error } + +// IndexBuilder is an interface supported by some index schemes +// to allow direct write-only index building +type IndexBuilder interface { + Index(doc *document.Document) error + Close() error +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/builder.go b/vendor/github.com/blevesearch/bleve/index/scorch/builder.go new file mode 100644 index 000000000..1f4b41d63 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/index/scorch/builder.go @@ -0,0 +1,334 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "fmt" + "io/ioutil" + "os" + "sync" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + bolt "go.etcd.io/bbolt" +) + +const DefaultBuilderBatchSize = 1000 +const DefaultBuilderMergeMax = 10 + +type Builder struct { + m sync.Mutex + segCount uint64 + path string + buildPath string + segPaths []string + batchSize int + mergeMax int + batch *index.Batch + internal map[string][]byte + segPlugin segment.Plugin +} + +func NewBuilder(config map[string]interface{}) (*Builder, error) { + path, ok := config["path"].(string) + if !ok { + return nil, fmt.Errorf("must specify path") + } + + buildPathPrefix, _ := config["buildPathPrefix"].(string) + buildPath, err := ioutil.TempDir(buildPathPrefix, "scorch-offline-build") + if err != nil { + return nil, err + } + + rv := &Builder{ + path: path, + buildPath: buildPath, + mergeMax: DefaultBuilderMergeMax, + batchSize: DefaultBuilderBatchSize, + batch: index.NewBatch(), + segPlugin: defaultSegmentPlugin, + } + + err = rv.parseConfig(config) + if err != nil { + return nil, fmt.Errorf("error parsing builder config: %v", err) + } + + return rv, nil +} + +func (o *Builder) parseConfig(config map[string]interface{}) (err error) { + if v, ok := config["mergeMax"]; ok { + var t int + if t, err = parseToInteger(v); err != nil { + return fmt.Errorf("mergeMax parse err: %v", err) + } + if t > 0 { + o.mergeMax = t + } + } + + if v, ok := config["batchSize"]; ok { + var t int + if t, err = parseToInteger(v); err != nil { + return fmt.Errorf("batchSize parse err: %v", err) + } + if t > 0 { + o.batchSize = t + } + } + + if v, ok := config["internal"]; ok { + if vinternal, ok := v.(map[string][]byte); ok { + o.internal = vinternal + } + } + + forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config) + if err != nil { + return err + } + if forcedSegmentType != "" && forcedSegmentVersion != 0 { + segPlugin, err := chooseSegmentPlugin(forcedSegmentType, + uint32(forcedSegmentVersion)) + if err != nil { + return err + } + o.segPlugin = segPlugin + } + + return nil +} + +// Index will place the document into the index. +// It is invalid to index the same document multiple times. +func (o *Builder) Index(doc *document.Document) error { + o.m.Lock() + defer o.m.Unlock() + + o.batch.Update(doc) + + return o.maybeFlushBatchLOCKED(o.batchSize) +} + +func (o *Builder) maybeFlushBatchLOCKED(moreThan int) error { + if len(o.batch.IndexOps) >= moreThan { + defer o.batch.Reset() + return o.executeBatchLOCKED(o.batch) + } + return nil +} + +func (o *Builder) executeBatchLOCKED(batch *index.Batch) (err error) { + analysisResults := make([]*index.AnalysisResult, 0, len(batch.IndexOps)) + for _, doc := range batch.IndexOps { + if doc != nil { + // insert _id field + doc.AddField(document.NewTextFieldCustom("_id", nil, []byte(doc.ID), document.IndexField|document.StoreField, nil)) + // perform analysis directly + analysisResult := analyze(doc) + analysisResults = append(analysisResults, analysisResult) + } + } + + seg, _, err := o.segPlugin.New(analysisResults) + if err != nil { + return fmt.Errorf("error building segment base: %v", err) + } + + filename := zapFileName(o.segCount) + o.segCount++ + path := o.buildPath + string(os.PathSeparator) + filename + + if segUnpersisted, ok := seg.(segment.UnpersistedSegment); ok { + err = segUnpersisted.Persist(path) + if err != nil { + return fmt.Errorf("error persisting segment base to %s: %v", path, err) + } + + o.segPaths = append(o.segPaths, path) + return nil + } + + return fmt.Errorf("new segment does not implement unpersisted: %T", seg) +} + +func (o *Builder) doMerge() error { + // as long as we have more than 1 segment, keep merging + for len(o.segPaths) > 1 { + + // merge the next number of segments into one new one + // or, if there are fewer than remaining, merge them all + mergeCount := o.mergeMax + if mergeCount > len(o.segPaths) { + mergeCount = len(o.segPaths) + } + + mergePaths := o.segPaths[0:mergeCount] + o.segPaths = o.segPaths[mergeCount:] + + // open each of the segments to be merged + mergeSegs := make([]segment.Segment, 0, mergeCount) + + // closeOpenedSegs attempts to close all opened + // segments even if an error occurs, in which case + // the first error is returned + closeOpenedSegs := func() error { + var err error + for _, seg := range mergeSegs { + clErr := seg.Close() + if clErr != nil && err == nil { + err = clErr + } + } + return err + } + + for _, mergePath := range mergePaths { + seg, err := o.segPlugin.Open(mergePath) + if err != nil { + _ = closeOpenedSegs() + return fmt.Errorf("error opening segment (%s) for merge: %v", mergePath, err) + } + mergeSegs = append(mergeSegs, seg) + } + + // do the merge + mergedSegPath := o.buildPath + string(os.PathSeparator) + zapFileName(o.segCount) + drops := make([]*roaring.Bitmap, mergeCount) + _, _, err := o.segPlugin.Merge(mergeSegs, drops, mergedSegPath, nil, nil) + if err != nil { + _ = closeOpenedSegs() + return fmt.Errorf("error merging segments (%v): %v", mergePaths, err) + } + o.segCount++ + o.segPaths = append(o.segPaths, mergedSegPath) + + // close segments opened for merge + err = closeOpenedSegs() + if err != nil { + return fmt.Errorf("error closing opened segments: %v", err) + } + + // remove merged segments + for _, mergePath := range mergePaths { + err = os.RemoveAll(mergePath) + if err != nil { + return fmt.Errorf("error removing segment %s after merge: %v", mergePath, err) + } + } + } + + return nil +} + +func (o *Builder) Close() error { + o.m.Lock() + defer o.m.Unlock() + + // see if there is a partial batch + err := o.maybeFlushBatchLOCKED(1) + if err != nil { + return fmt.Errorf("error flushing batch before close: %v", err) + } + + // perform all the merging + err = o.doMerge() + if err != nil { + return fmt.Errorf("error while merging: %v", err) + } + + // ensure the store path exists + err = os.MkdirAll(o.path, 0700) + if err != nil { + return err + } + + // move final segment into place + // segment id 2 is chosen to match the behavior of a scorch + // index which indexes a single batch of data + finalSegPath := o.path + string(os.PathSeparator) + zapFileName(2) + err = os.Rename(o.segPaths[0], finalSegPath) + if err != nil { + return fmt.Errorf("error moving final segment into place: %v", err) + } + + // remove the buildPath, as it is no longer needed + err = os.RemoveAll(o.buildPath) + if err != nil { + return fmt.Errorf("error removing build path: %v", err) + } + + // prepare wrapping + seg, err := o.segPlugin.Open(finalSegPath) + if err != nil { + return fmt.Errorf("error opening final segment") + } + + // create a segment snapshot for this segment + ss := &SegmentSnapshot{ + segment: seg, + } + is := &IndexSnapshot{ + epoch: 3, // chosen to match scorch behavior when indexing a single batch + segment: []*SegmentSnapshot{ss}, + creator: "scorch-builder", + internal: o.internal, + } + + // create the root bolt + rootBoltPath := o.path + string(os.PathSeparator) + "root.bolt" + rootBolt, err := bolt.Open(rootBoltPath, 0600, nil) + if err != nil { + return err + } + + // start a write transaction + tx, err := rootBolt.Begin(true) + if err != nil { + return err + } + + // fill the root bolt with this fake index snapshot + _, _, err = prepareBoltSnapshot(is, tx, o.path, o.segPlugin) + if err != nil { + _ = tx.Rollback() + _ = rootBolt.Close() + return fmt.Errorf("error preparing bolt snapshot in root.bolt: %v", err) + } + + // commit bolt data + err = tx.Commit() + if err != nil { + _ = rootBolt.Close() + return fmt.Errorf("error committing bolt tx in root.bolt: %v", err) + } + + // close bolt + err = rootBolt.Close() + if err != nil { + return fmt.Errorf("error closing root.bolt: %v", err) + } + + // close final segment + err = seg.Close() + if err != nil { + return fmt.Errorf("error closing final segment: %v", err) + } + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/event.go b/vendor/github.com/blevesearch/bleve/index/scorch/event.go index dd79d6d06..8f3fc1914 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/event.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/event.go @@ -54,3 +54,11 @@ var EventKindBatchIntroductionStart = EventKind(5) // EventKindBatchIntroduction is fired when Batch() completes. var EventKindBatchIntroduction = EventKind(6) + +// EventKindMergeTaskIntroductionStart is fired when the merger is about to +// start the introduction of merged segment from a single merge task. +var EventKindMergeTaskIntroductionStart = EventKind(7) + +// EventKindMergeTaskIntroduction is fired when the merger has completed +// the introduction of merged segment from a single merge task. +var EventKindMergeTaskIntroduction = EventKind(8) diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/introducer.go b/vendor/github.com/blevesearch/bleve/index/scorch/introducer.go index e5f00f80e..7770c41c5 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/introducer.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/introducer.go @@ -45,13 +45,7 @@ type epochWatcher struct { notifyCh notificationChan } -type snapshotReversion struct { - snapshot *IndexSnapshot - applied chan error - persisted chan error -} - -func (s *Scorch) mainLoop() { +func (s *Scorch) introducerLoop() { var epochWatchers []*epochWatcher OUTER: for { @@ -389,6 +383,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } } } + var skipped bool // In case where all the docs in the newly merged segment getting // deleted by the time we reach here, can skip the introduction. if nextMerge.new != nil && @@ -411,6 +406,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality() memSegments++ } + } else { + skipped = true + atomic.AddUint64(&s.stats.TotFileMergeIntroductionsObsoleted, 1) } atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) @@ -435,8 +433,10 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } // notify requester that we incorporated this - nextMerge.notify <- newSnapshot - close(nextMerge.notify) + nextMerge.notifyCh <- &mergeTaskIntroStatus{ + indexSnapshot: newSnapshot, + skipped: skipped} + close(nextMerge.notifyCh) } func isMemorySegment(s *SegmentSnapshot) bool { diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/merge.go b/vendor/github.com/blevesearch/bleve/index/scorch/merge.go index 37dca529a..56c0953f4 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/merge.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/merge.go @@ -15,6 +15,7 @@ package scorch import ( + "context" "encoding/json" "fmt" "os" @@ -29,12 +30,16 @@ import ( func (s *Scorch) mergerLoop() { var lastEpochMergePlanned uint64 + var ctrlMsg *mergerCtrl mergePlannerOptions, err := s.parseMergePlannerOptions() if err != nil { s.fireAsyncError(fmt.Errorf("mergePlannerOption json parsing err: %v", err)) s.asyncTasks.Done() return } + ctrlMsgDflt := &mergerCtrl{ctx: context.Background(), + options: mergePlannerOptions, + doneCh: nil} OUTER: for { @@ -53,16 +58,30 @@ OUTER: atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch) s.rootLock.Unlock() - if ourSnapshot.epoch != lastEpochMergePlanned { + if ctrlMsg == nil && ourSnapshot.epoch != lastEpochMergePlanned { + ctrlMsg = ctrlMsgDflt + } + if ctrlMsg != nil { startTime := time.Now() // lets get started - err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions) + err := s.planMergeAtSnapshot(ctrlMsg.ctx, ctrlMsg.options, + ourSnapshot) if err != nil { atomic.StoreUint64(&s.iStats.mergeEpoch, 0) if err == segment.ErrClosed { // index has been closed _ = ourSnapshot.DecRef() + + // continue the workloop on a user triggered cancel + if ctrlMsg.doneCh != nil { + close(ctrlMsg.doneCh) + ctrlMsg = nil + continue OUTER + } + + // exit the workloop on index closure + ctrlMsg = nil break OUTER } s.fireAsyncError(fmt.Errorf("merging err: %v", err)) @@ -70,6 +89,12 @@ OUTER: atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1) continue OUTER } + + if ctrlMsg.doneCh != nil { + close(ctrlMsg.doneCh) + } + ctrlMsg = nil + lastEpochMergePlanned = ourSnapshot.epoch atomic.StoreUint64(&s.stats.LastMergedEpoch, ourSnapshot.epoch) @@ -90,6 +115,8 @@ OUTER: case <-s.closeCh: break OUTER case s.persisterNotifier <- ew: + case ctrlMsg = <-s.forceMergeRequestCh: + continue OUTER } // now wait for persister (but also detect close) @@ -97,6 +124,7 @@ OUTER: case <-s.closeCh: break OUTER case <-ew.notifyCh: + case ctrlMsg = <-s.forceMergeRequestCh: } } @@ -106,6 +134,58 @@ OUTER: s.asyncTasks.Done() } +type mergerCtrl struct { + ctx context.Context + options *mergeplan.MergePlanOptions + doneCh chan struct{} +} + +// ForceMerge helps users trigger a merge operation on +// an online scorch index. +func (s *Scorch) ForceMerge(ctx context.Context, + mo *mergeplan.MergePlanOptions) error { + // check whether force merge is already under processing + s.rootLock.Lock() + if s.stats.TotFileMergeForceOpsStarted > + s.stats.TotFileMergeForceOpsCompleted { + s.rootLock.Unlock() + return fmt.Errorf("force merge already in progress") + } + + s.stats.TotFileMergeForceOpsStarted++ + s.rootLock.Unlock() + + if mo != nil { + err := mergeplan.ValidateMergePlannerOptions(mo) + if err != nil { + return err + } + } else { + // assume the default single segment merge policy + mo = &mergeplan.SingleSegmentMergePlanOptions + } + msg := &mergerCtrl{options: mo, + doneCh: make(chan struct{}), + ctx: ctx, + } + + // request the merger perform a force merge + select { + case s.forceMergeRequestCh <- msg: + case <-s.closeCh: + return nil + } + + // wait for the force merge operation completion + select { + case <-msg.doneCh: + atomic.AddUint64(&s.stats.TotFileMergeForceOpsCompleted, 1) + case <-s.closeCh: + } + + return nil +} + func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, error) { mergePlannerOptions := mergeplan.DefaultMergePlanOptions @@ -128,8 +208,39 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, return &mergePlannerOptions, nil } -func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, - options *mergeplan.MergePlanOptions) error { +type closeChWrapper struct { + ch1 chan struct{} + ctx context.Context + closeCh chan struct{} +} + +func newCloseChWrapper(ch1 chan struct{}, + ctx context.Context) *closeChWrapper { + return &closeChWrapper{ch1: ch1, + ctx: ctx, + closeCh: make(chan struct{})} +} + +func (w *closeChWrapper) close() { + select { + case <-w.closeCh: + default: + close(w.closeCh) + } +} + +func (w *closeChWrapper) listen() { + select { + case <-w.ch1: + w.close() + case <-w.ctx.Done(): + w.close() + case <-w.closeCh: + } +} + +func (s *Scorch) planMergeAtSnapshot(ctx context.Context, + options *mergeplan.MergePlanOptions, ourSnapshot *IndexSnapshot) error { // build list of persisted segments in this snapshot var onlyPersistedSnapshots []mergeplan.Segment for _, segmentSnapshot := range ourSnapshot.segment { @@ -158,6 +269,11 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, // process tasks in serial for now var filenames []string + cw := newCloseChWrapper(s.closeCh, ctx) + defer cw.close() + + go cw.listen() + for _, task := range resultMergePlan.Tasks { if len(task.Segments) == 0 { atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1) @@ -194,8 +310,9 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, var oldNewDocNums map[uint64][]uint64 var seg segment.Segment + var filename string if len(segmentsToMerge) > 0 { - filename := zapFileName(newSegmentID) + filename = zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) path := s.path + string(os.PathSeparator) + filename @@ -203,7 +320,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) newDocNums, _, err := s.segPlugin.Merge(segmentsToMerge, docsToDrop, path, - s.closeCh, s) + cw.closeCh, s) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) @@ -240,9 +357,11 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, old: oldMap, oldNewDocNums: oldNewDocNums, new: seg, - notify: make(chan *IndexSnapshot), + notifyCh: make(chan *mergeTaskIntroStatus), } + s.fireEvent(EventKindMergeTaskIntroductionStart, 0) + // give it to the introducer select { case <-s.closeCh: @@ -255,18 +374,25 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, introStartTime := time.Now() // it is safe to blockingly wait for the merge introduction // here as the introducer is bound to handle the notify channel. - newSnapshot := <-sm.notify + introStatus := <-sm.notifyCh introTime := uint64(time.Since(introStartTime)) atomic.AddUint64(&s.stats.TotFileMergeZapIntroductionTime, introTime) if atomic.LoadUint64(&s.stats.MaxFileMergeZapIntroductionTime) < introTime { atomic.StoreUint64(&s.stats.MaxFileMergeZapIntroductionTime, introTime) } atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) - if newSnapshot != nil { - _ = newSnapshot.DecRef() + if introStatus != nil && introStatus.indexSnapshot != nil { + _ = introStatus.indexSnapshot.DecRef() + if introStatus.skipped { + // close the segment on skipping introduction. + s.unmarkIneligibleForRemoval(filename) + _ = seg.Close() + } } atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1) + + s.fireEvent(EventKindMergeTaskIntroduction, 0) } // once all the newly merged segment introductions are done, @@ -279,12 +405,17 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, return nil } +type mergeTaskIntroStatus struct { + indexSnapshot *IndexSnapshot + skipped bool +} + type segmentMerge struct { id uint64 old map[uint64]*SegmentSnapshot oldNewDocNums map[uint64][]uint64 new segment.Segment - notify chan *IndexSnapshot + notifyCh chan *mergeTaskIntroStatus } // perform a merging of the given SegmentBase instances into a new, @@ -334,7 +465,7 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, old: make(map[uint64]*SegmentSnapshot), oldNewDocNums: make(map[uint64][]uint64), new: seg, - notify: make(chan *IndexSnapshot), + notifyCh: make(chan *mergeTaskIntroStatus), } for i, idx := range sbsIndexes { @@ -351,11 +482,20 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, } // blockingly wait for the introduction to complete - newSnapshot := <-sm.notify - if newSnapshot != nil { + var newSnapshot *IndexSnapshot + introStatus := <-sm.notifyCh + if introStatus != nil && introStatus.indexSnapshot != nil { + newSnapshot = introStatus.indexSnapshot atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) atomic.AddUint64(&s.stats.TotMemMergeDone, 1) + if introStatus.skipped { + // close the segment on skipping introduction. + _ = newSnapshot.DecRef() + _ = seg.Close() + newSnapshot = nil + } } + return newSnapshot, newSegmentID, nil } diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go b/vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go index c2a0d3c64..752350662 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go @@ -134,6 +134,17 @@ var DefaultMergePlanOptions = MergePlanOptions{ ReclaimDeletesWeight: 2.0, } +// SingleSegmentMergePlanOptions helps in creating a +// single segment index. +var SingleSegmentMergePlanOptions = MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 1 << 30, + TierGrowth: 1.0, + SegmentsPerMergeTask: 10, + FloorSegmentSize: 1 << 30, + ReclaimDeletesWeight: 2.0, +} + // ------------------------------------------- func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { @@ -173,7 +184,7 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { calcBudget = CalcBudget } - budgetNumSegments := CalcBudget(eligiblesLiveSize, minLiveSize, o) + budgetNumSegments := calcBudget(eligiblesLiveSize, minLiveSize, o) scoreSegments := o.ScoreSegments if scoreSegments == nil { diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/optimize.go b/vendor/github.com/blevesearch/bleve/index/scorch/optimize.go index b9cb9228a..658354cd7 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/optimize.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/optimize.go @@ -16,10 +16,10 @@ package scorch import ( "fmt" - "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "sync/atomic" ) var OptimizeConjunction = true @@ -40,7 +40,7 @@ func (s *IndexSnapshotTermFieldReader) Optimize(kind string, return s.optimizeDisjunctionUnadorned(octx) } - return octx, nil + return nil, nil } var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256) @@ -161,16 +161,8 @@ func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err erro // We use an artificial term and field because the optimized // termFieldReader can represent multiple terms and fields. - oTFR := &IndexSnapshotTermFieldReader{ - term: OptimizeTFRConjunctionUnadornedTerm, - field: OptimizeTFRConjunctionUnadornedField, - snapshot: o.snapshot, - iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), - segmentOffset: 0, - includeFreq: false, - includeNorm: false, - includeTermVectors: false, - } + oTFR := o.snapshot.unadornedTermFieldReader( + OptimizeTFRConjunctionUnadornedTerm, OptimizeTFRConjunctionUnadornedField) var actualBMs []*roaring.Bitmap // Collected from regular posting lists. @@ -265,6 +257,7 @@ OUTER: oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(bm) } + atomic.AddUint64(&o.snapshot.parent.stats.TotTermSearchersStarted, uint64(1)) return oTFR, nil } @@ -277,7 +270,9 @@ OUTER: func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned( octx index.OptimizableContext) (index.OptimizableContext, error) { if octx == nil { - octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot} + octx = &OptimizeTFRDisjunctionUnadorned{ + snapshot: s.snapshot, + } } o, ok := octx.(*OptimizeTFRDisjunctionUnadorned) @@ -328,27 +323,12 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro } } } - - // Heuristic to skip the optimization if all the constituent - // bitmaps are too small, where the processing & resource - // overhead to create the OR'ed bitmap outweighs the benefit. - if cMax < OptimizeDisjunctionUnadornedMinChildCardinality { - return nil, nil - } } // We use an artificial term and field because the optimized // termFieldReader can represent multiple terms and fields. - oTFR := &IndexSnapshotTermFieldReader{ - term: OptimizeTFRDisjunctionUnadornedTerm, - field: OptimizeTFRDisjunctionUnadornedField, - snapshot: o.snapshot, - iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), - segmentOffset: 0, - includeFreq: false, - includeNorm: false, - includeTermVectors: false, - } + oTFR := o.snapshot.unadornedTermFieldReader( + OptimizeTFRDisjunctionUnadornedTerm, OptimizeTFRDisjunctionUnadornedField) var docNums []uint32 // Collected docNum's from 1-hit posting lists. var actualBMs []*roaring.Bitmap // Collected from regular posting lists. @@ -392,5 +372,25 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(bm) } + atomic.AddUint64(&o.snapshot.parent.stats.TotTermSearchersStarted, uint64(1)) return oTFR, nil } + +// ---------------------------------------------------------------- + +func (i *IndexSnapshot) unadornedTermFieldReader( + term []byte, field string) *IndexSnapshotTermFieldReader { + // This IndexSnapshotTermFieldReader will not be recycled, more + // conversation here: https://github.com/blevesearch/bleve/pull/1438 + return &IndexSnapshotTermFieldReader{ + term: term, + field: field, + snapshot: i, + iterators: make([]segment.PostingsIterator, len(i.segment)), + segmentOffset: 0, + includeFreq: false, + includeNorm: false, + includeTermVectors: false, + recycle: false, + } +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/persister.go b/vendor/github.com/blevesearch/bleve/index/scorch/persister.go index 30e75df77..498378a4f 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/persister.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/persister.go @@ -256,7 +256,7 @@ func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, // for sufficient in-memory segments to pile up for the next // memory merge cum persist loop. if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) && - po.PersisterNapTimeMSec > 0 && s.paused() == 0 { + po.PersisterNapTimeMSec > 0 && s.NumEventsBlocking() == 0 { select { case <-s.closeCh: case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)): @@ -333,7 +333,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot, // Perform in-memory segment merging only when the memory pressure is // below the configured threshold, else the persister performs the // direct persistence of segments. - if s.paused() < po.MemoryPressurePauseThreshold { + if s.NumEventsBlocking() < po.MemoryPressurePauseThreshold { persisted, err := s.persistSnapshotMaybeMerge(snapshot) if err != nil { return err @@ -428,55 +428,44 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( return true, nil } -func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { - // start a write transaction - tx, err := s.rootBolt.Begin(true) - if err != nil { - return err - } - // defer rollback on error - defer func() { - if err != nil { - _ = tx.Rollback() - } - }() - +func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, + segPlugin segment.Plugin) ([]string, map[uint64]string, error) { snapshotsBucket, err := tx.CreateBucketIfNotExists(boltSnapshotsBucket) if err != nil { - return err + return nil, nil, err } newSnapshotKey := segment.EncodeUvarintAscending(nil, snapshot.epoch) snapshotBucket, err := snapshotsBucket.CreateBucketIfNotExists(newSnapshotKey) if err != nil { - return err + return nil, nil, err } // persist meta values metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey) if err != nil { - return err + return nil, nil, err } - err = metaBucket.Put(boltMetaDataSegmentTypeKey, []byte(s.segPlugin.Type())) + err = metaBucket.Put(boltMetaDataSegmentTypeKey, []byte(segPlugin.Type())) if err != nil { - return err + return nil, nil, err } buf := make([]byte, binary.MaxVarintLen32) - binary.BigEndian.PutUint32(buf, s.segPlugin.Version()) + binary.BigEndian.PutUint32(buf, segPlugin.Version()) err = metaBucket.Put(boltMetaDataSegmentVersionKey, buf) if err != nil { - return err + return nil, nil, err } // persist internal values internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey) if err != nil { - return err + return nil, nil, err } // TODO optimize writing these in order? for k, v := range snapshot.internal { err = internalBucket.Put([]byte(k), v) if err != nil { - return err + return nil, nil, err } } @@ -488,49 +477,69 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { snapshotSegmentKey := segment.EncodeUvarintAscending(nil, segmentSnapshot.id) snapshotSegmentBucket, err := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey) if err != nil { - return err + return nil, nil, err } switch seg := segmentSnapshot.segment.(type) { case segment.PersistedSegment: - path := seg.Path() - filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator)) + segPath := seg.Path() + filename := strings.TrimPrefix(segPath, path+string(os.PathSeparator)) err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) if err != nil { - return err + return nil, nil, err } filenames = append(filenames, filename) case segment.UnpersistedSegment: // need to persist this to disk filename := zapFileName(segmentSnapshot.id) - path := s.path + string(os.PathSeparator) + filename + path := path + string(os.PathSeparator) + filename err = seg.Persist(path) if err != nil { - return fmt.Errorf("error persisting segment: %v", err) + return nil, nil, fmt.Errorf("error persisting segment: %v", err) } newSegmentPaths[segmentSnapshot.id] = path err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) if err != nil { - return err + return nil, nil, err } filenames = append(filenames, filename) - default: - return fmt.Errorf("unknown segment type: %T", seg) + return nil, nil, fmt.Errorf("unknown segment type: %T", seg) } // store current deleted bits var roaringBuf bytes.Buffer if segmentSnapshot.deleted != nil { _, err = segmentSnapshot.deleted.WriteTo(&roaringBuf) if err != nil { - return fmt.Errorf("error persisting roaring bytes: %v", err) + return nil, nil, fmt.Errorf("error persisting roaring bytes: %v", err) } err = snapshotSegmentBucket.Put(boltDeletedKey, roaringBuf.Bytes()) if err != nil { - return err + return nil, nil, err } } } + return filenames, newSegmentPaths, nil +} + +func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { + // start a write transaction + tx, err := s.rootBolt.Begin(true) + if err != nil { + return err + } + // defer rollback on error + defer func() { + if err != nil { + _ = tx.Rollback() + } + }() + + filenames, newSegmentPaths, err := prepareBoltSnapshot(snapshot, tx, s.path, s.segPlugin) + if err != nil { + return err + } + // we need to swap in a new root only when we've persisted 1 or // more segments -- whereby the new root would have 1-for-1 // replacements of in-memory segments with file-based segments @@ -780,12 +789,6 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return rv, nil } -type uint64Descending []uint64 - -func (p uint64Descending) Len() int { return len(p) } -func (p uint64Descending) Less(i, j int) bool { return p[i] > p[j] } -func (p uint64Descending) Swap(i, j int) { p[i], p[j] = p[j], p[i] } - func (s *Scorch) removeOldData() { removed, err := s.removeOldBoltSnapshots() if err != nil { diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_rollback.go b/vendor/github.com/blevesearch/bleve/index/scorch/rollback.go similarity index 100% rename from vendor/github.com/blevesearch/bleve/index/scorch/snapshot_rollback.go rename to vendor/github.com/blevesearch/bleve/index/scorch/rollback.go diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/scorch.go b/vendor/github.com/blevesearch/bleve/index/scorch/scorch.go index 80f9e3a79..ba98a460d 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/scorch.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/scorch.go @@ -73,9 +73,7 @@ type Scorch struct { onEvent func(event Event) onAsyncError func(err error) - pauseLock sync.RWMutex - - pauseCount uint64 + forceMergeRequestCh chan *mergerCtrl segPlugin segment.Plugin } @@ -101,18 +99,15 @@ func NewScorch(storeName string, nextSnapshotEpoch: 1, closeCh: make(chan struct{}), ineligibleForRemoval: map[string]bool{}, + forceMergeRequestCh: make(chan *mergerCtrl, 1), segPlugin: defaultSegmentPlugin, } - // check if the caller has requested a specific segment type/version - forcedSegmentVersion, ok := config["forceSegmentVersion"].(int) - if ok { - forcedSegmentType, ok2 := config["forceSegmentType"].(string) - if !ok2 { - return nil, fmt.Errorf( - "forceSegmentVersion set to %d, must also specify forceSegmentType", forcedSegmentVersion) - } - + forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config) + if err != nil { + return nil, err + } + if forcedSegmentType != "" && forcedSegmentVersion != 0 { err := rv.loadSegmentPlugin(forcedSegmentType, uint32(forcedSegmentVersion)) if err != nil { @@ -140,30 +135,34 @@ func NewScorch(storeName string, return rv, nil } -func (s *Scorch) paused() uint64 { - s.pauseLock.Lock() - pc := s.pauseCount - s.pauseLock.Unlock() - return pc -} +// configForceSegmentTypeVersion checks if the caller has requested a +// specific segment type/version +func configForceSegmentTypeVersion(config map[string]interface{}) (string, uint32, error) { + forcedSegmentVersion, err := parseToInteger(config["forceSegmentVersion"]) + if err != nil { + return "", 0, nil + } -func (s *Scorch) incrPause() { - s.pauseLock.Lock() - s.pauseCount++ - s.pauseLock.Unlock() + forcedSegmentType, ok := config["forceSegmentType"].(string) + if !ok { + return "", 0, fmt.Errorf( + "forceSegmentVersion set to %d, must also specify forceSegmentType", forcedSegmentVersion) + } + + return forcedSegmentType, uint32(forcedSegmentVersion), nil } -func (s *Scorch) decrPause() { - s.pauseLock.Lock() - s.pauseCount-- - s.pauseLock.Unlock() +func (s *Scorch) NumEventsBlocking() uint64 { + eventsCompleted := atomic.LoadUint64(&s.stats.TotEventTriggerCompleted) + eventsStarted := atomic.LoadUint64(&s.stats.TotEventTriggerStarted) + return eventsStarted - eventsCompleted } func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) { if s.onEvent != nil { - s.incrPause() + atomic.AddUint64(&s.stats.TotEventTriggerStarted, 1) s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur}) - s.decrPause() + atomic.AddUint64(&s.stats.TotEventTriggerCompleted, 1) } } @@ -181,7 +180,7 @@ func (s *Scorch) Open() error { } s.asyncTasks.Add(1) - go s.mainLoop() + go s.introducerLoop() if !s.readOnly && s.path != "" { s.asyncTasks.Add(1) @@ -241,6 +240,7 @@ func (s *Scorch) openBolt() error { s.introducerNotifier = make(chan *epochWatcher, 1) s.persisterNotifier = make(chan *epochWatcher, 1) s.closeCh = make(chan struct{}) + s.forceMergeRequestCh = make(chan *mergerCtrl, 1) if !s.readOnly && s.path != "" { err := s.removeOldZapFiles() // Before persister or merger create any new files. @@ -567,6 +567,10 @@ func (s *Scorch) StatsMap() map[string]interface{} { } func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult { + return analyze(d) +} + +func analyze(d *document.Document) *index.AnalysisResult { rv := &index.AnalysisResult{ Document: d, Analyzed: make([]analysis.TokenFrequencies, len(d.Fields)+len(d.CompositeFields)), diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment/unadorned.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment/unadorned.go index 9a4d6c76c..db06562df 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment/unadorned.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment/unadorned.go @@ -24,7 +24,6 @@ var reflectStaticSizeUnadornedPostingsIteratorBitmap int var reflectStaticSizeUnadornedPostingsIterator1Hit int var reflectStaticSizeUnadornedPosting int - func init() { var pib UnadornedPostingsIteratorBitmap reflectStaticSizeUnadornedPostingsIteratorBitmap = int(reflect.TypeOf(pib).Size()) @@ -34,7 +33,7 @@ func init() { reflectStaticSizeUnadornedPosting = int(reflect.TypeOf(up).Size()) } -type UnadornedPostingsIteratorBitmap struct{ +type UnadornedPostingsIteratorBitmap struct { actual roaring.IntPeekable actualBM *roaring.Bitmap } @@ -72,16 +71,29 @@ func (i *UnadornedPostingsIteratorBitmap) Size() int { return reflectStaticSizeUnadornedPostingsIteratorBitmap } +func (i *UnadornedPostingsIteratorBitmap) ActualBitmap() *roaring.Bitmap { + return i.actualBM +} + +func (i *UnadornedPostingsIteratorBitmap) DocNum1Hit() (uint64, bool) { + return 0, false +} + +func (i *UnadornedPostingsIteratorBitmap) ReplaceActual(actual *roaring.Bitmap) { + i.actualBM = actual + i.actual = actual.Iterator() +} + func NewUnadornedPostingsIteratorFromBitmap(bm *roaring.Bitmap) PostingsIterator { return &UnadornedPostingsIteratorBitmap{ actualBM: bm, - actual: bm.Iterator(), + actual: bm.Iterator(), } } const docNum1HitFinished = math.MaxUint64 -type UnadornedPostingsIterator1Hit struct{ +type UnadornedPostingsIterator1Hit struct { docNum uint64 } @@ -145,4 +157,4 @@ func (p UnadornedPosting) Locations() []Location { func (p UnadornedPosting) Size() int { return reflectStaticSizeUnadornedPosting -} \ No newline at end of file +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/segment_plugin.go b/vendor/github.com/blevesearch/bleve/index/scorch/segment_plugin.go index 01eda7fbd..b830b2c05 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/segment_plugin.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/segment_plugin.go @@ -21,6 +21,8 @@ import ( zapv11 "github.com/blevesearch/zap/v11" zapv12 "github.com/blevesearch/zap/v12" + zapv13 "github.com/blevesearch/zap/v13" + zapv14 "github.com/blevesearch/zap/v14" ) var supportedSegmentPlugins map[string]map[uint32]segment.Plugin @@ -28,6 +30,8 @@ var defaultSegmentPlugin segment.Plugin func init() { ResetPlugins() + RegisterPlugin(zapv14.Plugin(), false) + RegisterPlugin(zapv13.Plugin(), false) RegisterPlugin(zapv12.Plugin(), false) RegisterPlugin(zapv11.Plugin(), true) } @@ -60,18 +64,28 @@ func SupportedSegmentTypeVersions(typ string) (rv []uint32) { return rv } -func (s *Scorch) loadSegmentPlugin(forcedSegmentType string, - forcedSegmentVersion uint32) error { +func chooseSegmentPlugin(forcedSegmentType string, + forcedSegmentVersion uint32) (segment.Plugin, error) { if versions, ok := supportedSegmentPlugins[forcedSegmentType]; ok { if segPlugin, ok := versions[uint32(forcedSegmentVersion)]; ok { - s.segPlugin = segPlugin - return nil + return segPlugin, nil } - return fmt.Errorf( + return nil, fmt.Errorf( "unsupported version %d for segment type: %s, supported: %v", forcedSegmentVersion, forcedSegmentType, SupportedSegmentTypeVersions(forcedSegmentType)) } - return fmt.Errorf("unsupported segment type: %s, supported: %v", + return nil, fmt.Errorf("unsupported segment type: %s, supported: %v", forcedSegmentType, SupportedSegmentTypes()) } + +func (s *Scorch) loadSegmentPlugin(forcedSegmentType string, + forcedSegmentVersion uint32) error { + segPlugin, err := chooseSegmentPlugin(forcedSegmentType, + forcedSegmentVersion) + if err != nil { + return err + } + s.segPlugin = segPlugin + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go index 47cc809b2..9d17bcb2c 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go @@ -303,9 +303,12 @@ func (i *IndexSnapshot) newDocIDReader(results chan *asynchSegmentResult) (index var err error for count := 0; count < len(i.segment); count++ { asr := <-results - if asr.err != nil && err != nil { - err = asr.err - } else { + if asr.err != nil { + if err == nil { + // returns the first error encountered + err = asr.err + } + } else if err == nil { rv.iterators[asr.index] = asr.docs.Iterator() } } @@ -511,10 +514,20 @@ func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnaps } } i.m2.Unlock() - return &IndexSnapshotTermFieldReader{} + return &IndexSnapshotTermFieldReader{ + recycle: true, + } } func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { + if !tfr.recycle { + // Do not recycle an optimized unadorned term field reader (used for + // ConjunctionUnadorned or DisjunctionUnadorned), during when a fresh + // roaring.Bitmap is built by AND-ing or OR-ing individual bitmaps, + // and we'll need to release them for GC. (See MB-40916) + return + } + i.parent.rootLock.RLock() obsolete := i.parent.root != i i.parent.rootLock.RUnlock() diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go index 5d56f1944..239f68fbe 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go @@ -45,6 +45,7 @@ type IndexSnapshotTermFieldReader struct { includeTermVectors bool currPosting segment.Posting currID index.IndexInternalID + recycle bool } func (i *IndexSnapshotTermFieldReader) Size() int { @@ -133,6 +134,8 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo if err != nil { return nil, err } + // close the current term field reader before replacing it with a new one + _ = i.Close() *i = *(i2.(*IndexSnapshotTermFieldReader)) } num, err := docInternalToNumber(ID) diff --git a/vendor/github.com/blevesearch/bleve/index/scorch/stats.go b/vendor/github.com/blevesearch/bleve/index/scorch/stats.go index e638362a7..626fff2e4 100644 --- a/vendor/github.com/blevesearch/bleve/index/scorch/stats.go +++ b/vendor/github.com/blevesearch/bleve/index/scorch/stats.go @@ -47,6 +47,9 @@ type Stats struct { TotTermSearchersStarted uint64 TotTermSearchersFinished uint64 + TotEventTriggerStarted uint64 + TotEventTriggerCompleted uint64 + TotIntroduceLoop uint64 TotIntroduceSegmentBeg uint64 TotIntroduceSegmentEnd uint64 @@ -82,6 +85,9 @@ type Stats struct { TotFileMergeLoopErr uint64 TotFileMergeLoopEnd uint64 + TotFileMergeForceOpsStarted uint64 + TotFileMergeForceOpsCompleted uint64 + TotFileMergePlan uint64 TotFileMergePlanErr uint64 TotFileMergePlanNone uint64 @@ -105,9 +111,10 @@ type Stats struct { TotFileMergeZapIntroductionTime uint64 MaxFileMergeZapIntroductionTime uint64 - TotFileMergeIntroductions uint64 - TotFileMergeIntroductionsDone uint64 - TotFileMergeIntroductionsSkipped uint64 + TotFileMergeIntroductions uint64 + TotFileMergeIntroductionsDone uint64 + TotFileMergeIntroductionsSkipped uint64 + TotFileMergeIntroductionsObsoleted uint64 CurFilesIneligibleForRemoval uint64 TotSnapshotsRemovedFromMetaStore uint64 diff --git a/vendor/github.com/blevesearch/bleve/index_alias_impl.go b/vendor/github.com/blevesearch/bleve/index_alias_impl.go index 4366fc795..5aa57d8ac 100644 --- a/vendor/github.com/blevesearch/bleve/index_alias_impl.go +++ b/vendor/github.com/blevesearch/bleve/index_alias_impl.go @@ -16,7 +16,6 @@ package bleve import ( "context" - "sort" "sync" "time" @@ -44,6 +43,16 @@ func NewIndexAlias(indexes ...Index) *indexAliasImpl { } } +// VisitIndexes invokes the visit callback on every +// indexes included in the index alias. +func (i *indexAliasImpl) VisitIndexes(visit func(Index)) { + i.mutex.RLock() + for _, idx := range i.indexes { + visit(idx) + } + i.mutex.RUnlock() +} + func (i *indexAliasImpl) isAliasToSingleIndex() error { if len(i.indexes) < 1 { return ErrorAliasEmpty @@ -511,10 +520,11 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se } } + sortFunc := req.SortFunc() // sort all hits with the requested order if len(req.Sort) > 0 { sorter := newSearchHitSorter(req.Sort, sr.Hits) - sort.Sort(sorter) + sortFunc(sorter) } // now skip over the correct From @@ -539,7 +549,7 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se req.Sort.Reverse() // resort using the original order mhs := newSearchHitSorter(req.Sort, sr.Hits) - sort.Sort(mhs) + sortFunc(mhs) // reset request req.SearchBefore = req.SearchAfter req.SearchAfter = nil diff --git a/vendor/github.com/blevesearch/bleve/index_impl.go b/vendor/github.com/blevesearch/bleve/index_impl.go index 6324d960e..629cc9b2f 100644 --- a/vendor/github.com/blevesearch/bleve/index_impl.go +++ b/vendor/github.com/blevesearch/bleve/index_impl.go @@ -19,7 +19,6 @@ import ( "encoding/json" "fmt" "os" - "sort" "sync" "sync/atomic" "time" @@ -579,7 +578,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr req.Sort.Reverse() // resort using the original order mhs := newSearchHitSorter(req.Sort, hits) - sort.Sort(mhs) + req.SortFunc()(mhs) // reset request req.SearchBefore = req.SearchAfter req.SearchAfter = nil diff --git a/vendor/github.com/blevesearch/bleve/mapping/document.go b/vendor/github.com/blevesearch/bleve/mapping/document.go index 15cb6b5fa..355a602e5 100644 --- a/vendor/github.com/blevesearch/bleve/mapping/document.go +++ b/vendor/github.com/blevesearch/bleve/mapping/document.go @@ -251,7 +251,6 @@ func (dm *DocumentMapping) AddFieldMapping(fm *FieldMapping) { // UnmarshalJSON offers custom unmarshaling with optional strict validation func (dm *DocumentMapping) UnmarshalJSON(data []byte) error { - var tmp map[string]json.RawMessage err := json.Unmarshal(data, &tmp) if err != nil { @@ -308,8 +307,8 @@ func (dm *DocumentMapping) UnmarshalJSON(data []byte) error { } func (dm *DocumentMapping) defaultAnalyzerName(path []string) string { - rv := "" current := dm + rv := current.DefaultAnalyzer for _, pathElement := range path { var ok bool current, ok = current.Properties[pathElement] diff --git a/vendor/github.com/blevesearch/bleve/mapping/index.go b/vendor/github.com/blevesearch/bleve/mapping/index.go index 602764cbb..21ca5cce3 100644 --- a/vendor/github.com/blevesearch/bleve/mapping/index.go +++ b/vendor/github.com/blevesearch/bleve/mapping/index.go @@ -101,26 +101,26 @@ func (im *IndexMappingImpl) AddCustomTokenFilter(name string, config map[string] // returned analyzer is registered in the IndexMapping. // // bleve comes with predefined analyzers, like -// github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer. They are +// github.com/blevesearch/bleve/analysis/analyzer/custom. They are // available only if their package is imported by client code. To achieve this, // use their metadata to fill configuration entries: // // import ( -// "github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer" -// "github.com/blevesearch/bleve/analysis/char_filters/html_char_filter" -// "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter" -// "github.com/blevesearch/bleve/analysis/tokenizers/unicode" +// "github.com/blevesearch/bleve/analysis/analyzer/custom" +// "github.com/blevesearch/bleve/analysis/char/html" +// "github.com/blevesearch/bleve/analysis/token/lowercase" +// "github.com/blevesearch/bleve/analysis/tokenizer/unicode" // ) // // m := bleve.NewIndexMapping() // err := m.AddCustomAnalyzer("html", map[string]interface{}{ -// "type": custom_analyzer.Name, +// "type": custom.Name, // "char_filters": []string{ -// html_char_filter.Name, +// html.Name, // }, // "tokenizer": unicode.Name, // "token_filters": []string{ -// lower_case_filter.Name, +// lowercase.Name, // ... // }, // }) diff --git a/vendor/github.com/blevesearch/bleve/search.go b/vendor/github.com/blevesearch/bleve/search.go index b337edc9e..f67450779 100644 --- a/vendor/github.com/blevesearch/bleve/search.go +++ b/vendor/github.com/blevesearch/bleve/search.go @@ -18,6 +18,7 @@ import ( "encoding/json" "fmt" "reflect" + "sort" "time" "github.com/blevesearch/bleve/analysis" @@ -264,6 +265,7 @@ func (h *HighlightRequest) AddField(field string) { // Score controls the kind of scoring performed // SearchAfter supports deep paging by providing a minimum sort key // SearchBefore supports deep paging by providing a maximum sort key +// sortFunc specifies the sort implementation to use for sorting results. // // A special field named "*" can be used to return all fields. type SearchRequest struct { @@ -279,6 +281,8 @@ type SearchRequest struct { Score string `json:"score,omitempty"` SearchAfter []string `json:"search_after"` SearchBefore []string `json:"search_before"` + + sortFunc func(sort.Interface) } func (r *SearchRequest) Validate() error { @@ -606,3 +610,22 @@ func MemoryNeededForSearchResult(req *SearchRequest) uint64 { return uint64(estimate) } + +// SetSortFunc sets the sort implementation to use when sorting hits. +// +// SearchRequests can specify a custom sort implementation to meet +// their needs. For instance, by specifying a parallel sort +// that uses all available cores. +func (r *SearchRequest) SetSortFunc(s func(sort.Interface)) { + r.sortFunc = s +} + +// SortFunc returns the sort implementation to use when sorting hits. +// Defaults to sort.Sort. +func (r *SearchRequest) SortFunc() func(data sort.Interface) { + if r.sortFunc != nil { + return r.sortFunc + } + + return sort.Sort +} diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction.go index 6a296b68f..f47da27c4 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction.go @@ -16,7 +16,6 @@ package searcher import ( "fmt" - "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) @@ -37,6 +36,11 @@ func NewDisjunctionSearcher(indexReader index.IndexReader, return newDisjunctionSearcher(indexReader, qsearchers, min, options, true) } +func optionsDisjunctionOptimizable(options search.SearcherOptions) bool { + rv := options.Score == "none" && !options.IncludeTermVectors + return rv +} + func newDisjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions, limit bool) (search.Searcher, error) { @@ -44,7 +48,7 @@ func newDisjunctionSearcher(indexReader index.IndexReader, // do not need extra information like freq-norm's or term vectors // and the requested min is simple if len(qsearchers) > 1 && min <= 1 && - options.Score == "none" && !options.IncludeTermVectors { + optionsDisjunctionOptimizable(options) { rv, err := optimizeCompositeSearcher("disjunction:unadorned", indexReader, qsearchers, options) if err != nil || rv != nil { @@ -103,7 +107,7 @@ func tooManyClauses(count int) bool { return false } -func tooManyClausesErr(count int) error { - return fmt.Errorf("TooManyClauses[%d > maxClauseCount, which is set to %d]", - count, DisjunctionMaxClauseCount) +func tooManyClausesErr(field string, count int) error { + return fmt.Errorf("TooManyClauses over field: `%s` [%d > maxClauseCount,"+ + " which is set to %d]", field, count, DisjunctionMaxClauseCount) } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go index ec133f1f8..7f0a5a00e 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go @@ -62,7 +62,7 @@ func newDisjunctionHeapSearcher(indexReader index.IndexReader, limit bool) ( *DisjunctionHeapSearcher, error) { if limit && tooManyClauses(len(searchers)) { - return nil, tooManyClausesErr(len(searchers)) + return nil, tooManyClausesErr("", len(searchers)) } // build our searcher @@ -310,7 +310,7 @@ func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableCo } } - return octx, nil + return nil, nil } // heap impl diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go index e47f39ad0..dc566ade5 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go @@ -50,7 +50,7 @@ func newDisjunctionSliceSearcher(indexReader index.IndexReader, limit bool) ( *DisjunctionSliceSearcher, error) { if limit && tooManyClauses(len(qsearchers)) { - return nil, tooManyClausesErr(len(qsearchers)) + return nil, tooManyClausesErr("", len(qsearchers)) } // build the downstream searchers searchers := make(OrderedSearcherList, len(qsearchers)) @@ -294,5 +294,5 @@ func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableC } } - return octx, nil + return nil, nil } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_fuzzy.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_fuzzy.go index 8176e59b5..aca8a7d9f 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_fuzzy.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_fuzzy.go @@ -75,7 +75,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, for err == nil && tfd != nil { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return nil, tooManyClausesErr(len(rv)) + return nil, tooManyClausesErr(field, len(rv)) } tfd, err = fieldDict.Next() } @@ -107,7 +107,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, if !exceeded && ld <= fuzziness { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return nil, tooManyClausesErr(len(rv)) + return nil, tooManyClausesErr(field, len(rv)) } } tfd, err = fieldDict.Next() diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_geoboundingbox.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_geoboundingbox.go index c4b8af927..76157f01a 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_geoboundingbox.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_geoboundingbox.go @@ -24,7 +24,7 @@ import ( type filterFunc func(key []byte) bool -var GeoBitsShift1 = (geo.GeoBits << 1) +var GeoBitsShift1 = geo.GeoBits << 1 var GeoBitsShift1Minus1 = GeoBitsShift1 - 1 func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, @@ -100,30 +100,42 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, var geoMaxShift = document.GeoPrecisionStep * 4 var geoDetailLevel = ((geo.GeoBits << 1) - geoMaxShift) / 2 +type closeFunc func() error func ComputeGeoRange(term uint64, shift uint, sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool, indexReader index.IndexReader, field string) ( onBoundary [][]byte, notOnBoundary [][]byte, err error) { - preallocBytesLen := 32 - preallocBytes := make([]byte, preallocBytesLen) - makePrefixCoded := func(in int64, shift uint) (rv numeric.PrefixCoded) { - if len(preallocBytes) <= 0 { - preallocBytesLen = preallocBytesLen * 2 - preallocBytes = make([]byte, preallocBytesLen) - } - - rv, preallocBytes, err = - numeric.NewPrefixCodedInt64Prealloc(in, shift, preallocBytes) + isIndexed, closeF, err := buildIsIndexedFunc(indexReader, field) + if closeF != nil { + defer func() { + cerr := closeF() + if cerr != nil { + err = cerr + } + }() + } - return rv + grc := &geoRangeCompute{ + preallocBytesLen: 32, + preallocBytes: make([]byte, 32), + sminLon: sminLon, + sminLat: sminLat, + smaxLon: smaxLon, + smaxLat: smaxLat, + checkBoundaries: checkBoundaries, + isIndexed: isIndexed, } - var fieldDict index.FieldDictContains - var isIndexed filterFunc + grc.computeGeoRange(term, shift) + + return grc.onBoundary, grc.notOnBoundary, nil +} + +func buildIsIndexedFunc(indexReader index.IndexReader, field string) (isIndexed filterFunc, closeF closeFunc, err error) { if irr, ok := indexReader.(index.IndexReaderContains); ok { - fieldDict, err = irr.FieldDictContains(field) + fieldDict, err := irr.FieldDictContains(field) if err != nil { return nil, nil, err } @@ -132,22 +144,18 @@ func ComputeGeoRange(term uint64, shift uint, found, err := fieldDict.Contains(term) return err == nil && found } - } - defer func() { - if fieldDict != nil { + closeF = func() error { if fd, ok := fieldDict.(index.FieldDict); ok { - cerr := fd.Close() - if cerr != nil { - err = cerr + err := fd.Close() + if err != nil { + return err } } + return nil } - }() - - if isIndexed == nil { + } else if indexReader != nil { isIndexed = func(term []byte) bool { - if indexReader != nil { reader, err := indexReader.TermFieldReader(term, field, false, false, false) if err != nil || reader == nil { return false @@ -157,68 +165,15 @@ func ComputeGeoRange(term uint64, shift uint, return false } _ = reader.Close() - } - return true + return true } - } - var computeGeoRange func(term uint64, shift uint) // declare for recursion - - relateAndRecurse := func(start, end uint64, res, level uint) { - minLon := geo.MortonUnhashLon(start) - minLat := geo.MortonUnhashLat(start) - maxLon := geo.MortonUnhashLon(end) - maxLat := geo.MortonUnhashLat(end) - - within := res%document.GeoPrecisionStep == 0 && - geo.RectWithin(minLon, minLat, maxLon, maxLat, - sminLon, sminLat, smaxLon, smaxLat) - if within || (level == geoDetailLevel && - geo.RectIntersects(minLon, minLat, maxLon, maxLat, - sminLon, sminLat, smaxLon, smaxLat)) { - codedTerm := makePrefixCoded(int64(start), res) - if isIndexed(codedTerm) { - if !within && checkBoundaries { - onBoundary = append(onBoundary, codedTerm) - } else { - notOnBoundary = append(notOnBoundary, codedTerm) - } - } - } else if level < geoDetailLevel && - geo.RectIntersects(minLon, minLat, maxLon, maxLat, - sminLon, sminLat, smaxLon, smaxLat) { - computeGeoRange(start, res-1) + } else { + isIndexed = func([]byte) bool { + return true } } - - computeGeoRange = func(term uint64, shift uint) { - if err != nil { - return - } - - split := term | uint64(0x1)<> 1 - - relateAndRecurse(term, lowerMax, shift, level) - relateAndRecurse(split, upperMax, shift, level) - } - - computeGeoRange(term, shift) - - if err != nil { - return nil, nil, err - } - - return onBoundary, notOnBoundary, err + return isIndexed, closeF, err } func buildRectFilter(dvReader index.DocValueReader, field string, @@ -252,3 +207,66 @@ func buildRectFilter(dvReader index.DocValueReader, field string, return false } } + +type geoRangeCompute struct { + preallocBytesLen int + preallocBytes []byte + sminLon, sminLat, smaxLon, smaxLat float64 + checkBoundaries bool + onBoundary, notOnBoundary [][]byte + isIndexed func(term []byte) bool +} + +func (grc *geoRangeCompute) makePrefixCoded(in int64, shift uint) (rv numeric.PrefixCoded) { + if len(grc.preallocBytes) <= 0 { + grc.preallocBytesLen = grc.preallocBytesLen * 2 + grc.preallocBytes = make([]byte, grc.preallocBytesLen) + } + + rv, grc.preallocBytes, _ = + numeric.NewPrefixCodedInt64Prealloc(in, shift, grc.preallocBytes) + + return rv +} + +func (grc *geoRangeCompute) computeGeoRange(term uint64, shift uint) { + split := term | uint64(0x1)<> 1 + + within := res%document.GeoPrecisionStep == 0 && + geo.RectWithin(minLon, minLat, maxLon, maxLat, + grc.sminLon, grc.sminLat, grc.smaxLon, grc.smaxLat) + if within || (level == geoDetailLevel && + geo.RectIntersects(minLon, minLat, maxLon, maxLat, + grc.sminLon, grc.sminLat, grc.smaxLon, grc.smaxLat)) { + codedTerm := grc.makePrefixCoded(int64(start), res) + if grc.isIndexed(codedTerm) { + if !within && grc.checkBoundaries { + grc.onBoundary = append(grc.onBoundary, codedTerm) + } else { + grc.notOnBoundary = append(grc.notOnBoundary, codedTerm) + } + } + } else if level < geoDetailLevel && + geo.RectIntersects(minLon, minLat, maxLon, maxLat, + grc.sminLon, grc.sminLat, grc.smaxLon, grc.smaxLat) { + grc.computeGeoRange(start, res-1) + } +} \ No newline at end of file diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_multi_term.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_multi_term.go index c48366ee2..70a2fa38c 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_multi_term.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_multi_term.go @@ -15,6 +15,7 @@ package searcher import ( + "fmt" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) @@ -22,10 +23,113 @@ import ( func NewMultiTermSearcher(indexReader index.IndexReader, terms []string, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { - if limit && tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(len(terms)) + + if tooManyClauses(len(terms)) { + if optionsDisjunctionOptimizable(options) { + return optimizeMultiTermSearcher(indexReader, terms, field, boost, options) + } + if limit { + return nil, tooManyClausesErr(field, len(terms)) + } + } + + qsearchers, err := makeBatchSearchers(indexReader, terms, field, boost, options) + if err != nil { + return nil, err } + // build disjunction searcher of these ranges + return newMultiTermSearcherInternal(indexReader, qsearchers, field, boost, + options, limit) +} + +func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, + field string, boost float64, options search.SearcherOptions, limit bool) ( + search.Searcher, error) { + + if tooManyClauses(len(terms)) { + if optionsDisjunctionOptimizable(options) { + return optimizeMultiTermSearcherBytes(indexReader, terms, field, boost, options) + } + + if limit { + return nil, tooManyClausesErr(field, len(terms)) + } + } + + qsearchers, err := makeBatchSearchersBytes(indexReader, terms, field, boost, options) + if err != nil { + return nil, err + } + + // build disjunction searcher of these ranges + return newMultiTermSearcherInternal(indexReader, qsearchers, field, boost, + options, limit) +} + +func newMultiTermSearcherInternal(indexReader index.IndexReader, + searchers []search.Searcher, field string, boost float64, + options search.SearcherOptions, limit bool) ( + search.Searcher, error) { + + // build disjunction searcher of these ranges + searcher, err := newDisjunctionSearcher(indexReader, searchers, 0, options, + limit) + if err != nil { + for _, s := range searchers { + _ = s.Close() + } + return nil, err + } + + return searcher, nil +} + +func optimizeMultiTermSearcher(indexReader index.IndexReader, terms []string, + field string, boost float64, options search.SearcherOptions) ( + search.Searcher, error) { + var finalSearcher search.Searcher + for len(terms) > 0 { + var batchTerms []string + if len(terms) > DisjunctionMaxClauseCount { + batchTerms = terms[:DisjunctionMaxClauseCount] + terms = terms[DisjunctionMaxClauseCount:] + } else { + batchTerms = terms + terms = nil + } + batch, err := makeBatchSearchers(indexReader, batchTerms, field, boost, options) + if err != nil { + return nil, err + } + if finalSearcher != nil { + batch = append(batch, finalSearcher) + } + cleanup := func() { + for _, searcher := range batch { + if searcher != nil { + _ = searcher.Close() + } + } + } + finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned", + indexReader, batch, options) + // all searchers in batch should be closed, regardless of error or optimization failure + // either we're returning, or continuing and only finalSearcher is needed for next loop + cleanup() + if err != nil { + return nil, err + } + if finalSearcher == nil { + return nil, fmt.Errorf("unable to optimize") + } + } + return finalSearcher, nil +} + +func makeBatchSearchers(indexReader index.IndexReader, terms []string, field string, + boost float64, options search.SearcherOptions) ([]search.Searcher, error) { + qsearchers := make([]search.Searcher, len(terms)) qsearchersClose := func() { for _, searcher := range qsearchers { @@ -42,17 +146,54 @@ func NewMultiTermSearcher(indexReader index.IndexReader, terms []string, return nil, err } } - // build disjunction searcher of these ranges - return newMultiTermSearcherBytes(indexReader, qsearchers, field, boost, - options, limit) + return qsearchers, nil } -func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, - field string, boost float64, options search.SearcherOptions, limit bool) ( +func optimizeMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, + field string, boost float64, options search.SearcherOptions) ( search.Searcher, error) { - if limit && tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(len(terms)) + + var finalSearcher search.Searcher + for len(terms) > 0 { + var batchTerms [][]byte + if len(terms) > DisjunctionMaxClauseCount { + batchTerms = terms[:DisjunctionMaxClauseCount] + terms = terms[DisjunctionMaxClauseCount:] + } else { + batchTerms = terms + terms = nil + } + batch, err := makeBatchSearchersBytes(indexReader, batchTerms, field, boost, options) + if err != nil { + return nil, err + } + if finalSearcher != nil { + batch = append(batch, finalSearcher) + } + cleanup := func() { + for _, searcher := range batch { + if searcher != nil { + _ = searcher.Close() + } + } + } + finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned", + indexReader, batch, options) + // all searchers in batch should be closed, regardless of error or optimization failure + // either we're returning, or continuing and only finalSearcher is needed for next loop + cleanup() + if err != nil { + return nil, err + } + if finalSearcher == nil { + return nil, fmt.Errorf("unable to optimize") + } } + return finalSearcher, nil +} + +func makeBatchSearchersBytes(indexReader index.IndexReader, terms [][]byte, field string, + boost float64, options search.SearcherOptions) ([]search.Searcher, error) { qsearchers := make([]search.Searcher, len(terms)) qsearchersClose := func() { @@ -70,24 +211,5 @@ func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, return nil, err } } - return newMultiTermSearcherBytes(indexReader, qsearchers, field, boost, - options, limit) -} - -func newMultiTermSearcherBytes(indexReader index.IndexReader, - searchers []search.Searcher, field string, boost float64, - options search.SearcherOptions, limit bool) ( - search.Searcher, error) { - - // build disjunction searcher of these ranges - searcher, err := newDisjunctionSearcher(indexReader, searchers, 0, options, - limit) - if err != nil { - for _, s := range searchers { - _ = s.Close() - } - return nil, err - } - - return searcher, nil + return qsearchers, nil } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_numeric_range.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_numeric_range.go index 83107f020..48d6226e1 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_numeric_range.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_numeric_range.go @@ -74,9 +74,8 @@ func NewNumericRangeSearcher(indexReader index.IndexReader, terms := termRanges.Enumerate(isIndexed) if fieldDict != nil { if fd, ok := fieldDict.(index.FieldDict); ok { - cerr := fd.Close() - if cerr != nil { - err = cerr + if err = fd.Close(); err != nil { + return nil, err } } } @@ -97,7 +96,7 @@ func NewNumericRangeSearcher(indexReader index.IndexReader, } if tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(len(terms)) + return nil, tooManyClausesErr(field, len(terms)) } return NewMultiTermSearcherBytes(indexReader, terms, field, boost, options, diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_regexp.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_regexp.go index 4def832c4..11a44f159 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_regexp.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_regexp.go @@ -110,7 +110,7 @@ func findRegexpCandidateTerms(indexReader index.IndexReader, if matchPos != nil && matchPos[0] == 0 && matchPos[1] == len(tfd.Term) { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return rv, tooManyClausesErr(len(rv)) + return rv, tooManyClausesErr(field, len(rv)) } } tfd, err = fieldDict.Next() diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_term.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_term.go index c1af74c76..e07d25333 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_term.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_term.go @@ -137,5 +137,5 @@ func (s *TermSearcher) Optimize(kind string, octx index.OptimizableContext) ( return o.Optimize(kind, octx) } - return octx, nil + return nil, nil } diff --git a/vendor/github.com/blevesearch/bleve/search/searcher/search_term_prefix.go b/vendor/github.com/blevesearch/bleve/search/searcher/search_term_prefix.go index b5af4631f..2a8f22cff 100644 --- a/vendor/github.com/blevesearch/bleve/search/searcher/search_term_prefix.go +++ b/vendor/github.com/blevesearch/bleve/search/searcher/search_term_prefix.go @@ -38,7 +38,7 @@ func NewTermPrefixSearcher(indexReader index.IndexReader, prefix string, for err == nil && tfd != nil { terms = append(terms, tfd.Term) if tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(len(terms)) + return nil, tooManyClausesErr(field, len(terms)) } tfd, err = fieldDict.Next() } diff --git a/vendor/github.com/blevesearch/bleve/search/sort.go b/vendor/github.com/blevesearch/bleve/search/sort.go index 6e4ed80fa..dca422ebd 100644 --- a/vendor/github.com/blevesearch/bleve/search/sort.go +++ b/vendor/github.com/blevesearch/bleve/search/sort.go @@ -233,7 +233,11 @@ func (so SortOrder) Compare(cachedScoring, cachedDesc []bool, i, j *DocumentMatc } else { iVal := i.Sort[x] jVal := j.Sort[x] - c = strings.Compare(iVal, jVal) + if iVal < jVal { + c = -1 + } else if iVal > jVal { + c = 1 + } } if c == 0 { @@ -423,7 +427,8 @@ func (s *SortField) filterTermsByType(terms [][]byte) [][]byte { allTermsPrefixCoded = false } } - if allTermsPrefixCoded { + // reset the terms only when valid zero shift terms are found. + if allTermsPrefixCoded && len(termsWithShiftZero) > 0 { terms = termsWithShiftZero s.tmp = termsWithShiftZero[:0] } diff --git a/vendor/github.com/blevesearch/zap/v11/go.mod b/vendor/github.com/blevesearch/zap/v11/go.mod index a238188e6..bbf090563 100644 --- a/vendor/github.com/blevesearch/zap/v11/go.mod +++ b/vendor/github.com/blevesearch/zap/v11/go.mod @@ -3,10 +3,10 @@ module github.com/blevesearch/zap/v11 go 1.12 require ( - github.com/RoaringBitmap/roaring v0.4.21 - github.com/blevesearch/bleve v1.0.7 + github.com/RoaringBitmap/roaring v0.4.23 + github.com/blevesearch/bleve v1.0.10 github.com/blevesearch/mmap-go v1.0.2 - github.com/couchbase/vellum v1.0.1 + github.com/couchbase/vellum v1.0.2 github.com/golang/snappy v0.0.1 github.com/spf13/cobra v0.0.5 ) diff --git a/vendor/github.com/blevesearch/zap/v12/go.mod b/vendor/github.com/blevesearch/zap/v12/go.mod index 24bbcb2c1..14488fcb3 100644 --- a/vendor/github.com/blevesearch/zap/v12/go.mod +++ b/vendor/github.com/blevesearch/zap/v12/go.mod @@ -3,10 +3,10 @@ module github.com/blevesearch/zap/v12 go 1.12 require ( - github.com/RoaringBitmap/roaring v0.4.21 - github.com/blevesearch/bleve v1.0.7 + github.com/RoaringBitmap/roaring v0.4.23 + github.com/blevesearch/bleve v1.0.10 github.com/blevesearch/mmap-go v1.0.2 - github.com/couchbase/vellum v1.0.1 + github.com/couchbase/vellum v1.0.2 github.com/golang/snappy v0.0.1 github.com/spf13/cobra v0.0.5 ) diff --git a/vendor/github.com/blevesearch/zap/v13/.gitignore b/vendor/github.com/blevesearch/zap/v13/.gitignore new file mode 100644 index 000000000..46d1cfad5 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/.gitignore @@ -0,0 +1,12 @@ +#* +*.sublime-* +*~ +.#* +.project +.settings +**/.idea/ +**/*.iml +.DS_Store +/cmd/zap/zap +*.test +tags diff --git a/vendor/github.com/blevesearch/zap/v13/LICENSE b/vendor/github.com/blevesearch/zap/v13/LICENSE new file mode 100644 index 000000000..7a4a3ea24 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/vendor/github.com/blevesearch/zap/v13/README.md b/vendor/github.com/blevesearch/zap/v13/README.md new file mode 100644 index 000000000..0facb669f --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/README.md @@ -0,0 +1,158 @@ +# zap file format + +Advanced ZAP File Format Documentation is [here](zap.md). + +The file is written in the reverse order that we typically access data. This helps us write in one pass since later sections of the file require file offsets of things we've already written. + +Current usage: + +- mmap the entire file +- crc-32 bytes and version are in fixed position at end of the file +- reading remainder of footer could be version specific +- remainder of footer gives us: + - 3 important offsets (docValue , fields index and stored data index) + - 2 important values (number of docs and chunk factor) +- field data is processed once and memoized onto the heap so that we never have to go back to disk for it +- access to stored data by doc number means first navigating to the stored data index, then accessing a fixed position offset into that slice, which gives us the actual address of the data. the first bytes of that section tell us the size of data so that we know where it ends. +- access to all other indexed data follows the following pattern: + - first know the field name -> convert to id + - next navigate to term dictionary for that field + - some operations stop here and do dictionary ops + - next use dictionary to navigate to posting list for a specific term + - walk posting list + - if necessary, walk posting details as we go + - if location info is desired, consult location bitmap to see if it is there + +## stored fields section + +- for each document + - preparation phase: + - produce a slice of metadata bytes and data bytes + - produce these slices in field id order + - field value is appended to the data slice + - metadata slice is varint encoded with the following values for each field value + - field id (uint16) + - field type (byte) + - field value start offset in uncompressed data slice (uint64) + - field value length (uint64) + - field number of array positions (uint64) + - one additional value for each array position (uint64) + - compress the data slice using snappy + - file writing phase: + - remember the start offset for this document + - write out meta data length (varint uint64) + - write out compressed data length (varint uint64) + - write out the metadata bytes + - write out the compressed data bytes + +## stored fields idx + +- for each document + - write start offset (remembered from previous section) of stored data (big endian uint64) + +With this index and a known document number, we have direct access to all the stored field data. + +## posting details (freq/norm) section + +- for each posting list + - produce a slice containing multiple consecutive chunks (each chunk is varint stream) + - produce a slice remembering offsets of where each chunk starts + - preparation phase: + - for each hit in the posting list + - if this hit is in next chunk close out encoding of last chunk and record offset start of next + - encode term frequency (uint64) + - encode norm factor (float32) + - file writing phase: + - remember start position for this posting list details + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. + +## posting details (location) section + +- for each posting list + - produce a slice containing multiple consecutive chunks (each chunk is varint stream) + - produce a slice remembering offsets of where each chunk starts + - preparation phase: + - for each hit in the posting list + - if this hit is in next chunk close out encoding of last chunk and record offset start of next + - encode field (uint16) + - encode field pos (uint64) + - encode field start (uint64) + - encode field end (uint64) + - encode number of array positions to follow (uint64) + - encode each array position (each uint64) + - file writing phase: + - remember start position for this posting list details + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. + +## postings list section + +- for each posting list + - preparation phase: + - encode roaring bitmap posting list to bytes (so we know the length) + - file writing phase: + - remember the start position for this posting list + - write freq/norm details offset (remembered from previous, as varint uint64) + - write location details offset (remembered from previous, as varint uint64) + - write length of encoded roaring bitmap + - write the serialized roaring bitmap data + +## dictionary + +- for each field + - preparation phase: + - encode vellum FST with dictionary data pointing to file offset of posting list (remembered from previous) + - file writing phase: + - remember the start position of this persistDictionary + - write length of vellum data (varint uint64) + - write out vellum data + +## fields section + +- for each field + - file writing phase: + - remember start offset for each field + - write dictionary address (remembered from previous) (varint uint64) + - write length of field name (varint uint64) + - write field name bytes + +## fields idx + +- for each field + - file writing phase: + - write big endian uint64 of start offset for each field + +NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size. + +## fields DocValue + +- for each field + - preparation phase: + - produce a slice containing multiple consecutive chunks, where each chunk is composed of a meta section followed by compressed columnar field data + - produce a slice remembering the length of each chunk + - file writing phase: + - remember the start position of this first field DocValue offset in the footer + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +NOTE: currently the meta header inside each chunk gives clue to the location offsets and size of the data pertaining to a given docID and any +read operation leverage that meta information to extract the document specific data from the file. + +## footer + +- file writing phase + - write number of docs (big endian uint64) + - write stored field index location (big endian uint64) + - write field index location (big endian uint64) + - write field docValue location (big endian uint64) + - write out chunk factor (big endian uint32) + - write out version (big endian uint32) + - write out file CRC of everything preceding this (big endian uint32) diff --git a/vendor/github.com/blevesearch/zap/v13/build.go b/vendor/github.com/blevesearch/zap/v13/build.go new file mode 100644 index 000000000..58d829f0c --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/build.go @@ -0,0 +1,156 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bufio" + "math" + "os" + + "github.com/couchbase/vellum" +) + +const Version uint32 = 13 + +const Type string = "zap" + +const fieldNotUninverted = math.MaxUint64 + +func (sb *SegmentBase) Persist(path string) error { + return PersistSegmentBase(sb, path) +} + +// PersistSegmentBase persists SegmentBase in the zap file format. +func PersistSegmentBase(sb *SegmentBase, path string) error { + flag := os.O_RDWR | os.O_CREATE + + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return err + } + + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) + } + + br := bufio.NewWriter(f) + + _, err = br.Write(sb.mem) + if err != nil { + cleanup() + return err + } + + err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.docValueOffset, + sb.chunkMode, sb.memCRC, br) + if err != nil { + cleanup() + return err + } + + err = br.Flush() + if err != nil { + cleanup() + return err + } + + err = f.Sync() + if err != nil { + cleanup() + return err + } + + err = f.Close() + if err != nil { + cleanup() + return err + } + + return nil +} + +func persistStoredFieldValues(fieldID int, + storedFieldValues [][]byte, stf []byte, spf [][]uint64, + curr int, metaEncode varintEncoder, data []byte) ( + int, []byte, error) { + for i := 0; i < len(storedFieldValues); i++ { + // encode field + _, err := metaEncode(uint64(fieldID)) + if err != nil { + return 0, nil, err + } + // encode type + _, err = metaEncode(uint64(stf[i])) + if err != nil { + return 0, nil, err + } + // encode start offset + _, err = metaEncode(uint64(curr)) + if err != nil { + return 0, nil, err + } + // end len + _, err = metaEncode(uint64(len(storedFieldValues[i]))) + if err != nil { + return 0, nil, err + } + // encode number of array pos + _, err = metaEncode(uint64(len(spf[i]))) + if err != nil { + return 0, nil, err + } + // encode all array positions + for _, pos := range spf[i] { + _, err = metaEncode(pos) + if err != nil { + return 0, nil, err + } + } + + data = append(data, storedFieldValues[i]...) + curr += len(storedFieldValues[i]) + } + + return curr, data, nil +} + +func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, + fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, + storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, + dictLocs []uint64) (*SegmentBase, error) { + sb := &SegmentBase{ + mem: mem, + memCRC: memCRC, + chunkMode: chunkMode, + fieldsMap: fieldsMap, + fieldsInv: fieldsInv, + numDocs: numDocs, + storedIndexOffset: storedIndexOffset, + fieldsIndexOffset: fieldsIndexOffset, + docValueOffset: docValueOffset, + dictLocs: dictLocs, + fieldDvReaders: make(map[uint16]*docValueReader), + fieldFSTs: make(map[uint16]*vellum.FST), + } + sb.updateSize() + + err := sb.loadDvReaders() + if err != nil { + return nil, err + } + + return sb, nil +} diff --git a/vendor/github.com/blevesearch/zap/v13/chunk.go b/vendor/github.com/blevesearch/zap/v13/chunk.go new file mode 100644 index 000000000..fe9f398da --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/chunk.go @@ -0,0 +1,54 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "fmt" +) + +// LegacyChunkMode was the original chunk mode (always chunk size 1024) +// this mode is still used for chunking doc values. +var LegacyChunkMode uint32 = 1024 + +// DefaultChunkMode is the most recent improvement to chunking and should +// be used by default. +var DefaultChunkMode uint32 = 1025 + +func getChunkSize(chunkMode uint32, cardinality uint64, maxDocs uint64) (uint64, error) { + switch { + // any chunkMode <= 1024 will always chunk with chunkSize=chunkMode + case chunkMode <= 1024: + // legacy chunk size + return uint64(chunkMode), nil + + case chunkMode == 1025: + // attempt at simple improvement + // theory - the point of chunking is to put a bound on the maximum number of + // calls to Next() needed to find a random document. ie, you should be able + // to do one jump to the correct chunk, and then walk through at most + // chunk-size items + // previously 1024 was chosen as the chunk size, but this is particularly + // wasteful for low cardinality terms. the observation is that if there + // are less than 1024 items, why not put them all in one chunk, + // this way you'll still achieve the same goal of visiting at most + // chunk-size items. + // no attempt is made to tweak any other case + if cardinality <= 1024 { + return maxDocs, nil + } + return 1024, nil + } + return 0, fmt.Errorf("unknown chunk mode %d", chunkMode) +} diff --git a/vendor/github.com/blevesearch/zap/v13/contentcoder.go b/vendor/github.com/blevesearch/zap/v13/contentcoder.go new file mode 100644 index 000000000..c145b5a11 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/contentcoder.go @@ -0,0 +1,243 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "io" + "reflect" + + "github.com/golang/snappy" +) + +var reflectStaticSizeMetaData int + +func init() { + var md MetaData + reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size()) +} + +var termSeparator byte = 0xff +var termSeparatorSplitSlice = []byte{termSeparator} + +type chunkedContentCoder struct { + final []byte + chunkSize uint64 + currChunk uint64 + chunkLens []uint64 + + w io.Writer + progressiveWrite bool + + chunkMetaBuf bytes.Buffer + chunkBuf bytes.Buffer + + chunkMeta []MetaData + + compressed []byte // temp buf for snappy compression +} + +// MetaData represents the data information inside a +// chunk. +type MetaData struct { + DocNum uint64 // docNum of the data inside the chunk + DocDvOffset uint64 // offset of data inside the chunk for the given docid +} + +// newChunkedContentCoder returns a new chunk content coder which +// packs data into chunks based on the provided chunkSize +func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64, + w io.Writer, progressiveWrite bool) *chunkedContentCoder { + total := maxDocNum/chunkSize + 1 + rv := &chunkedContentCoder{ + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + chunkMeta: make([]MetaData, 0, total), + w: w, + progressiveWrite: progressiveWrite, + } + + return rv +} + +// Reset lets you reuse this chunked content coder. Buffers are reset +// and re used. You cannot change the chunk size. +func (c *chunkedContentCoder) Reset() { + c.currChunk = 0 + c.final = c.final[:0] + c.chunkBuf.Reset() + c.chunkMetaBuf.Reset() + for i := range c.chunkLens { + c.chunkLens[i] = 0 + } + c.chunkMeta = c.chunkMeta[:0] +} + +func (c *chunkedContentCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) { + total := int(maxDocNum/chunkSize + 1) + c.chunkSize = chunkSize + if cap(c.chunkLens) < total { + c.chunkLens = make([]uint64, total) + } else { + c.chunkLens = c.chunkLens[:total] + } + if cap(c.chunkMeta) < total { + c.chunkMeta = make([]MetaData, 0, total) + } +} + +// Close indicates you are done calling Add() this allows +// the final chunk to be encoded. +func (c *chunkedContentCoder) Close() error { + return c.flushContents() +} + +func (c *chunkedContentCoder) flushContents() error { + // flush the contents, with meta information at first + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(buf, uint64(len(c.chunkMeta))) + _, err := c.chunkMetaBuf.Write(buf[:n]) + if err != nil { + return err + } + + // write out the metaData slice + for _, meta := range c.chunkMeta { + _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset) + if err != nil { + return err + } + } + + // write the metadata to final data + metaData := c.chunkMetaBuf.Bytes() + c.final = append(c.final, c.chunkMetaBuf.Bytes()...) + // write the compressed data to the final data + c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes()) + c.final = append(c.final, c.compressed...) + + c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData)) + + if c.progressiveWrite { + _, err := c.w.Write(c.final) + if err != nil { + return err + } + c.final = c.final[:0] + } + + return nil +} + +// Add encodes the provided byte slice into the correct chunk for the provided +// doc num. You MUST call Add() with increasing docNums. +func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // flush out the previous chunk details + err := c.flushContents() + if err != nil { + return err + } + // clearing the chunk specific meta for next chunk + c.chunkBuf.Reset() + c.chunkMetaBuf.Reset() + c.chunkMeta = c.chunkMeta[:0] + c.currChunk = chunk + } + + // get the starting offset for this doc + dvOffset := c.chunkBuf.Len() + dvSize, err := c.chunkBuf.Write(vals) + if err != nil { + return err + } + + c.chunkMeta = append(c.chunkMeta, MetaData{ + DocNum: docNum, + DocDvOffset: uint64(dvOffset + dvSize), + }) + return nil +} + +// Write commits all the encoded chunked contents to the provided writer. +// +// | ..... data ..... | chunk offsets (varints) +// | position of chunk offsets (uint64) | number of offsets (uint64) | +// +func (c *chunkedContentCoder) Write() (int, error) { + var tw int + + if c.final != nil { + // write out the data section first + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + } + + chunkOffsetsStart := uint64(tw) + + if cap(c.final) < binary.MaxVarintLen64 { + c.final = make([]byte, binary.MaxVarintLen64) + } else { + c.final = c.final[0:binary.MaxVarintLen64] + } + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + // write out the chunk offsets + for _, chunkOffset := range chunkOffsets { + n := binary.PutUvarint(c.final, chunkOffset) + nw, err := c.w.Write(c.final[:n]) + tw += nw + if err != nil { + return tw, err + } + } + + chunkOffsetsLen := uint64(tw) - chunkOffsetsStart + + c.final = c.final[0:8] + // write out the length of chunk offsets + binary.BigEndian.PutUint64(c.final, chunkOffsetsLen) + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + + // write out the number of chunks + binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens))) + nw, err = c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + + c.final = c.final[:0] + + return tw, nil +} + +// ReadDocValueBoundary elicits the start, end offsets from a +// metaData header slice +func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) { + var start uint64 + if chunk > 0 { + start = metaHeaders[chunk-1].DocDvOffset + } + return start, metaHeaders[chunk].DocDvOffset +} diff --git a/vendor/github.com/blevesearch/zap/v13/count.go b/vendor/github.com/blevesearch/zap/v13/count.go new file mode 100644 index 000000000..50290f888 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/count.go @@ -0,0 +1,61 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "hash/crc32" + "io" + + "github.com/blevesearch/bleve/index/scorch/segment" +) + +// CountHashWriter is a wrapper around a Writer which counts the number of +// bytes which have been written and computes a crc32 hash +type CountHashWriter struct { + w io.Writer + crc uint32 + n int + s segment.StatsReporter +} + +// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer +func NewCountHashWriter(w io.Writer) *CountHashWriter { + return &CountHashWriter{w: w} +} + +func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter { + return &CountHashWriter{w: w, s: s} +} + +// Write writes the provided bytes to the wrapped writer and counts the bytes +func (c *CountHashWriter) Write(b []byte) (int, error) { + n, err := c.w.Write(b) + c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) + c.n += n + if c.s != nil { + c.s.ReportBytesWritten(uint64(n)) + } + return n, err +} + +// Count returns the number of bytes written +func (c *CountHashWriter) Count() int { + return c.n +} + +// Sum32 returns the CRC-32 hash of the content written to this writer +func (c *CountHashWriter) Sum32() uint32 { + return c.crc +} diff --git a/vendor/github.com/blevesearch/zap/v13/dict.go b/vendor/github.com/blevesearch/zap/v13/dict.go new file mode 100644 index 000000000..ad4a8f8dc --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/dict.go @@ -0,0 +1,263 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "fmt" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/couchbase/vellum" +) + +// Dictionary is the zap representation of the term dictionary +type Dictionary struct { + sb *SegmentBase + field string + fieldID uint16 + fst *vellum.FST + fstReader *vellum.Reader +} + +// PostingsList returns the postings list for the specified term +func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, + prealloc segment.PostingsList) (segment.PostingsList, error) { + var preallocPL *PostingsList + pl, ok := prealloc.(*PostingsList) + if ok && pl != nil { + preallocPL = pl + } + return d.postingsList(term, except, preallocPL) +} + +func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { + if d.fstReader == nil { + if rv == nil || rv == emptyPostingsList { + return emptyPostingsList, nil + } + return d.postingsListInit(rv, except), nil + } + + postingsOffset, exists, err := d.fstReader.Get(term) + if err != nil { + return nil, fmt.Errorf("vellum err: %v", err) + } + if !exists { + if rv == nil || rv == emptyPostingsList { + return emptyPostingsList, nil + } + return d.postingsListInit(rv, except), nil + } + + return d.postingsListFromOffset(postingsOffset, except, rv) +} + +func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { + rv = d.postingsListInit(rv, except) + + err := rv.read(postingsOffset, d) + if err != nil { + return nil, err + } + + return rv, nil +} + +func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { + if rv == nil || rv == emptyPostingsList { + rv = &PostingsList{} + } else { + postings := rv.postings + if postings != nil { + postings.Clear() + } + + *rv = PostingsList{} // clear the struct + + rv.postings = postings + } + rv.sb = d.sb + rv.except = except + return rv +} + +func (d *Dictionary) Contains(key []byte) (bool, error) { + return d.fst.Contains(key) +} + +// Iterator returns an iterator for this dictionary +func (d *Dictionary) Iterator() segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + itr, err := d.fst.Iterator(nil, nil) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + +// PrefixIterator returns an iterator which only visits terms having the +// the specified prefix +func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + kBeg := []byte(prefix) + kEnd := segment.IncrementBytes(kBeg) + + if d.fst != nil { + itr, err := d.fst.Iterator(kBeg, kEnd) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + +// RangeIterator returns an iterator which only visits terms between the +// start and end terms. NOTE: bleve.index API specifies the end is inclusive. +func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + // need to increment the end position to be inclusive + var endBytes []byte + if len(end) > 0 { + endBytes = []byte(end) + if endBytes[len(endBytes)-1] < 0xff { + endBytes[len(endBytes)-1]++ + } else { + endBytes = append(endBytes, 0xff) + } + } + + if d.fst != nil { + itr, err := d.fst.Iterator([]byte(start), endBytes) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + +// AutomatonIterator returns an iterator which only visits terms +// having the the vellum automaton and start/end key range +func (d *Dictionary) AutomatonIterator(a vellum.Automaton, + startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + +func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, + includeCount bool) segment.DictionaryIterator { + + rv := &DictionaryIterator{ + d: d, + omitCount: !includeCount, + } + + var buf bytes.Buffer + builder, err := vellum.New(&buf, nil) + if err != nil { + rv.err = err + return rv + } + for _, term := range onlyTerms { + err = builder.Insert(term, 0) + if err != nil { + rv.err = err + return rv + } + } + err = builder.Close() + if err != nil { + rv.err = err + return rv + } + + onlyFST, err := vellum.Load(buf.Bytes()) + if err != nil { + rv.err = err + return rv + } + + itr, err := d.fst.Search(onlyFST, nil, nil) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + + return rv +} + +// DictionaryIterator is an iterator for term dictionary +type DictionaryIterator struct { + d *Dictionary + itr vellum.Iterator + err error + tmp PostingsList + entry index.DictEntry + omitCount bool +} + +// Next returns the next entry in the dictionary +func (i *DictionaryIterator) Next() (*index.DictEntry, error) { + if i.err != nil && i.err != vellum.ErrIteratorDone { + return nil, i.err + } else if i.itr == nil || i.err == vellum.ErrIteratorDone { + return nil, nil + } + term, postingsOffset := i.itr.Current() + i.entry.Term = string(term) + if !i.omitCount { + i.err = i.tmp.read(postingsOffset, i.d) + if i.err != nil { + return nil, i.err + } + i.entry.Count = i.tmp.Count() + } + i.err = i.itr.Next() + return &i.entry, nil +} diff --git a/vendor/github.com/blevesearch/zap/v13/docvalues.go b/vendor/github.com/blevesearch/zap/v13/docvalues.go new file mode 100644 index 000000000..793797bd8 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/docvalues.go @@ -0,0 +1,312 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "reflect" + "sort" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" + "github.com/golang/snappy" +) + +var reflectStaticSizedocValueReader int + +func init() { + var dvi docValueReader + reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) +} + +type docNumTermsVisitor func(docNum uint64, terms []byte) error + +type docVisitState struct { + dvrs map[uint16]*docValueReader + segment *SegmentBase +} + +type docValueReader struct { + field string + curChunkNum uint64 + chunkOffsets []uint64 + dvDataLoc uint64 + curChunkHeader []MetaData + curChunkData []byte // compressed data cache + uncompressed []byte // temp buf for snappy decompression +} + +func (di *docValueReader) size() int { + return reflectStaticSizedocValueReader + size.SizeOfPtr + + len(di.field) + + len(di.chunkOffsets)*size.SizeOfUint64 + + len(di.curChunkHeader)*reflectStaticSizeMetaData + + len(di.curChunkData) +} + +func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader { + if rv == nil { + rv = &docValueReader{} + } + + rv.field = di.field + rv.curChunkNum = math.MaxUint64 + rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable + rv.dvDataLoc = di.dvDataLoc + rv.curChunkHeader = rv.curChunkHeader[:0] + rv.curChunkData = nil + rv.uncompressed = rv.uncompressed[:0] + + return rv +} + +func (di *docValueReader) curChunkNumber() uint64 { + return di.curChunkNum +} + +func (s *SegmentBase) loadFieldDocValueReader(field string, + fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) { + // get the docValue offset for the given fields + if fieldDvLocStart == fieldNotUninverted { + // no docValues found, nothing to do + return nil, nil + } + + // read the number of chunks, and chunk offsets position + var numChunks, chunkOffsetsPosition uint64 + + if fieldDvLocEnd-fieldDvLocStart > 16 { + numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd]) + // read the length of chunk offsets + chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) + // acquire position of chunk offsets + chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen + } else { + return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart) + } + + fdvIter := &docValueReader{ + curChunkNum: math.MaxUint64, + field: field, + chunkOffsets: make([]uint64, int(numChunks)), + } + + // read the chunk offsets + var offset uint64 + for i := 0; i < int(numChunks); i++ { + loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64]) + if read <= 0 { + return nil, fmt.Errorf("corrupted chunk offset during segment load") + } + fdvIter.chunkOffsets[i] = loc + offset += uint64(read) + } + + // set the data offset + fdvIter.dvDataLoc = fieldDvLocStart + + return fdvIter, nil +} + +func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error { + // advance to the chunk where the docValues + // reside for the given docNum + destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc + start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) + if start >= end { + di.curChunkHeader = di.curChunkHeader[:0] + di.curChunkData = nil + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil + } + + destChunkDataLoc += start + curChunkEnd += end + + // read the number of docs reside in the chunk + numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) + if read <= 0 { + return fmt.Errorf("failed to read the chunk") + } + chunkMetaLoc := destChunkDataLoc + uint64(read) + + offset := uint64(0) + if cap(di.curChunkHeader) < int(numDocs) { + di.curChunkHeader = make([]MetaData, int(numDocs)) + } else { + di.curChunkHeader = di.curChunkHeader[:int(numDocs)] + } + for i := 0; i < int(numDocs); i++ { + di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(read) + di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(read) + } + + compressedDataLoc := chunkMetaLoc + offset + dataLength := curChunkEnd - compressedDataLoc + di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil +} + +func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { + for i := 0; i < len(di.chunkOffsets); i++ { + err := di.loadDvChunk(uint64(i), s) + if err != nil { + return err + } + if di.curChunkData == nil || len(di.curChunkHeader) == 0 { + continue + } + + // uncompress the already loaded data + uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed + + start := uint64(0) + for _, entry := range di.curChunkHeader { + err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) + if err != nil { + return err + } + + start = entry.DocDvOffset + } + } + + return nil +} + +func (di *docValueReader) visitDocValues(docNum uint64, + visitor index.DocumentFieldTermVisitor) error { + // binary search the term locations for the docNum + start, end := di.getDocValueLocs(docNum) + if start == math.MaxUint64 || end == math.MaxUint64 || start == end { + return nil + } + + var uncompressed []byte + var err error + // use the uncompressed copy if available + if len(di.uncompressed) > 0 { + uncompressed = di.uncompressed + } else { + // uncompress the already loaded data + uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed + } + + // pick the terms for the given docNum + uncompressed = uncompressed[start:end] + for { + i := bytes.Index(uncompressed, termSeparatorSplitSlice) + if i < 0 { + break + } + + visitor(di.field, uncompressed[0:i]) + uncompressed = uncompressed[i+1:] + } + + return nil +} + +func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { + i := sort.Search(len(di.curChunkHeader), func(i int) bool { + return di.curChunkHeader[i].DocNum >= docNum + }) + if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { + return ReadDocValueBoundary(i, di.curChunkHeader) + } + return math.MaxUint64, math.MaxUint64 +} + +// VisitDocumentFieldTerms is an implementation of the +// DocumentFieldTermVisitable interface +func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, + visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) ( + segment.DocVisitState, error) { + dvs, ok := dvsIn.(*docVisitState) + if !ok || dvs == nil { + dvs = &docVisitState{} + } else { + if dvs.segment != s { + dvs.segment = s + dvs.dvrs = nil + } + } + + var fieldIDPlus1 uint16 + if dvs.dvrs == nil { + dvs.dvrs = make(map[uint16]*docValueReader, len(fields)) + for _, field := range fields { + if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { + continue + } + fieldID := fieldIDPlus1 - 1 + if dvIter, exists := s.fieldDvReaders[fieldID]; exists && + dvIter != nil { + dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID]) + } + } + } + + // find the chunkNumber where the docValues are stored + // NOTE: doc values continue to use legacy chunk mode + chunkFactor, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return nil, err + } + docInChunk := localDocNum / chunkFactor + var dvr *docValueReader + for _, field := range fields { + if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { + continue + } + fieldID := fieldIDPlus1 - 1 + if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil { + // check if the chunk is already loaded + if docInChunk != dvr.curChunkNumber() { + err := dvr.loadDvChunk(docInChunk, s) + if err != nil { + return dvs, err + } + } + + _ = dvr.visitDocValues(localDocNum, visitor) + } + } + return dvs, nil +} + +// VisitableDocValueFields returns the list of fields with +// persisted doc value terms ready to be visitable using the +// VisitDocumentFieldTerms method. +func (s *SegmentBase) VisitableDocValueFields() ([]string, error) { + return s.fieldDvNames, nil +} diff --git a/vendor/github.com/blevesearch/zap/v13/enumerator.go b/vendor/github.com/blevesearch/zap/v13/enumerator.go new file mode 100644 index 000000000..bc5b7e627 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/enumerator.go @@ -0,0 +1,138 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + + "github.com/couchbase/vellum" +) + +// enumerator provides an ordered traversal of multiple vellum +// iterators. Like JOIN of iterators, the enumerator produces a +// sequence of (key, iteratorIndex, value) tuples, sorted by key ASC, +// then iteratorIndex ASC, where the same key might be seen or +// repeated across multiple child iterators. +type enumerator struct { + itrs []vellum.Iterator + currKs [][]byte + currVs []uint64 + + lowK []byte + lowIdxs []int + lowCurr int +} + +// newEnumerator returns a new enumerator over the vellum Iterators +func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) { + rv := &enumerator{ + itrs: itrs, + currKs: make([][]byte, len(itrs)), + currVs: make([]uint64, len(itrs)), + lowIdxs: make([]int, 0, len(itrs)), + } + for i, itr := range rv.itrs { + rv.currKs[i], rv.currVs[i] = itr.Current() + } + rv.updateMatches(false) + if rv.lowK == nil && len(rv.lowIdxs) == 0 { + return rv, vellum.ErrIteratorDone + } + return rv, nil +} + +// updateMatches maintains the low key matches based on the currKs +func (m *enumerator) updateMatches(skipEmptyKey bool) { + m.lowK = nil + m.lowIdxs = m.lowIdxs[:0] + m.lowCurr = 0 + + for i, key := range m.currKs { + if (key == nil && m.currVs[i] == 0) || // in case of empty iterator + (len(key) == 0 && skipEmptyKey) { // skip empty keys + continue + } + + cmp := bytes.Compare(key, m.lowK) + if cmp < 0 || len(m.lowIdxs) == 0 { + // reached a new low + m.lowK = key + m.lowIdxs = m.lowIdxs[:0] + m.lowIdxs = append(m.lowIdxs, i) + } else if cmp == 0 { + m.lowIdxs = append(m.lowIdxs, i) + } + } +} + +// Current returns the enumerator's current key, iterator-index, and +// value. If the enumerator is not pointing at a valid value (because +// Next returned an error previously), Current will return nil,0,0. +func (m *enumerator) Current() ([]byte, int, uint64) { + var i int + var v uint64 + if m.lowCurr < len(m.lowIdxs) { + i = m.lowIdxs[m.lowCurr] + v = m.currVs[i] + } + return m.lowK, i, v +} + +// GetLowIdxsAndValues will return all of the iterator indices +// which point to the current key, and their corresponding +// values. This can be used by advanced caller which may need +// to peek into these other sets of data before processing. +func (m *enumerator) GetLowIdxsAndValues() ([]int, []uint64) { + values := make([]uint64, 0, len(m.lowIdxs)) + for _, idx := range m.lowIdxs { + values = append(values, m.currVs[idx]) + } + return m.lowIdxs, values +} + +// Next advances the enumerator to the next key/iterator/value result, +// else vellum.ErrIteratorDone is returned. +func (m *enumerator) Next() error { + m.lowCurr += 1 + if m.lowCurr >= len(m.lowIdxs) { + // move all the current low iterators forwards + for _, vi := range m.lowIdxs { + err := m.itrs[vi].Next() + if err != nil && err != vellum.ErrIteratorDone { + return err + } + m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() + } + // can skip any empty keys encountered at this point + m.updateMatches(true) + } + if m.lowK == nil && len(m.lowIdxs) == 0 { + return vellum.ErrIteratorDone + } + return nil +} + +// Close all the underlying Iterators. The first error, if any, will +// be returned. +func (m *enumerator) Close() error { + var rv error + for _, itr := range m.itrs { + err := itr.Close() + if rv == nil { + rv = err + } + } + return rv +} diff --git a/vendor/github.com/blevesearch/zap/v13/go.mod b/vendor/github.com/blevesearch/zap/v13/go.mod new file mode 100644 index 000000000..10349f109 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/go.mod @@ -0,0 +1,12 @@ +module github.com/blevesearch/zap/v13 + +go 1.12 + +require ( + github.com/RoaringBitmap/roaring v0.4.23 + github.com/blevesearch/bleve v1.0.10 + github.com/blevesearch/mmap-go v1.0.2 + github.com/couchbase/vellum v1.0.2 + github.com/golang/snappy v0.0.1 + github.com/spf13/cobra v0.0.5 +) diff --git a/vendor/github.com/blevesearch/zap/v13/intDecoder.go b/vendor/github.com/blevesearch/zap/v13/intDecoder.go new file mode 100644 index 000000000..4cd008ff2 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/intDecoder.go @@ -0,0 +1,111 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "fmt" + + "github.com/blevesearch/bleve/index/scorch/segment" +) + +type chunkedIntDecoder struct { + startOffset uint64 + dataStartOffset uint64 + chunkOffsets []uint64 + curChunkBytes []byte + data []byte + r *segment.MemUvarintReader +} + +func newChunkedIntDecoder(buf []byte, offset uint64) *chunkedIntDecoder { + rv := &chunkedIntDecoder{startOffset: offset, data: buf} + var n, numChunks uint64 + var read int + if offset == termNotEncoded { + numChunks = 0 + } else { + numChunks, read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64]) + } + + n += uint64(read) + if cap(rv.chunkOffsets) >= int(numChunks) { + rv.chunkOffsets = rv.chunkOffsets[:int(numChunks)] + } else { + rv.chunkOffsets = make([]uint64, int(numChunks)) + } + for i := 0; i < int(numChunks); i++ { + rv.chunkOffsets[i], read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64]) + n += uint64(read) + } + rv.dataStartOffset = offset + n + return rv +} + +func (d *chunkedIntDecoder) loadChunk(chunk int) error { + if d.startOffset == termNotEncoded { + d.r = segment.NewMemUvarintReader([]byte(nil)) + return nil + } + + if chunk >= len(d.chunkOffsets) { + return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)", + chunk, len(d.chunkOffsets)) + } + + end, start := d.dataStartOffset, d.dataStartOffset + s, e := readChunkBoundary(chunk, d.chunkOffsets) + start += s + end += e + d.curChunkBytes = d.data[start:end] + if d.r == nil { + d.r = segment.NewMemUvarintReader(d.curChunkBytes) + } else { + d.r.Reset(d.curChunkBytes) + } + + return nil +} + +func (d *chunkedIntDecoder) reset() { + d.startOffset = 0 + d.dataStartOffset = 0 + d.chunkOffsets = d.chunkOffsets[:0] + d.curChunkBytes = d.curChunkBytes[:0] + d.data = d.data[:0] + if d.r != nil { + d.r.Reset([]byte(nil)) + } +} + +func (d *chunkedIntDecoder) isNil() bool { + return d.curChunkBytes == nil +} + +func (d *chunkedIntDecoder) readUvarint() (uint64, error) { + return d.r.ReadUvarint() +} + +func (d *chunkedIntDecoder) SkipUvarint() { + d.r.SkipUvarint() +} + +func (d *chunkedIntDecoder) SkipBytes(count int) { + d.r.SkipBytes(count) +} + +func (d *chunkedIntDecoder) Len() int { + return d.r.Len() +} diff --git a/vendor/github.com/blevesearch/zap/v13/intcoder.go b/vendor/github.com/blevesearch/zap/v13/intcoder.go new file mode 100644 index 000000000..c3c488fb7 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/intcoder.go @@ -0,0 +1,206 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "io" +) + +// We can safely use 0 to represent termNotEncoded since 0 +// could never be a valid address for term location information. +// (stored field index is always non-empty and earlier in the +// file) +const termNotEncoded = 0 + +type chunkedIntCoder struct { + final []byte + chunkSize uint64 + chunkBuf bytes.Buffer + chunkLens []uint64 + currChunk uint64 + + buf []byte +} + +// newChunkedIntCoder returns a new chunk int coder which packs data into +// chunks based on the provided chunkSize and supports up to the specified +// maxDocNum +func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { + total := maxDocNum/chunkSize + 1 + rv := &chunkedIntCoder{ + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + final: make([]byte, 0, 64), + } + + return rv +} + +// Reset lets you reuse this chunked int coder. buffers are reset and reused +// from previous use. you cannot change the chunk size or max doc num. +func (c *chunkedIntCoder) Reset() { + c.final = c.final[:0] + c.chunkBuf.Reset() + c.currChunk = 0 + for i := range c.chunkLens { + c.chunkLens[i] = 0 + } +} + +// SetChunkSize changes the chunk size. It is only valid to do so +// with a new chunkedIntCoder, or immediately after calling Reset() +func (c *chunkedIntCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) { + total := int(maxDocNum/chunkSize + 1) + c.chunkSize = chunkSize + if cap(c.chunkLens) < total { + c.chunkLens = make([]uint64, total) + } else { + c.chunkLens = c.chunkLens[:total] + } +} + +// Add encodes the provided integers into the correct chunk for the provided +// doc num. You MUST call Add() with increasing docNums. +func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + if len(c.buf) < binary.MaxVarintLen64 { + c.buf = make([]byte, binary.MaxVarintLen64) + } + + for _, val := range vals { + wb := binary.PutUvarint(c.buf, val) + _, err := c.chunkBuf.Write(c.buf[:wb]) + if err != nil { + return err + } + } + + return nil +} + +func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + _, err := c.chunkBuf.Write(buf) + return err +} + +// Close indicates you are done calling Add() this allows the final chunk +// to be encoded. +func (c *chunkedIntCoder) Close() { + encodingBytes := c.chunkBuf.Bytes() + c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) + c.final = append(c.final, encodingBytes...) + c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close +} + +// Write commits all the encoded chunked integers to the provided writer. +func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { + bufNeeded := binary.MaxVarintLen64 * (1 + len(c.chunkLens)) + if len(c.buf) < bufNeeded { + c.buf = make([]byte, bufNeeded) + } + buf := c.buf + + // convert the chunk lengths into chunk offsets + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + + // write out the number of chunks & each chunk offsets + n := binary.PutUvarint(buf, uint64(len(chunkOffsets))) + for _, chunkOffset := range chunkOffsets { + n += binary.PutUvarint(buf[n:], chunkOffset) + } + + tw, err := w.Write(buf[:n]) + if err != nil { + return tw, err + } + + // write out the data + nw, err := w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + return tw, nil +} + +// writeAt commits all the encoded chunked integers to the provided writer +// and returns the starting offset, total bytes written and an error +func (c *chunkedIntCoder) writeAt(w io.Writer) (uint64, int, error) { + startOffset := uint64(termNotEncoded) + if len(c.final) <= 0 { + return startOffset, 0, nil + } + + if chw := w.(*CountHashWriter); chw != nil { + startOffset = uint64(chw.Count()) + } + + tw, err := c.Write(w) + return startOffset, tw, err +} + +func (c *chunkedIntCoder) FinalSize() int { + return len(c.final) +} + +// modifyLengthsToEndOffsets converts the chunk length array +// to a chunk offset array. The readChunkBoundary +// will figure out the start and end of every chunk from +// these offsets. Starting offset of i'th index is stored +// in i-1'th position except for 0'th index and ending offset +// is stored at i'th index position. +// For 0'th element, starting position is always zero. +// eg: +// Lens -> 5 5 5 5 => 5 10 15 20 +// Lens -> 0 5 0 5 => 0 5 5 10 +// Lens -> 0 0 0 5 => 0 0 0 5 +// Lens -> 5 0 0 0 => 5 5 5 5 +// Lens -> 0 5 0 0 => 0 5 5 5 +// Lens -> 0 0 5 0 => 0 0 5 5 +func modifyLengthsToEndOffsets(lengths []uint64) []uint64 { + var runningOffset uint64 + var index, i int + for i = 1; i <= len(lengths); i++ { + runningOffset += lengths[i-1] + lengths[index] = runningOffset + index++ + } + return lengths +} + +func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { + var start uint64 + if chunk > 0 { + start = offsets[chunk-1] + } + return start, offsets[chunk] +} diff --git a/vendor/github.com/blevesearch/zap/v13/merge.go b/vendor/github.com/blevesearch/zap/v13/merge.go new file mode 100644 index 000000000..805100fb5 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/merge.go @@ -0,0 +1,847 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bufio" + "bytes" + "encoding/binary" + "fmt" + "math" + "os" + "sort" + + "github.com/RoaringBitmap/roaring" + seg "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/couchbase/vellum" + "github.com/golang/snappy" +) + +var DefaultFileMergerBufferSize = 1024 * 1024 + +const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc + +// Merge takes a slice of segments and bit masks describing which +// documents may be dropped, and creates a new segment containing the +// remaining data. This new segment is built at the specified path. +func (*ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { + + segmentBases := make([]*SegmentBase, len(segments)) + for segmenti, segment := range segments { + switch segmentx := segment.(type) { + case *Segment: + segmentBases[segmenti] = &segmentx.SegmentBase + case *SegmentBase: + segmentBases[segmenti] = segmentx + default: + panic(fmt.Sprintf("oops, unexpected segment type: %T", segment)) + } + } + return mergeSegmentBases(segmentBases, drops, path, DefaultChunkMode, closeCh, s) +} + +func mergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, + chunkMode uint32, closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { + flag := os.O_RDWR | os.O_CREATE + + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return nil, 0, err + } + + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) + } + + // buffer the output + br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) + + // wrap it for counting (tracking offsets) + cr := NewCountHashWriterWithStatsReporter(br, s) + + newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err := + MergeToWriter(segmentBases, drops, chunkMode, cr, closeCh) + if err != nil { + cleanup() + return nil, 0, err + } + + err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, + docValueOffset, chunkMode, cr.Sum32(), cr) + if err != nil { + cleanup() + return nil, 0, err + } + + err = br.Flush() + if err != nil { + cleanup() + return nil, 0, err + } + + err = f.Sync() + if err != nil { + cleanup() + return nil, 0, err + } + + err = f.Close() + if err != nil { + cleanup() + return nil, 0, err + } + + return newDocNums, uint64(cr.Count()), nil +} + +func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, + chunkMode uint32, cr *CountHashWriter, closeCh chan struct{}) ( + newDocNums [][]uint64, + numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, + dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16, + err error) { + docValueOffset = uint64(fieldNotUninverted) + + var fieldsSame bool + fieldsSame, fieldsInv = mergeFields(segments) + fieldsMap = mapFields(fieldsInv) + + numDocs = computeNewDocCount(segments, drops) + + if isClosed(closeCh) { + return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed + } + + if numDocs > 0 { + storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, + fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh) + if err != nil { + return nil, 0, 0, 0, 0, nil, nil, nil, err + } + + dictLocs, docValueOffset, err = persistMergedRest(segments, drops, + fieldsInv, fieldsMap, fieldsSame, + newDocNums, numDocs, chunkMode, cr, closeCh) + if err != nil { + return nil, 0, 0, 0, 0, nil, nil, nil, err + } + } else { + dictLocs = make([]uint64, len(fieldsInv)) + } + + fieldsIndexOffset, err = persistFields(fieldsInv, cr, dictLocs) + if err != nil { + return nil, 0, 0, 0, 0, nil, nil, nil, err + } + + return newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, fieldsInv, fieldsMap, nil +} + +// mapFields takes the fieldsInv list and returns a map of fieldName +// to fieldID+1 +func mapFields(fields []string) map[string]uint16 { + rv := make(map[string]uint16, len(fields)) + for i, fieldName := range fields { + rv[fieldName] = uint16(i) + 1 + } + return rv +} + +// computeNewDocCount determines how many documents will be in the newly +// merged segment when obsoleted docs are dropped +func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 { + var newDocCount uint64 + for segI, segment := range segments { + newDocCount += segment.numDocs + if drops[segI] != nil { + newDocCount -= drops[segI].GetCardinality() + } + } + return newDocCount +} + +func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, + fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool, + newDocNumsIn [][]uint64, newSegDocCount uint64, chunkMode uint32, + w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) { + + var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) + var bufLoc []uint64 + + var postings *PostingsList + var postItr *PostingsIterator + + rv := make([]uint64, len(fieldsInv)) + fieldDvLocsStart := make([]uint64, len(fieldsInv)) + fieldDvLocsEnd := make([]uint64, len(fieldsInv)) + + // these int coders are initialized with chunk size 1024 + // however this will be reset to the correct chunk size + // while processing each individual field-term section + tfEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + locEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + + var vellumBuf bytes.Buffer + newVellum, err := vellum.New(&vellumBuf, nil) + if err != nil { + return nil, 0, err + } + + newRoaring := roaring.NewBitmap() + + // for each field + for fieldID, fieldName := range fieldsInv { + + // collect FST iterators from all active segments for this field + var newDocNums [][]uint64 + var drops []*roaring.Bitmap + var dicts []*Dictionary + var itrs []vellum.Iterator + + var segmentsInFocus []*SegmentBase + + for segmentI, segment := range segments { + + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + + dict, err2 := segment.dictionary(fieldName) + if err2 != nil { + return nil, 0, err2 + } + if dict != nil && dict.fst != nil { + itr, err2 := dict.fst.Iterator(nil, nil) + if err2 != nil && err2 != vellum.ErrIteratorDone { + return nil, 0, err2 + } + if itr != nil { + newDocNums = append(newDocNums, newDocNumsIn[segmentI]) + if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { + drops = append(drops, dropsIn[segmentI]) + } else { + drops = append(drops, nil) + } + dicts = append(dicts, dict) + itrs = append(itrs, itr) + segmentsInFocus = append(segmentsInFocus, segment) + } + } + } + + var prevTerm []byte + + newRoaring.Clear() + + var lastDocNum, lastFreq, lastNorm uint64 + + // determines whether to use "1-hit" encoding optimization + // when a term appears in only 1 doc, with no loc info, + // has freq of 1, and the docNum fits into 31-bits + use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) { + if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 { + docNum := uint64(newRoaring.Minimum()) + if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 { + return true, docNum, lastNorm + } + } + return false, 0, 0 + } + + finishTerm := func(term []byte) error { + tfEncoder.Close() + locEncoder.Close() + + postingsOffset, err := writePostings(newRoaring, + tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) + if err != nil { + return err + } + + if postingsOffset > 0 { + err = newVellum.Insert(term, postingsOffset) + if err != nil { + return err + } + } + + newRoaring.Clear() + + tfEncoder.Reset() + locEncoder.Reset() + + lastDocNum = 0 + lastFreq = 0 + lastNorm = 0 + + return nil + } + + enumerator, err := newEnumerator(itrs) + + for err == nil { + term, itrI, postingsOffset := enumerator.Current() + + if !bytes.Equal(prevTerm, term) { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + + // if the term changed, write out the info collected + // for the previous term + err = finishTerm(prevTerm) + if err != nil { + return nil, 0, err + } + } + if !bytes.Equal(prevTerm, term) || prevTerm == nil { + // compute cardinality of field-term in new seg + var newCard uint64 + lowItrIdxs, lowItrVals := enumerator.GetLowIdxsAndValues() + for i, idx := range lowItrIdxs { + pl, err := dicts[idx].postingsListFromOffset(lowItrVals[i], drops[idx], nil) + if err != nil { + return nil, 0, err + } + newCard += pl.Count() + } + // compute correct chunk size with this + chunkSize, err := getChunkSize(chunkMode, newCard, newSegDocCount) + if err != nil { + return nil, 0, err + } + // update encoders chunk + tfEncoder.SetChunkSize(chunkSize, newSegDocCount-1) + locEncoder.SetChunkSize(chunkSize, newSegDocCount-1) + } + + postings, err = dicts[itrI].postingsListFromOffset( + postingsOffset, drops[itrI], postings) + if err != nil { + return nil, 0, err + } + + postItr = postings.iterator(true, true, true, postItr) + + // can no longer optimize by copying, since chunk factor could have changed + lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( + fieldsMap, term, postItr, newDocNums[itrI], newRoaring, + tfEncoder, locEncoder, bufLoc) + + if err != nil { + return nil, 0, err + } + + prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem + prevTerm = append(prevTerm, term...) + + err = enumerator.Next() + } + if err != vellum.ErrIteratorDone { + return nil, 0, err + } + + err = finishTerm(prevTerm) + if err != nil { + return nil, 0, err + } + + dictOffset := uint64(w.Count()) + + err = newVellum.Close() + if err != nil { + return nil, 0, err + } + vellumData := vellumBuf.Bytes() + + // write out the length of the vellum data + n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData))) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, 0, err + } + + // write this vellum to disk + _, err = w.Write(vellumData) + if err != nil { + return nil, 0, err + } + + rv[fieldID] = dictOffset + + // get the field doc value offset (start) + fieldDvLocsStart[fieldID] = uint64(w.Count()) + + // update the field doc values + // NOTE: doc values continue to use legacy chunk mode + chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return nil, 0, err + } + fdvEncoder := newChunkedContentCoder(chunkSize, newSegDocCount-1, w, true) + + fdvReadersAvailable := false + var dvIterClone *docValueReader + for segmentI, segment := range segmentsInFocus { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + + fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) + if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists && + dvIter != nil { + fdvReadersAvailable = true + dvIterClone = dvIter.cloneInto(dvIterClone) + err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { + if newDocNums[segmentI][docNum] == docDropped { + return nil + } + err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms) + if err != nil { + return err + } + return nil + }) + if err != nil { + return nil, 0, err + } + } + } + + if fdvReadersAvailable { + err = fdvEncoder.Close() + if err != nil { + return nil, 0, err + } + + // persist the doc value details for this field + _, err = fdvEncoder.Write() + if err != nil { + return nil, 0, err + } + + // get the field doc value offset (end) + fieldDvLocsEnd[fieldID] = uint64(w.Count()) + } else { + fieldDvLocsStart[fieldID] = fieldNotUninverted + fieldDvLocsEnd[fieldID] = fieldNotUninverted + } + + // reset vellum buffer and vellum builder + vellumBuf.Reset() + err = newVellum.Reset(&vellumBuf) + if err != nil { + return nil, 0, err + } + } + + fieldDvLocsOffset := uint64(w.Count()) + + buf := bufMaxVarintLen64 + for i := 0; i < len(fieldDvLocsStart); i++ { + n := binary.PutUvarint(buf, fieldDvLocsStart[i]) + _, err := w.Write(buf[:n]) + if err != nil { + return nil, 0, err + } + n = binary.PutUvarint(buf, fieldDvLocsEnd[i]) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, 0, err + } + } + + return rv, fieldDvLocsOffset, nil +} + +func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { + next, err := postItr.Next() + for next != nil && err == nil { + hitNewDocNum := newDocNums[next.Number()] + if hitNewDocNum == docDropped { + return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum") + } + + newRoaring.Add(uint32(hitNewDocNum)) + + nextFreq := next.Frequency() + nextNorm := uint64(math.Float32bits(float32(next.Norm()))) + + locs := next.Locations() + + err = tfEncoder.Add(hitNewDocNum, + encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) + if err != nil { + return 0, 0, 0, nil, err + } + + if len(locs) > 0 { + numBytesLocs := 0 + for _, loc := range locs { + ap := loc.ArrayPositions() + numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1), + loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap) + } + + err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs)) + if err != nil { + return 0, 0, 0, nil, err + } + + for _, loc := range locs { + ap := loc.ArrayPositions() + if cap(bufLoc) < 5+len(ap) { + bufLoc = make([]uint64, 0, 5+len(ap)) + } + args := bufLoc[0:5] + args[0] = uint64(fieldsMap[loc.Field()] - 1) + args[1] = loc.Pos() + args[2] = loc.Start() + args[3] = loc.End() + args[4] = uint64(len(ap)) + args = append(args, ap...) + err = locEncoder.Add(hitNewDocNum, args...) + if err != nil { + return 0, 0, 0, nil, err + } + } + } + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + next, err = postItr.Next() + } + + return lastDocNum, lastFreq, lastNorm, bufLoc, err +} + +func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, + use1HitEncoding func(uint64) (bool, uint64, uint64), + w *CountHashWriter, bufMaxVarintLen64 []byte) ( + offset uint64, err error) { + termCardinality := postings.GetCardinality() + if termCardinality <= 0 { + return 0, nil + } + + if use1HitEncoding != nil { + encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) + if encodeAs1Hit { + return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil + } + } + + var tfOffset uint64 + tfOffset, _, err = tfEncoder.writeAt(w) + if err != nil { + return 0, err + } + + var locOffset uint64 + locOffset, _, err = locEncoder.writeAt(w) + if err != nil { + return 0, err + } + + postingsOffset := uint64(w.Count()) + + n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, locOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) + if err != nil { + return 0, err + } + + return postingsOffset, nil +} + +type varintEncoder func(uint64) (int, error) + +func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, + fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, + w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) { + var rv [][]uint64 // The remapped or newDocNums for each segment. + + var newDocNum uint64 + + var curr int + var data, compressed []byte + var metaBuf bytes.Buffer + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncode := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return metaBuf.Write(varBuf[:wb]) + } + + vals := make([][][]byte, len(fieldsInv)) + typs := make([][]byte, len(fieldsInv)) + poss := make([][][]uint64, len(fieldsInv)) + + var posBuf []uint64 + + docNumOffsets := make([]uint64, newSegDocCount) + + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + + // for each segment + for segI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return 0, nil, seg.ErrClosed + } + + segNewDocNums := make([]uint64, segment.numDocs) + + dropsI := drops[segI] + + // optimize when the field mapping is the same across all + // segments and there are no deletions, via byte-copying + // of stored docs bytes directly to the writer + if fieldsSame && (dropsI == nil || dropsI.GetCardinality() == 0) { + err := segment.copyStoredDocs(newDocNum, docNumOffsets, w) + if err != nil { + return 0, nil, err + } + + for i := uint64(0); i < segment.numDocs; i++ { + segNewDocNums[i] = newDocNum + newDocNum++ + } + rv = append(rv, segNewDocNums) + + continue + } + + // for each doc num + for docNum := uint64(0); docNum < segment.numDocs; docNum++ { + // TODO: roaring's API limits docNums to 32-bits? + if dropsI != nil && dropsI.Contains(uint32(docNum)) { + segNewDocNums[docNum] = docDropped + continue + } + + segNewDocNums[docNum] = newDocNum + + curr = 0 + metaBuf.Reset() + data = data[:0] + + posTemp := posBuf + + // collect all the data + for i := 0; i < len(fieldsInv); i++ { + vals[i] = vals[i][:0] + typs[i] = typs[i][:0] + poss[i] = poss[i][:0] + } + err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool { + fieldID := int(fieldsMap[field]) - 1 + vals[fieldID] = append(vals[fieldID], value) + typs[fieldID] = append(typs[fieldID], typ) + + // copy array positions to preserve them beyond the scope of this callback + var curPos []uint64 + if len(pos) > 0 { + if cap(posTemp) < len(pos) { + posBuf = make([]uint64, len(pos)*len(fieldsInv)) + posTemp = posBuf + } + curPos = posTemp[0:len(pos)] + copy(curPos, pos) + posTemp = posTemp[len(pos):] + } + poss[fieldID] = append(poss[fieldID], curPos) + + return true + }) + if err != nil { + return 0, nil, err + } + + // _id field special case optimizes ExternalID() lookups + idFieldVal := vals[uint16(0)][0] + _, err = metaEncode(uint64(len(idFieldVal))) + if err != nil { + return 0, nil, err + } + + // now walk the non-"_id" fields in order + for fieldID := 1; fieldID < len(fieldsInv); fieldID++ { + storedFieldValues := vals[fieldID] + + stf := typs[fieldID] + spf := poss[fieldID] + + var err2 error + curr, data, err2 = persistStoredFieldValues(fieldID, + storedFieldValues, stf, spf, curr, metaEncode, data) + if err2 != nil { + return 0, nil, err2 + } + } + + metaBytes := metaBuf.Bytes() + + compressed = snappy.Encode(compressed[:cap(compressed)], data) + + // record where we're about to start writing + docNumOffsets[newDocNum] = uint64(w.Count()) + + // write out the meta len and compressed data len + _, err = writeUvarints(w, + uint64(len(metaBytes)), + uint64(len(idFieldVal)+len(compressed))) + if err != nil { + return 0, nil, err + } + // now write the meta + _, err = w.Write(metaBytes) + if err != nil { + return 0, nil, err + } + // now write the _id field val (counted as part of the 'compressed' data) + _, err = w.Write(idFieldVal) + if err != nil { + return 0, nil, err + } + // now write the compressed data + _, err = w.Write(compressed) + if err != nil { + return 0, nil, err + } + + newDocNum++ + } + + rv = append(rv, segNewDocNums) + } + + // return value is the start of the stored index + storedIndexOffset := uint64(w.Count()) + + // now write out the stored doc index + for _, docNumOffset := range docNumOffsets { + err := binary.Write(w, binary.BigEndian, docNumOffset) + if err != nil { + return 0, nil, err + } + } + + return storedIndexOffset, rv, nil +} + +// copyStoredDocs writes out a segment's stored doc info, optimized by +// using a single Write() call for the entire set of bytes. The +// newDocNumOffsets is filled with the new offsets for each doc. +func (s *SegmentBase) copyStoredDocs(newDocNum uint64, newDocNumOffsets []uint64, + w *CountHashWriter) error { + if s.numDocs <= 0 { + return nil + } + + indexOffset0, storedOffset0, _, _, _ := + s.getDocStoredOffsets(0) // the segment's first doc + + indexOffsetN, storedOffsetN, readN, metaLenN, dataLenN := + s.getDocStoredOffsets(s.numDocs - 1) // the segment's last doc + + storedOffset0New := uint64(w.Count()) + + storedBytes := s.mem[storedOffset0 : storedOffsetN+readN+metaLenN+dataLenN] + _, err := w.Write(storedBytes) + if err != nil { + return err + } + + // remap the storedOffset's for the docs into new offsets relative + // to storedOffset0New, filling the given docNumOffsetsOut array + for indexOffset := indexOffset0; indexOffset <= indexOffsetN; indexOffset += 8 { + storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8]) + storedOffsetNew := storedOffset - storedOffset0 + storedOffset0New + newDocNumOffsets[newDocNum] = storedOffsetNew + newDocNum += 1 + } + + return nil +} + +// mergeFields builds a unified list of fields used across all the +// input segments, and computes whether the fields are the same across +// segments (which depends on fields to be sorted in the same way +// across segments) +func mergeFields(segments []*SegmentBase) (bool, []string) { + fieldsSame := true + + var segment0Fields []string + if len(segments) > 0 { + segment0Fields = segments[0].Fields() + } + + fieldsExist := map[string]struct{}{} + for _, segment := range segments { + fields := segment.Fields() + for fieldi, field := range fields { + fieldsExist[field] = struct{}{} + if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field { + fieldsSame = false + } + } + } + + rv := make([]string, 0, len(fieldsExist)) + // ensure _id stays first + rv = append(rv, "_id") + for k := range fieldsExist { + if k != "_id" { + rv = append(rv, k) + } + } + + sort.Strings(rv[1:]) // leave _id as first + + return fieldsSame, rv +} + +func isClosed(closeCh chan struct{}) bool { + select { + case <-closeCh: + return true + default: + return false + } +} diff --git a/vendor/github.com/blevesearch/zap/v13/new.go b/vendor/github.com/blevesearch/zap/v13/new.go new file mode 100644 index 000000000..98158186c --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/new.go @@ -0,0 +1,860 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "math" + "sort" + "sync" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/couchbase/vellum" + "github.com/golang/snappy" +) + +var NewSegmentBufferNumResultsBump int = 100 +var NewSegmentBufferNumResultsFactor float64 = 1.0 +var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 + +// ValidateDocFields can be set by applications to perform additional checks +// on fields in a document being added to a new segment, by default it does +// nothing. +// This API is experimental and may be removed at any time. +var ValidateDocFields = func(field document.Field) error { + return nil +} + +// AnalysisResultsToSegmentBase produces an in-memory zap-encoded +// SegmentBase from analysis results +func (z *ZapPlugin) New(results []*index.AnalysisResult) ( + segment.Segment, uint64, error) { + return z.newWithChunkMode(results, DefaultChunkMode) +} + +func (*ZapPlugin) newWithChunkMode(results []*index.AnalysisResult, + chunkMode uint32) (segment.Segment, uint64, error) { + s := interimPool.Get().(*interim) + + var br bytes.Buffer + if s.lastNumDocs > 0 { + // use previous results to initialize the buf with an estimate + // size, but note that the interim instance comes from a + // global interimPool, so multiple scorch instances indexing + // different docs can lead to low quality estimates + estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * + NewSegmentBufferNumResultsFactor) + estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * + NewSegmentBufferAvgBytesPerDocFactor) + br.Grow(estimateAvgBytesPerDoc * estimateNumResults) + } + + s.results = results + s.chunkMode = chunkMode + s.w = NewCountHashWriter(&br) + + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, + err := s.convert() + if err != nil { + return nil, uint64(0), err + } + + sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode, + s.FieldsMap, s.FieldsInv, uint64(len(results)), + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + + if err == nil && s.reset() == nil { + s.lastNumDocs = len(results) + s.lastOutSize = len(br.Bytes()) + interimPool.Put(s) + } + + return sb, uint64(len(br.Bytes())), err +} + +var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} + +// interim holds temporary working data used while converting from +// analysis results to a zap-encoded segment +type interim struct { + results []*index.AnalysisResult + + chunkMode uint32 + + w *CountHashWriter + + // FieldsMap adds 1 to field id to avoid zero value issues + // name -> field id + 1 + FieldsMap map[string]uint16 + + // FieldsInv is the inverse of FieldsMap + // field id -> name + FieldsInv []string + + // Term dictionaries for each field + // field id -> term -> postings list id + 1 + Dicts []map[string]uint64 + + // Terms for each field, where terms are sorted ascending + // field id -> []term + DictKeys [][]string + + // Fields whose IncludeDocValues is true + // field id -> bool + IncludeDocValues []bool + + // postings id -> bitmap of docNums + Postings []*roaring.Bitmap + + // postings id -> freq/norm's, one for each docNum in postings + FreqNorms [][]interimFreqNorm + freqNormsBacking []interimFreqNorm + + // postings id -> locs, one for each freq + Locs [][]interimLoc + locsBacking []interimLoc + + numTermsPerPostingsList []int // key is postings list id + numLocsPerPostingsList []int // key is postings list id + + builder *vellum.Builder + builderBuf bytes.Buffer + + metaBuf bytes.Buffer + + tmp0 []byte + tmp1 []byte + + lastNumDocs int + lastOutSize int +} + +func (s *interim) reset() (err error) { + s.results = nil + s.chunkMode = 0 + s.w = nil + s.FieldsMap = nil + s.FieldsInv = nil + for i := range s.Dicts { + s.Dicts[i] = nil + } + s.Dicts = s.Dicts[:0] + for i := range s.DictKeys { + s.DictKeys[i] = s.DictKeys[i][:0] + } + s.DictKeys = s.DictKeys[:0] + for i := range s.IncludeDocValues { + s.IncludeDocValues[i] = false + } + s.IncludeDocValues = s.IncludeDocValues[:0] + for _, idn := range s.Postings { + idn.Clear() + } + s.Postings = s.Postings[:0] + s.FreqNorms = s.FreqNorms[:0] + for i := range s.freqNormsBacking { + s.freqNormsBacking[i] = interimFreqNorm{} + } + s.freqNormsBacking = s.freqNormsBacking[:0] + s.Locs = s.Locs[:0] + for i := range s.locsBacking { + s.locsBacking[i] = interimLoc{} + } + s.locsBacking = s.locsBacking[:0] + s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0] + s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0] + s.builderBuf.Reset() + if s.builder != nil { + err = s.builder.Reset(&s.builderBuf) + } + s.metaBuf.Reset() + s.tmp0 = s.tmp0[:0] + s.tmp1 = s.tmp1[:0] + s.lastNumDocs = 0 + s.lastOutSize = 0 + + return err +} + +func (s *interim) grabBuf(size int) []byte { + buf := s.tmp0 + if cap(buf) < size { + buf = make([]byte, size) + s.tmp0 = buf + } + return buf[0:size] +} + +type interimStoredField struct { + vals [][]byte + typs []byte + arrayposs [][]uint64 // array positions +} + +type interimFreqNorm struct { + freq uint64 + norm float32 + numLocs int +} + +type interimLoc struct { + fieldID uint16 + pos uint64 + start uint64 + end uint64 + arrayposs []uint64 +} + +func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { + s.FieldsMap = map[string]uint16{} + + s.getOrDefineField("_id") // _id field is fieldID 0 + + for _, result := range s.results { + for _, field := range result.Document.CompositeFields { + s.getOrDefineField(field.Name()) + } + for _, field := range result.Document.Fields { + s.getOrDefineField(field.Name()) + } + } + + sort.Strings(s.FieldsInv[1:]) // keep _id as first field + + for fieldID, fieldName := range s.FieldsInv { + s.FieldsMap[fieldName] = uint16(fieldID + 1) + } + + if cap(s.IncludeDocValues) >= len(s.FieldsInv) { + s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)] + } else { + s.IncludeDocValues = make([]bool, len(s.FieldsInv)) + } + + s.prepareDicts() + + for _, dict := range s.DictKeys { + sort.Strings(dict) + } + + s.processDocuments() + + storedIndexOffset, err := s.writeStoredFields() + if err != nil { + return 0, 0, 0, nil, err + } + + var fdvIndexOffset uint64 + var dictOffsets []uint64 + + if len(s.results) > 0 { + fdvIndexOffset, dictOffsets, err = s.writeDicts() + if err != nil { + return 0, 0, 0, nil, err + } + } else { + dictOffsets = make([]uint64, len(s.FieldsInv)) + } + + fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets) + if err != nil { + return 0, 0, 0, nil, err + } + + return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil +} + +func (s *interim) getOrDefineField(fieldName string) int { + fieldIDPlus1, exists := s.FieldsMap[fieldName] + if !exists { + fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) + s.FieldsMap[fieldName] = fieldIDPlus1 + s.FieldsInv = append(s.FieldsInv, fieldName) + + s.Dicts = append(s.Dicts, make(map[string]uint64)) + + n := len(s.DictKeys) + if n < cap(s.DictKeys) { + s.DictKeys = s.DictKeys[:n+1] + s.DictKeys[n] = s.DictKeys[n][:0] + } else { + s.DictKeys = append(s.DictKeys, []string(nil)) + } + } + + return int(fieldIDPlus1 - 1) +} + +// fill Dicts and DictKeys from analysis results +func (s *interim) prepareDicts() { + var pidNext int + + var totTFs int + var totLocs int + + visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) { + dict := s.Dicts[fieldID] + dictKeys := s.DictKeys[fieldID] + + for term, tf := range tfs { + pidPlus1, exists := dict[term] + if !exists { + pidNext++ + pidPlus1 = uint64(pidNext) + + dict[term] = pidPlus1 + dictKeys = append(dictKeys, term) + + s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0) + s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0) + } + + pid := pidPlus1 - 1 + + s.numTermsPerPostingsList[pid] += 1 + s.numLocsPerPostingsList[pid] += len(tf.Locations) + + totLocs += len(tf.Locations) + } + + totTFs += len(tfs) + + s.DictKeys[fieldID] = dictKeys + } + + for _, result := range s.results { + // walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name())) + _, tf := field.Analyze() + visitField(fieldID, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + tf := result.Analyzed[i] + visitField(fieldID, tf) + } + } + + numPostingsLists := pidNext + + if cap(s.Postings) >= numPostingsLists { + s.Postings = s.Postings[:numPostingsLists] + } else { + postings := make([]*roaring.Bitmap, numPostingsLists) + copy(postings, s.Postings[:cap(s.Postings)]) + for i := 0; i < numPostingsLists; i++ { + if postings[i] == nil { + postings[i] = roaring.New() + } + } + s.Postings = postings + } + + if cap(s.FreqNorms) >= numPostingsLists { + s.FreqNorms = s.FreqNorms[:numPostingsLists] + } else { + s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) + } + + if cap(s.freqNormsBacking) >= totTFs { + s.freqNormsBacking = s.freqNormsBacking[:totTFs] + } else { + s.freqNormsBacking = make([]interimFreqNorm, totTFs) + } + + freqNormsBacking := s.freqNormsBacking + for pid, numTerms := range s.numTermsPerPostingsList { + s.FreqNorms[pid] = freqNormsBacking[0:0] + freqNormsBacking = freqNormsBacking[numTerms:] + } + + if cap(s.Locs) >= numPostingsLists { + s.Locs = s.Locs[:numPostingsLists] + } else { + s.Locs = make([][]interimLoc, numPostingsLists) + } + + if cap(s.locsBacking) >= totLocs { + s.locsBacking = s.locsBacking[:totLocs] + } else { + s.locsBacking = make([]interimLoc, totLocs) + } + + locsBacking := s.locsBacking + for pid, numLocs := range s.numLocsPerPostingsList { + s.Locs[pid] = locsBacking[0:0] + locsBacking = locsBacking[numLocs:] + } +} + +func (s *interim) processDocuments() { + numFields := len(s.FieldsInv) + reuseFieldLens := make([]int, numFields) + reuseFieldTFs := make([]analysis.TokenFrequencies, numFields) + + for docNum, result := range s.results { + for i := 0; i < numFields; i++ { // clear these for reuse + reuseFieldLens[i] = 0 + reuseFieldTFs[i] = nil + } + + s.processDocument(uint64(docNum), result, + reuseFieldLens, reuseFieldTFs) + } +} + +func (s *interim) processDocument(docNum uint64, + result *index.AnalysisResult, + fieldLens []int, fieldTFs []analysis.TokenFrequencies) { + visitField := func(fieldID uint16, fieldName string, + ln int, tf analysis.TokenFrequencies) { + fieldLens[fieldID] += ln + + existingFreqs := fieldTFs[fieldID] + if existingFreqs != nil { + existingFreqs.MergeAll(fieldName, tf) + } else { + fieldTFs[fieldID] = tf + } + } + + // walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name())) + ln, tf := field.Analyze() + visitField(fieldID, field.Name(), ln, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + ln := result.Length[i] + tf := result.Analyzed[i] + visitField(fieldID, field.Name(), ln, tf) + } + + // now that it's been rolled up into fieldTFs, walk that + for fieldID, tfs := range fieldTFs { + dict := s.Dicts[fieldID] + norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) + + for term, tf := range tfs { + pid := dict[term] - 1 + bs := s.Postings[pid] + bs.Add(uint32(docNum)) + + s.FreqNorms[pid] = append(s.FreqNorms[pid], + interimFreqNorm{ + freq: uint64(tf.Frequency()), + norm: norm, + numLocs: len(tf.Locations), + }) + + if len(tf.Locations) > 0 { + locs := s.Locs[pid] + + for _, loc := range tf.Locations { + var locf = uint16(fieldID) + if loc.Field != "" { + locf = uint16(s.getOrDefineField(loc.Field)) + } + var arrayposs []uint64 + if len(loc.ArrayPositions) > 0 { + arrayposs = loc.ArrayPositions + } + locs = append(locs, interimLoc{ + fieldID: locf, + pos: uint64(loc.Position), + start: uint64(loc.Start), + end: uint64(loc.End), + arrayposs: arrayposs, + }) + } + + s.Locs[pid] = locs + } + } + } +} + +func (s *interim) writeStoredFields() ( + storedIndexOffset uint64, err error) { + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncode := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return s.metaBuf.Write(varBuf[:wb]) + } + + data, compressed := s.tmp0[:0], s.tmp1[:0] + defer func() { s.tmp0, s.tmp1 = data, compressed }() + + // keyed by docNum + docStoredOffsets := make([]uint64, len(s.results)) + + // keyed by fieldID, for the current doc in the loop + docStoredFields := map[uint16]interimStoredField{} + + for docNum, result := range s.results { + for fieldID := range docStoredFields { // reset for next doc + delete(docStoredFields, fieldID) + } + + for _, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + + opts := field.Options() + + if opts.IsStored() { + isf := docStoredFields[fieldID] + isf.vals = append(isf.vals, field.Value()) + isf.typs = append(isf.typs, encodeFieldType(field)) + isf.arrayposs = append(isf.arrayposs, field.ArrayPositions()) + docStoredFields[fieldID] = isf + } + + if opts.IncludeDocValues() { + s.IncludeDocValues[fieldID] = true + } + + err := ValidateDocFields(field) + if err != nil { + return 0, err + } + } + + var curr int + + s.metaBuf.Reset() + data = data[:0] + + // _id field special case optimizes ExternalID() lookups + idFieldVal := docStoredFields[uint16(0)].vals[0] + _, err = metaEncode(uint64(len(idFieldVal))) + if err != nil { + return 0, err + } + + // handle non-"_id" fields + for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ { + isf, exists := docStoredFields[uint16(fieldID)] + if exists { + curr, data, err = persistStoredFieldValues( + fieldID, isf.vals, isf.typs, isf.arrayposs, + curr, metaEncode, data) + if err != nil { + return 0, err + } + } + } + + metaBytes := s.metaBuf.Bytes() + + compressed = snappy.Encode(compressed[:cap(compressed)], data) + + docStoredOffsets[docNum] = uint64(s.w.Count()) + + _, err := writeUvarints(s.w, + uint64(len(metaBytes)), + uint64(len(idFieldVal)+len(compressed))) + if err != nil { + return 0, err + } + + _, err = s.w.Write(metaBytes) + if err != nil { + return 0, err + } + + _, err = s.w.Write(idFieldVal) + if err != nil { + return 0, err + } + + _, err = s.w.Write(compressed) + if err != nil { + return 0, err + } + } + + storedIndexOffset = uint64(s.w.Count()) + + for _, docStoredOffset := range docStoredOffsets { + err = binary.Write(s.w, binary.BigEndian, docStoredOffset) + if err != nil { + return 0, err + } + } + + return storedIndexOffset, nil +} + +func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) { + dictOffsets = make([]uint64, len(s.FieldsInv)) + + fdvOffsetsStart := make([]uint64, len(s.FieldsInv)) + fdvOffsetsEnd := make([]uint64, len(s.FieldsInv)) + + buf := s.grabBuf(binary.MaxVarintLen64) + + // these int coders are initialized with chunk size 1024 + // however this will be reset to the correct chunk size + // while processing each individual field-term section + tfEncoder := newChunkedIntCoder(1024, uint64(len(s.results)-1)) + locEncoder := newChunkedIntCoder(1024, uint64(len(s.results)-1)) + + var docTermMap [][]byte + + if s.builder == nil { + s.builder, err = vellum.New(&s.builderBuf, nil) + if err != nil { + return 0, nil, err + } + } + + for fieldID, terms := range s.DictKeys { + if cap(docTermMap) < len(s.results) { + docTermMap = make([][]byte, len(s.results)) + } else { + docTermMap = docTermMap[0:len(s.results)] + for docNum := range docTermMap { // reset the docTermMap + docTermMap[docNum] = docTermMap[docNum][:0] + } + } + + dict := s.Dicts[fieldID] + + for _, term := range terms { // terms are already sorted + pid := dict[term] - 1 + + postingsBS := s.Postings[pid] + + freqNorms := s.FreqNorms[pid] + freqNormOffset := 0 + + locs := s.Locs[pid] + locOffset := 0 + + chunkSize, err := getChunkSize(s.chunkMode, postingsBS.GetCardinality(), uint64(len(s.results))) + if err != nil { + return 0, nil, err + } + tfEncoder.SetChunkSize(chunkSize, uint64(len(s.results)-1)) + locEncoder.SetChunkSize(chunkSize, uint64(len(s.results)-1)) + + postingsItr := postingsBS.Iterator() + for postingsItr.HasNext() { + docNum := uint64(postingsItr.Next()) + + freqNorm := freqNorms[freqNormOffset] + + err = tfEncoder.Add(docNum, + encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), + uint64(math.Float32bits(freqNorm.norm))) + if err != nil { + return 0, nil, err + } + + if freqNorm.numLocs > 0 { + numBytesLocs := 0 + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + numBytesLocs += totalUvarintBytes( + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs)), loc.arrayposs) + } + + err = locEncoder.Add(docNum, uint64(numBytesLocs)) + if err != nil { + return 0, nil, err + } + + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + err = locEncoder.Add(docNum, + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs))) + if err != nil { + return 0, nil, err + } + + err = locEncoder.Add(docNum, loc.arrayposs...) + if err != nil { + return 0, nil, err + } + } + + locOffset += freqNorm.numLocs + } + + freqNormOffset++ + + docTermMap[docNum] = append( + append(docTermMap[docNum], term...), + termSeparator) + } + + tfEncoder.Close() + locEncoder.Close() + + postingsOffset, err := + writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) + if err != nil { + return 0, nil, err + } + + if postingsOffset > uint64(0) { + err = s.builder.Insert([]byte(term), postingsOffset) + if err != nil { + return 0, nil, err + } + } + + tfEncoder.Reset() + locEncoder.Reset() + } + + err = s.builder.Close() + if err != nil { + return 0, nil, err + } + + // record where this dictionary starts + dictOffsets[fieldID] = uint64(s.w.Count()) + + vellumData := s.builderBuf.Bytes() + + // write out the length of the vellum data + n := binary.PutUvarint(buf, uint64(len(vellumData))) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + + // write this vellum to disk + _, err = s.w.Write(vellumData) + if err != nil { + return 0, nil, err + } + + // reset vellum for reuse + s.builderBuf.Reset() + + err = s.builder.Reset(&s.builderBuf) + if err != nil { + return 0, nil, err + } + + // write the field doc values + // NOTE: doc values continue to use legacy chunk mode + chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return 0, nil, err + } + fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(s.results)-1), s.w, false) + if s.IncludeDocValues[fieldID] { + for docNum, docTerms := range docTermMap { + if len(docTerms) > 0 { + err = fdvEncoder.Add(uint64(docNum), docTerms) + if err != nil { + return 0, nil, err + } + } + } + err = fdvEncoder.Close() + if err != nil { + return 0, nil, err + } + + fdvOffsetsStart[fieldID] = uint64(s.w.Count()) + + _, err = fdvEncoder.Write() + if err != nil { + return 0, nil, err + } + + fdvOffsetsEnd[fieldID] = uint64(s.w.Count()) + + fdvEncoder.Reset() + } else { + fdvOffsetsStart[fieldID] = fieldNotUninverted + fdvOffsetsEnd[fieldID] = fieldNotUninverted + } + } + + fdvIndexOffset = uint64(s.w.Count()) + + for i := 0; i < len(fdvOffsetsStart); i++ { + n := binary.PutUvarint(buf, fdvOffsetsStart[i]) + _, err := s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + n = binary.PutUvarint(buf, fdvOffsetsEnd[i]) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + } + + return fdvIndexOffset, dictOffsets, nil +} + +func encodeFieldType(f document.Field) byte { + fieldType := byte('x') + switch f.(type) { + case *document.TextField: + fieldType = 't' + case *document.NumericField: + fieldType = 'n' + case *document.DateTimeField: + fieldType = 'd' + case *document.BooleanField: + fieldType = 'b' + case *document.GeoPointField: + fieldType = 'g' + case *document.CompositeField: + fieldType = 'c' + } + return fieldType +} + +// returns the total # of bytes needed to encode the given uint64's +// into binary.PutUVarint() encoding +func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) { + n = numUvarintBytes(a) + n += numUvarintBytes(b) + n += numUvarintBytes(c) + n += numUvarintBytes(d) + n += numUvarintBytes(e) + for _, v := range more { + n += numUvarintBytes(v) + } + return n +} + +// returns # of bytes needed to encode x in binary.PutUvarint() encoding +func numUvarintBytes(x uint64) (n int) { + for x >= 0x80 { + x >>= 7 + n++ + } + return n + 1 +} diff --git a/vendor/github.com/blevesearch/zap/v13/plugin.go b/vendor/github.com/blevesearch/zap/v13/plugin.go new file mode 100644 index 000000000..38a0638d4 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/plugin.go @@ -0,0 +1,37 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "github.com/blevesearch/bleve/index/scorch/segment" +) + +// ZapPlugin implements the Plugin interface of +// the blevesearch/bleve/index/scorch/segment pkg +type ZapPlugin struct{} + +func (*ZapPlugin) Type() string { + return Type +} + +func (*ZapPlugin) Version() uint32 { + return Version +} + +// Plugin returns an instance segment.Plugin for use +// by the Scorch indexing scheme +func Plugin() segment.Plugin { + return &ZapPlugin{} +} diff --git a/vendor/github.com/blevesearch/zap/v13/posting.go b/vendor/github.com/blevesearch/zap/v13/posting.go new file mode 100644 index 000000000..3a6ee5483 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/posting.go @@ -0,0 +1,798 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "fmt" + "math" + "reflect" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizePostingsList int +var reflectStaticSizePostingsIterator int +var reflectStaticSizePosting int +var reflectStaticSizeLocation int + +func init() { + var pl PostingsList + reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) + var pi PostingsIterator + reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) + var p Posting + reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + +// FST or vellum value (uint64) encoding is determined by the top two +// highest-order or most significant bits... +// +// encoding : MSB +// name : 63 62 61...to...bit #0 (LSB) +// ----------+---+---+--------------------------------------------------- +// general : 0 | 0 | 62-bits of postingsOffset. +// ~ : 0 | 1 | reserved for future. +// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum. +// ~ : 1 | 1 | reserved for future. +// +// Encoding "general" is able to handle all cases, where the +// postingsOffset points to more information about the postings for +// the term. +// +// Encoding "1-hit" is used to optimize a commonly seen case when a +// term has only a single hit. For example, a term in the _id field +// will have only 1 hit. The "1-hit" encoding is used for a term +// in a field when... +// +// - term vector info is disabled for that field; +// - and, the term appears in only a single doc for that field; +// - and, the term's freq is exactly 1 in that single doc for that field; +// - and, the docNum must fit into 31-bits; +// +// Otherwise, the "general" encoding is used instead. +// +// In the "1-hit" encoding, the field in that single doc may have +// other terms, which is supported in the "1-hit" encoding by the +// positive float31 norm. + +const FSTValEncodingMask = uint64(0xc000000000000000) +const FSTValEncodingGeneral = uint64(0x0000000000000000) +const FSTValEncoding1Hit = uint64(0x8000000000000000) + +func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 { + return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum) +} + +func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) { + return (mask31Bits & v), (mask31Bits & (v >> 31)) +} + +const mask31Bits = uint64(0x000000007fffffff) + +func under32Bits(x uint64) bool { + return x <= mask31Bits +} + +const DocNum1HitFinished = math.MaxUint64 + +var NormBits1Hit = uint64(math.Float32bits(float32(1))) + +// PostingsList is an in-memory representation of a postings list +type PostingsList struct { + sb *SegmentBase + postingsOffset uint64 + freqOffset uint64 + locOffset uint64 + postings *roaring.Bitmap + except *roaring.Bitmap + + // when normBits1Hit != 0, then this postings list came from a + // 1-hit encoding, and only the docNum1Hit & normBits1Hit apply + docNum1Hit uint64 + normBits1Hit uint64 +} + +// represents an immutable, empty postings list +var emptyPostingsList = &PostingsList{} + +func (p *PostingsList) Size() int { + sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + +func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { + if p.normBits1Hit != 0 { + receiver.Add(uint32(p.docNum1Hit)) + return + } + + if p.postings != nil { + receiver.Or(p.postings) + } +} + +// Iterator returns an iterator for this postings list +func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool, + prealloc segment.PostingsIterator) segment.PostingsIterator { + if p.normBits1Hit == 0 && p.postings == nil { + return emptyPostingsIterator + } + + var preallocPI *PostingsIterator + pi, ok := prealloc.(*PostingsIterator) + if ok && pi != nil { + preallocPI = pi + } + if preallocPI == emptyPostingsIterator { + preallocPI = nil + } + + return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI) +} + +func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, + rv *PostingsIterator) *PostingsIterator { + if rv == nil { + rv = &PostingsIterator{} + } else { + freqNormReader := rv.freqNormReader + if freqNormReader != nil { + freqNormReader.reset() + } + + locReader := rv.locReader + if locReader != nil { + locReader.reset() + } + + nextLocs := rv.nextLocs[:0] + nextSegmentLocs := rv.nextSegmentLocs[:0] + + buf := rv.buf + + *rv = PostingsIterator{} // clear the struct + + rv.freqNormReader = freqNormReader + rv.locReader = locReader + + rv.nextLocs = nextLocs + rv.nextSegmentLocs = nextSegmentLocs + + rv.buf = buf + } + + rv.postings = p + rv.includeFreqNorm = includeFreq || includeNorm || includeLocs + rv.includeLocs = includeLocs + + if p.normBits1Hit != 0 { + // "1-hit" encoding + rv.docNum1Hit = p.docNum1Hit + rv.normBits1Hit = p.normBits1Hit + + if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) { + rv.docNum1Hit = DocNum1HitFinished + } + + return rv + } + + // "general" encoding, check if empty + if p.postings == nil { + return rv + } + + // initialize freq chunk reader + if rv.includeFreqNorm { + rv.freqNormReader = newChunkedIntDecoder(p.sb.mem, p.freqOffset) + } + + // initialize the loc chunk reader + if rv.includeLocs { + rv.locReader = newChunkedIntDecoder(p.sb.mem, p.locOffset) + } + + rv.all = p.postings.Iterator() + if p.except != nil { + rv.ActualBM = roaring.AndNot(p.postings, p.except) + rv.Actual = rv.ActualBM.Iterator() + } else { + rv.ActualBM = p.postings + rv.Actual = rv.all // Optimize to use same iterator for all & Actual. + } + + return rv +} + +// Count returns the number of items on this postings list +func (p *PostingsList) Count() uint64 { + var n, e uint64 + if p.normBits1Hit != 0 { + n = 1 + if p.except != nil && p.except.Contains(uint32(p.docNum1Hit)) { + e = 1 + } + } else if p.postings != nil { + n = p.postings.GetCardinality() + if p.except != nil { + e = p.postings.AndCardinality(p.except) + } + } + return n - e +} + +func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { + rv.postingsOffset = postingsOffset + + // handle "1-hit" encoding special case + if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit { + return rv.init1Hit(postingsOffset) + } + + // read the location of the freq/norm details + var n uint64 + var read int + + rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) + n += uint64(read) + + rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + var postingsLen uint64 + postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] + + if rv.postings == nil { + rv.postings = roaring.NewBitmap() + } + _, err := rv.postings.FromBuffer(roaringBytes) + if err != nil { + return fmt.Errorf("error loading roaring bitmap: %v", err) + } + + return nil +} + +func (rv *PostingsList) init1Hit(fstVal uint64) error { + docNum, normBits := FSTValDecode1Hit(fstVal) + + rv.docNum1Hit = docNum + rv.normBits1Hit = normBits + + return nil +} + +// PostingsIterator provides a way to iterate through the postings list +type PostingsIterator struct { + postings *PostingsList + all roaring.IntPeekable + Actual roaring.IntPeekable + ActualBM *roaring.Bitmap + + currChunk uint32 + freqNormReader *chunkedIntDecoder + locReader *chunkedIntDecoder + + next Posting // reused across Next() calls + nextLocs []Location // reused across Next() calls + nextSegmentLocs []segment.Location // reused across Next() calls + + docNum1Hit uint64 + normBits1Hit uint64 + + buf []byte + + includeFreqNorm bool + includeLocs bool +} + +var emptyPostingsIterator = &PostingsIterator{} + +func (i *PostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + + i.next.Size() + // account for freqNormReader, locReader if we start using this. + for _, entry := range i.nextLocs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + +func (i *PostingsIterator) loadChunk(chunk int) error { + if i.includeFreqNorm { + err := i.freqNormReader.loadChunk(chunk) + if err != nil { + return err + } + } + + if i.includeLocs { + err := i.locReader.loadChunk(chunk) + if err != nil { + return err + } + } + + i.currChunk = uint32(chunk) + return nil +} + +func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { + if i.normBits1Hit != 0 { + return 1, i.normBits1Hit, false, nil + } + + freqHasLocs, err := i.freqNormReader.readUvarint() + if err != nil { + return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) + } + + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) + + normBits, err := i.freqNormReader.readUvarint() + if err != nil { + return 0, 0, false, fmt.Errorf("error reading norm: %v", err) + } + + return freq, normBits, hasLocs, nil +} + +func (i *PostingsIterator) skipFreqNormReadHasLocs() (bool, error) { + if i.normBits1Hit != 0 { + return false, nil + } + + freqHasLocs, err := i.freqNormReader.readUvarint() + if err != nil { + return false, fmt.Errorf("error reading freqHasLocs: %v", err) + } + + i.freqNormReader.SkipUvarint() // Skip normBits. + + return freqHasLocs&0x01 != 0, nil // See decodeFreqHasLocs() / hasLocs. +} + +func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { + rv := freq << 1 + if hasLocs { + rv = rv | 0x01 // 0'th LSB encodes whether there are locations + } + return rv +} + +func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { + freq := freqHasLocs >> 1 + hasLocs := freqHasLocs&0x01 != 0 + return freq, hasLocs +} + +// readLocation processes all the integers on the stream representing a single +// location. +func (i *PostingsIterator) readLocation(l *Location) error { + // read off field + fieldID, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location field: %v", err) + } + // read off pos + pos, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location pos: %v", err) + } + // read off start + start, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location start: %v", err) + } + // read off end + end, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location end: %v", err) + } + // read off num array pos + numArrayPos, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location num array pos: %v", err) + } + + l.field = i.postings.sb.fieldsInv[fieldID] + l.pos = pos + l.start = start + l.end = end + + if cap(l.ap) < int(numArrayPos) { + l.ap = make([]uint64, int(numArrayPos)) + } else { + l.ap = l.ap[:int(numArrayPos)] + } + + // read off array positions + for k := 0; k < int(numArrayPos); k++ { + ap, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading array position: %v", err) + } + + l.ap[k] = ap + } + + return nil +} + +// Next returns the next posting on the postings list, or nil at the end +func (i *PostingsIterator) Next() (segment.Posting, error) { + return i.nextAtOrAfter(0) +} + +// Advance returns the posting at the specified docNum or it is not present +// the next posting, or if the end is reached, nil +func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) { + return i.nextAtOrAfter(docNum) +} + +// Next returns the next posting on the postings list, or nil at the end +func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) { + docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter) + if err != nil || !exists { + return nil, err + } + + i.next = Posting{} // clear the struct + rv := &i.next + rv.docNum = docNum + + if !i.includeFreqNorm { + return rv, nil + } + + var normBits uint64 + var hasLocs bool + + rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + if err != nil { + return nil, err + } + + rv.norm = math.Float32frombits(uint32(normBits)) + + if i.includeLocs && hasLocs { + // prepare locations into reused slices, where we assume + // rv.freq >= "number of locs", since in a composite field, + // some component fields might have their IncludeTermVector + // flags disabled while other component fields are enabled + if cap(i.nextLocs) >= int(rv.freq) { + i.nextLocs = i.nextLocs[0:rv.freq] + } else { + i.nextLocs = make([]Location, rv.freq, rv.freq*2) + } + if cap(i.nextSegmentLocs) < int(rv.freq) { + i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2) + } + rv.locs = i.nextSegmentLocs[:0] + + numLocsBytes, err := i.locReader.readUvarint() + if err != nil { + return nil, fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + j := 0 + startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader + for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) { + err := i.readLocation(&i.nextLocs[j]) + if err != nil { + return nil, err + } + rv.locs = append(rv.locs, &i.nextLocs[j]) + j++ + } + } + + return rv, nil +} + +// nextDocNum returns the next docNum on the postings list, and also +// sets up the currChunk / loc related fields of the iterator. +func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) { + if i.normBits1Hit != 0 { + if i.docNum1Hit == DocNum1HitFinished { + return 0, false, nil + } + if i.docNum1Hit < atOrAfter { + // advanced past our 1-hit + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum + return 0, false, nil + } + docNum := i.docNum1Hit + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum + return docNum, true, nil + } + + if i.Actual == nil || !i.Actual.HasNext() { + return 0, false, nil + } + + if i.postings == nil || i.postings.postings == i.ActualBM { + return i.nextDocNumAtOrAfterClean(atOrAfter) + } + + i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) + + if !i.Actual.HasNext() { + // couldn't find anything + return 0, false, nil + } + + n := i.Actual.Next() + allN := i.all.Next() + + chunkSize, err := getChunkSize(i.postings.sb.chunkMode, i.postings.postings.GetCardinality(), i.postings.sb.numDocs) + if err != nil { + return 0, false, err + } + nChunk := n / uint32(chunkSize) + + // when allN becomes >= to here, then allN is in the same chunk as nChunk. + allNReachesNChunk := nChunk * uint32(chunkSize) + + // n is the next actual hit (excluding some postings), and + // allN is the next hit in the full postings, and + // if they don't match, move 'all' forwards until they do + for allN != n { + // we've reached same chunk, so move the freq/norm/loc decoders forward + if i.includeFreqNorm && allN >= allNReachesNChunk { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, err + } + } + + allN = i.all.Next() + } + + if i.includeFreqNorm && (i.currChunk != nChunk || i.freqNormReader.isNil()) { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + return uint64(n), true, nil +} + +// optimization when the postings list is "clean" (e.g., no updates & +// no deletions) where the all bitmap is the same as the actual bitmap +func (i *PostingsIterator) nextDocNumAtOrAfterClean( + atOrAfter uint64) (uint64, bool, error) { + + if !i.includeFreqNorm { + i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) + + if !i.Actual.HasNext() { + return 0, false, nil // couldn't find anything + } + + return uint64(i.Actual.Next()), true, nil + } + + chunkSize, err := getChunkSize(i.postings.sb.chunkMode, i.postings.postings.GetCardinality(), i.postings.sb.numDocs) + if err != nil { + return 0, false, err + } + + // freq-norm's needed, so maintain freq-norm chunk reader + sameChunkNexts := 0 // # of times we called Next() in the same chunk + n := i.Actual.Next() + nChunk := n / uint32(chunkSize) + + for uint64(n) < atOrAfter && i.Actual.HasNext() { + n = i.Actual.Next() + + nChunkPrev := nChunk + nChunk = n / uint32(chunkSize) + + if nChunk != nChunkPrev { + sameChunkNexts = 0 + } else { + sameChunkNexts += 1 + } + } + + if uint64(n) < atOrAfter { + // couldn't find anything + return 0, false, nil + } + + for j := 0; j < sameChunkNexts; j++ { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err) + } + } + + if i.currChunk != nChunk || i.freqNormReader.isNil() { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + return uint64(n), true, nil +} + +func (i *PostingsIterator) currChunkNext(nChunk uint32) error { + if i.currChunk != nChunk || i.freqNormReader.isNil() { + err := i.loadChunk(int(nChunk)) + if err != nil { + return fmt.Errorf("error loading chunk: %v", err) + } + } + + // read off freq/offsets even though we don't care about them + hasLocs, err := i.skipFreqNormReadHasLocs() + if err != nil { + return err + } + + if i.includeLocs && hasLocs { + numLocsBytes, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + // skip over all the location bytes + i.locReader.SkipBytes(int(numLocsBytes)) + } + + return nil +} + +// DocNum1Hit returns the docNum and true if this is "1-hit" optimized +// and the docNum is available. +func (p *PostingsIterator) DocNum1Hit() (uint64, bool) { + if p.normBits1Hit != 0 && p.docNum1Hit != DocNum1HitFinished { + return p.docNum1Hit, true + } + return 0, false +} + +// ActualBitmap returns the underlying actual bitmap +// which can be used up the stack for optimizations +func (p *PostingsIterator) ActualBitmap() *roaring.Bitmap { + return p.ActualBM +} + +// ReplaceActual replaces the ActualBM with the provided +// bitmap +func (p *PostingsIterator) ReplaceActual(abm *roaring.Bitmap) { + p.ActualBM = abm + p.Actual = abm.Iterator() +} + +// PostingsIteratorFromBitmap constructs a PostingsIterator given an +// "actual" bitmap. +func PostingsIteratorFromBitmap(bm *roaring.Bitmap, + includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) { + return &PostingsIterator{ + ActualBM: bm, + Actual: bm.Iterator(), + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + +// PostingsIteratorFrom1Hit constructs a PostingsIterator given a +// 1-hit docNum. +func PostingsIteratorFrom1Hit(docNum1Hit uint64, + includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) { + return &PostingsIterator{ + docNum1Hit: docNum1Hit, + normBits1Hit: NormBits1Hit, + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + +// Posting is a single entry in a postings list +type Posting struct { + docNum uint64 + freq uint64 + norm float32 + locs []segment.Location +} + +func (p *Posting) Size() int { + sizeInBytes := reflectStaticSizePosting + + for _, entry := range p.locs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + +// Number returns the document number of this posting in this segment +func (p *Posting) Number() uint64 { + return p.docNum +} + +// Frequency returns the frequencies of occurrence of this term in this doc/field +func (p *Posting) Frequency() uint64 { + return p.freq +} + +// Norm returns the normalization factor for this posting +func (p *Posting) Norm() float64 { + return float64(p.norm) +} + +// Locations returns the location information for each occurrence +func (p *Posting) Locations() []segment.Location { + return p.locs +} + +// Location represents the location of a single occurrence +type Location struct { + field string + pos uint64 + start uint64 + end uint64 + ap []uint64 +} + +func (l *Location) Size() int { + return reflectStaticSizeLocation + + len(l.field) + + len(l.ap)*size.SizeOfUint64 +} + +// Field returns the name of the field (useful in composite fields to know +// which original field the value came from) +func (l *Location) Field() string { + return l.field +} + +// Start returns the start byte offset of this occurrence +func (l *Location) Start() uint64 { + return l.start +} + +// End returns the end byte offset of this occurrence +func (l *Location) End() uint64 { + return l.end +} + +// Pos returns the 1-based phrase position of this occurrence +func (l *Location) Pos() uint64 { + return l.pos +} + +// ArrayPositions returns the array position vector associated with this occurrence +func (l *Location) ArrayPositions() []uint64 { + return l.ap +} diff --git a/vendor/github.com/blevesearch/zap/v13/read.go b/vendor/github.com/blevesearch/zap/v13/read.go new file mode 100644 index 000000000..e47d4c6ab --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/read.go @@ -0,0 +1,43 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import "encoding/binary" + +func (s *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) { + _, storedOffset, n, metaLen, dataLen := s.getDocStoredOffsets(docNum) + + meta := s.mem[storedOffset+n : storedOffset+n+metaLen] + data := s.mem[storedOffset+n+metaLen : storedOffset+n+metaLen+dataLen] + + return meta, data +} + +func (s *SegmentBase) getDocStoredOffsets(docNum uint64) ( + uint64, uint64, uint64, uint64, uint64) { + indexOffset := s.storedIndexOffset + (8 * docNum) + + storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8]) + + var n uint64 + + metaLen, read := binary.Uvarint(s.mem[storedOffset : storedOffset+binary.MaxVarintLen64]) + n += uint64(read) + + dataLen, read := binary.Uvarint(s.mem[storedOffset+n : storedOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + return indexOffset, storedOffset, n, metaLen, dataLen +} diff --git a/vendor/github.com/blevesearch/zap/v13/segment.go b/vendor/github.com/blevesearch/zap/v13/segment.go new file mode 100644 index 000000000..e8b1f067f --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/segment.go @@ -0,0 +1,572 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "os" + "sync" + "unsafe" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" + "github.com/couchbase/vellum" + mmap "github.com/blevesearch/mmap-go" + "github.com/golang/snappy" +) + +var reflectStaticSizeSegmentBase int + +func init() { + var sb SegmentBase + reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) +} + +// Open returns a zap impl of a segment +func (*ZapPlugin) Open(path string) (segment.Segment, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + mm, err := mmap.Map(f, mmap.RDONLY, 0) + if err != nil { + // mmap failed, try to close the file + _ = f.Close() + return nil, err + } + + rv := &Segment{ + SegmentBase: SegmentBase{ + mem: mm[0 : len(mm)-FooterSize], + fieldsMap: make(map[string]uint16), + fieldDvReaders: make(map[uint16]*docValueReader), + fieldFSTs: make(map[uint16]*vellum.FST), + }, + f: f, + mm: mm, + path: path, + refs: 1, + } + rv.SegmentBase.updateSize() + + err = rv.loadConfig() + if err != nil { + _ = rv.Close() + return nil, err + } + + err = rv.loadFields() + if err != nil { + _ = rv.Close() + return nil, err + } + + err = rv.loadDvReaders() + if err != nil { + _ = rv.Close() + return nil, err + } + + return rv, nil +} + +// SegmentBase is a memory only, read-only implementation of the +// segment.Segment interface, using zap's data representation. +type SegmentBase struct { + mem []byte + memCRC uint32 + chunkMode uint32 + fieldsMap map[string]uint16 // fieldName -> fieldID+1 + fieldsInv []string // fieldID -> fieldName + numDocs uint64 + storedIndexOffset uint64 + fieldsIndexOffset uint64 + docValueOffset uint64 + dictLocs []uint64 + fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field + fieldDvNames []string // field names cached in fieldDvReaders + size uint64 + + m sync.Mutex + fieldFSTs map[uint16]*vellum.FST +} + +func (sb *SegmentBase) Size() int { + return int(sb.size) +} + +func (sb *SegmentBase) updateSize() { + sizeInBytes := reflectStaticSizeSegmentBase + + cap(sb.mem) + + // fieldsMap + for k := range sb.fieldsMap { + sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16 + } + + // fieldsInv, dictLocs + for _, entry := range sb.fieldsInv { + sizeInBytes += len(entry) + size.SizeOfString + } + sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64 + + // fieldDvReaders + for _, v := range sb.fieldDvReaders { + sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr + if v != nil { + sizeInBytes += v.size() + } + } + + sb.size = uint64(sizeInBytes) +} + +func (sb *SegmentBase) AddRef() {} +func (sb *SegmentBase) DecRef() (err error) { return nil } +func (sb *SegmentBase) Close() (err error) { return nil } + +// Segment implements a persisted segment.Segment interface, by +// embedding an mmap()'ed SegmentBase. +type Segment struct { + SegmentBase + + f *os.File + mm mmap.MMap + path string + version uint32 + crc uint32 + + m sync.Mutex // Protects the fields that follow. + refs int64 +} + +func (s *Segment) Size() int { + // 8 /* size of file pointer */ + // 4 /* size of version -> uint32 */ + // 4 /* size of crc -> uint32 */ + sizeOfUints := 16 + + sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints + + // mutex, refs -> int64 + sizeInBytes += 16 + + // do not include the mmap'ed part + return sizeInBytes + s.SegmentBase.Size() - cap(s.mem) +} + +func (s *Segment) AddRef() { + s.m.Lock() + s.refs++ + s.m.Unlock() +} + +func (s *Segment) DecRef() (err error) { + s.m.Lock() + s.refs-- + if s.refs == 0 { + err = s.closeActual() + } + s.m.Unlock() + return err +} + +func (s *Segment) loadConfig() error { + crcOffset := len(s.mm) - 4 + s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4]) + + verOffset := crcOffset - 4 + s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) + if s.version != Version { + return fmt.Errorf("unsupported version %d", s.version) + } + + chunkOffset := verOffset - 4 + s.chunkMode = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4]) + + docValueOffset := chunkOffset - 8 + s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8]) + + fieldsIndexOffset := docValueOffset - 8 + s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8]) + + storedIndexOffset := fieldsIndexOffset - 8 + s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8]) + + numDocsOffset := storedIndexOffset - 8 + s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8]) + return nil +} + +func (s *SegmentBase) loadFields() error { + // NOTE for now we assume the fields index immediately precedes + // the footer, and if this changes, need to adjust accordingly (or + // store explicit length), where s.mem was sliced from s.mm in Open(). + fieldsIndexEnd := uint64(len(s.mem)) + + // iterate through fields index + var fieldID uint64 + for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { + addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8]) + + dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd]) + n := uint64(read) + s.dictLocs = append(s.dictLocs, dictLoc) + + var nameLen uint64 + nameLen, read = binary.Uvarint(s.mem[addr+n : fieldsIndexEnd]) + n += uint64(read) + + name := string(s.mem[addr+n : addr+n+nameLen]) + s.fieldsInv = append(s.fieldsInv, name) + s.fieldsMap[name] = uint16(fieldID + 1) + + fieldID++ + } + return nil +} + +// Dictionary returns the term dictionary for the specified field +func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) { + dict, err := s.dictionary(field) + if err == nil && dict == nil { + return &segment.EmptyDictionary{}, nil + } + return dict, err +} + +func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { + fieldIDPlus1 := sb.fieldsMap[field] + if fieldIDPlus1 > 0 { + rv = &Dictionary{ + sb: sb, + field: field, + fieldID: fieldIDPlus1 - 1, + } + + dictStart := sb.dictLocs[rv.fieldID] + if dictStart > 0 { + var ok bool + sb.m.Lock() + if rv.fst, ok = sb.fieldFSTs[rv.fieldID]; !ok { + // read the length of the vellum data + vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64]) + fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] + rv.fst, err = vellum.Load(fstBytes) + if err != nil { + sb.m.Unlock() + return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) + } + + sb.fieldFSTs[rv.fieldID] = rv.fst + } + + sb.m.Unlock() + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) + } + + } + } + + return rv, nil +} + +// visitDocumentCtx holds data structures that are reusable across +// multiple VisitDocument() calls to avoid memory allocations +type visitDocumentCtx struct { + buf []byte + reader bytes.Reader + arrayPos []uint64 +} + +var visitDocumentCtxPool = sync.Pool{ + New: func() interface{} { + reuse := &visitDocumentCtx{} + return reuse + }, +} + +// VisitDocument invokes the DocFieldValueVistor for each stored field +// for the specified doc number +func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + return s.visitDocument(vdc, num, visitor) +} + +func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64, + visitor segment.DocumentFieldValueVisitor) error { + // first make sure this is a valid number in this segment + if num < s.numDocs { + meta, compressed := s.getDocStoredMetaAndCompressed(num) + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + idFieldVal := compressed[:idFieldValLen] + + keepGoing := visitor("_id", byte('t'), idFieldVal, nil) + if !keepGoing { + visitDocumentCtxPool.Put(vdc) + return nil + } + + // handle non-"_id" fields + compressed = compressed[idFieldValLen:] + + uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) + if err != nil { + return err + } + + for keepGoing { + field, err := binary.ReadUvarint(&vdc.reader) + if err == io.EOF { + break + } + if err != nil { + return err + } + typ, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + offset, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + l, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + numap, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + var arrayPos []uint64 + if numap > 0 { + if cap(vdc.arrayPos) < int(numap) { + vdc.arrayPos = make([]uint64, numap) + } + arrayPos = vdc.arrayPos[:numap] + for i := 0; i < int(numap); i++ { + ap, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + arrayPos[i] = ap + } + } + + value := uncompressed[offset : offset+l] + keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) + } + + vdc.buf = uncompressed + } + return nil +} + +// DocID returns the value of the _id field for the given docNum +func (s *SegmentBase) DocID(num uint64) ([]byte, error) { + if num >= s.numDocs { + return nil, nil + } + + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + + meta, compressed := s.getDocStoredMetaAndCompressed(num) + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return nil, err + } + idFieldVal := compressed[:idFieldValLen] + + visitDocumentCtxPool.Put(vdc) + + return idFieldVal, nil +} + +// Count returns the number of documents in this segment. +func (s *SegmentBase) Count() uint64 { + return s.numDocs +} + +// DocNumbers returns a bitset corresponding to the doc numbers of all the +// provided _id strings +func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { + rv := roaring.New() + + if len(s.fieldsMap) > 0 { + idDict, err := s.dictionary("_id") + if err != nil { + return nil, err + } + + postingsList := emptyPostingsList + + sMax, err := idDict.fst.GetMaxKey() + if err != nil { + return nil, err + } + sMaxStr := string(sMax) + filteredIds := make([]string, 0, len(ids)) + for _, id := range ids { + if id <= sMaxStr { + filteredIds = append(filteredIds, id) + } + } + + for _, id := range filteredIds { + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) + if err != nil { + return nil, err + } + postingsList.OrInto(rv) + } + } + + return rv, nil +} + +// Fields returns the field names used in this segment +func (s *SegmentBase) Fields() []string { + return s.fieldsInv +} + +// Path returns the path of this segment on disk +func (s *Segment) Path() string { + return s.path +} + +// Close releases all resources associated with this segment +func (s *Segment) Close() (err error) { + return s.DecRef() +} + +func (s *Segment) closeActual() (err error) { + if s.mm != nil { + err = s.mm.Unmap() + } + // try to close file even if unmap failed + if s.f != nil { + err2 := s.f.Close() + if err == nil { + // try to return first error + err = err2 + } + } + return +} + +// some helpers i started adding for the command-line utility + +// Data returns the underlying mmaped data slice +func (s *Segment) Data() []byte { + return s.mm +} + +// CRC returns the CRC value stored in the file footer +func (s *Segment) CRC() uint32 { + return s.crc +} + +// Version returns the file version in the file footer +func (s *Segment) Version() uint32 { + return s.version +} + +// ChunkFactor returns the chunk factor in the file footer +func (s *Segment) ChunkMode() uint32 { + return s.chunkMode +} + +// FieldsIndexOffset returns the fields index offset in the file footer +func (s *Segment) FieldsIndexOffset() uint64 { + return s.fieldsIndexOffset +} + +// StoredIndexOffset returns the stored value index offset in the file footer +func (s *Segment) StoredIndexOffset() uint64 { + return s.storedIndexOffset +} + +// DocValueOffset returns the docValue offset in the file footer +func (s *Segment) DocValueOffset() uint64 { + return s.docValueOffset +} + +// NumDocs returns the number of documents in the file footer +func (s *Segment) NumDocs() uint64 { + return s.numDocs +} + +// DictAddr is a helper function to compute the file offset where the +// dictionary is stored for the specified field. +func (s *Segment) DictAddr(field string) (uint64, error) { + fieldIDPlus1, ok := s.fieldsMap[field] + if !ok { + return 0, fmt.Errorf("no such field '%s'", field) + } + + return s.dictLocs[fieldIDPlus1-1], nil +} + +func (s *SegmentBase) loadDvReaders() error { + if s.docValueOffset == fieldNotUninverted || s.numDocs == 0 { + return nil + } + + var read uint64 + for fieldID, field := range s.fieldsInv { + var fieldLocStart, fieldLocEnd uint64 + var n int + fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID) + } + read += uint64(n) + fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID) + } + read += uint64(n) + + fieldDvReader, err := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) + if err != nil { + return err + } + if fieldDvReader != nil { + s.fieldDvReaders[uint16(fieldID)] = fieldDvReader + s.fieldDvNames = append(s.fieldDvNames, field) + } + } + + return nil +} diff --git a/vendor/github.com/blevesearch/zap/v13/write.go b/vendor/github.com/blevesearch/zap/v13/write.go new file mode 100644 index 000000000..77aefdbfc --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/write.go @@ -0,0 +1,145 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "io" + + "github.com/RoaringBitmap/roaring" +) + +// writes out the length of the roaring bitmap in bytes as varint +// then writes out the roaring bitmap itself +func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, + reuseBufVarint []byte) (int, error) { + buf, err := r.ToBytes() + if err != nil { + return 0, err + } + + var tw int + + // write out the length + n := binary.PutUvarint(reuseBufVarint, uint64(len(buf))) + nw, err := w.Write(reuseBufVarint[:n]) + tw += nw + if err != nil { + return tw, err + } + + // write out the roaring bytes + nw, err = w.Write(buf) + tw += nw + if err != nil { + return tw, err + } + + return tw, nil +} + +func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) { + var rv uint64 + var fieldsOffsets []uint64 + + for fieldID, fieldName := range fieldsInv { + // record start of this field + fieldsOffsets = append(fieldsOffsets, uint64(w.Count())) + + // write out the dict location and field name length + _, err := writeUvarints(w, dictLocs[fieldID], uint64(len(fieldName))) + if err != nil { + return 0, err + } + + // write out the field name + _, err = w.Write([]byte(fieldName)) + if err != nil { + return 0, err + } + } + + // now write out the fields index + rv = uint64(w.Count()) + for fieldID := range fieldsInv { + err := binary.Write(w, binary.BigEndian, fieldsOffsets[fieldID]) + if err != nil { + return 0, err + } + } + + return rv, nil +} + +// FooterSize is the size of the footer record in bytes +// crc + ver + chunk + field offset + stored offset + num docs + docValueOffset +const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + 8 + +func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, + chunkMode uint32, crcBeforeFooter uint32, writerIn io.Writer) error { + w := NewCountHashWriter(writerIn) + w.crc = crcBeforeFooter + + // write out the number of docs + err := binary.Write(w, binary.BigEndian, numDocs) + if err != nil { + return err + } + // write out the stored field index location: + err = binary.Write(w, binary.BigEndian, storedIndexOffset) + if err != nil { + return err + } + // write out the field index location + err = binary.Write(w, binary.BigEndian, fieldsIndexOffset) + if err != nil { + return err + } + // write out the fieldDocValue location + err = binary.Write(w, binary.BigEndian, docValueOffset) + if err != nil { + return err + } + // write out 32-bit chunk factor + err = binary.Write(w, binary.BigEndian, chunkMode) + if err != nil { + return err + } + // write out 32-bit version + err = binary.Write(w, binary.BigEndian, Version) + if err != nil { + return err + } + // write out CRC-32 of everything upto but not including this CRC + err = binary.Write(w, binary.BigEndian, w.crc) + if err != nil { + return err + } + return nil +} + +func writeUvarints(w io.Writer, vals ...uint64) (tw int, err error) { + buf := make([]byte, binary.MaxVarintLen64) + for _, val := range vals { + n := binary.PutUvarint(buf, val) + var nw int + nw, err = w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + } + return tw, err +} diff --git a/vendor/github.com/blevesearch/zap/v13/zap.md b/vendor/github.com/blevesearch/zap/v13/zap.md new file mode 100644 index 000000000..d74dc548b --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v13/zap.md @@ -0,0 +1,177 @@ +# ZAP File Format + +## Legend + +### Sections + + |========| + | | section + |========| + +### Fixed-size fields + + |--------| |----| |--| |-| + | | uint64 | | uint32 | | uint16 | | uint8 + |--------| |----| |--| |-| + +### Varints + + |~~~~~~~~| + | | varint(up to uint64) + |~~~~~~~~| + +### Arbitrary-length fields + + |--------...---| + | | arbitrary-length field (string, vellum, roaring bitmap) + |--------...---| + +### Chunked data + + [--------] + [ ] + [--------] + +## Overview + +Footer section describes the configuration of particular ZAP file. The format of footer is version-dependent, so it is necessary to check `V` field before the parsing. + + |==================================================| + | Stored Fields | + |==================================================| + |-----> | Stored Fields Index | + | |==================================================| + | | Dictionaries + Postings + DocValues | + | |==================================================| + | |---> | DocValues Index | + | | |==================================================| + | | | Fields | + | | |==================================================| + | | |-> | Fields Index | + | | | |========|========|========|========|====|====|====| + | | | | D# | SF | F | FDV | CF | V | CC | (Footer) + | | | |========|====|===|====|===|====|===|====|====|====| + | | | | | | + |-+-+-----------------| | | + | |--------------------------| | + |-------------------------------------| + + D#. Number of Docs. + SF. Stored Fields Index Offset. + F. Field Index Offset. + FDV. Field DocValue Offset. + CF. Chunk Factor. + V. Version. + CC. CRC32. + +## Stored Fields + +Stored Fields Index is `D#` consecutive 64-bit unsigned integers - offsets, where relevant Stored Fields Data records are located. + + 0 [SF] [SF + D# * 8] + | Stored Fields | Stored Fields Index | + |================================|==================================| + | | | + | |--------------------| ||--------|--------|. . .|--------|| + | |-> | Stored Fields Data | || 0 | 1 | | D# - 1 || + | | |--------------------| ||--------|----|---|. . .|--------|| + | | | | | + |===|============================|==============|===================| + | | + |-------------------------------------------| + +Stored Fields Data is an arbitrary size record, which consists of metadata and [Snappy](https://github.com/golang/snappy)-compressed data. + + Stored Fields Data + |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| + | MDS | CDS | MD | CD | + |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| + + MDS. Metadata size. + CDS. Compressed data size. + MD. Metadata. + CD. Snappy-compressed data. + +## Fields + +Fields Index section located between addresses `F` and `len(file) - len(footer)` and consist of `uint64` values (`F1`, `F2`, ...) which are offsets to records in Fields section. We have `F# = (len(file) - len(footer) - F) / sizeof(uint64)` fields. + + + (...) [F] [F + F#] + | Fields | Fields Index. | + |================================|================================| + | | | + | |~~~~~~~~|~~~~~~~~|---...---|||--------|--------|...|--------|| + ||->| Dict | Length | Name ||| 0 | 1 | | F# - 1 || + || |~~~~~~~~|~~~~~~~~|---...---|||--------|----|---|...|--------|| + || | | | + ||===============================|==============|=================| + | | + |----------------------------------------------| + + +## Dictionaries + Postings + +Each of fields has its own dictionary, encoded in [Vellum](https://github.com/couchbase/vellum) format. Dictionary consists of pairs `(term, offset)`, where `offset` indicates the position of postings (list of documents) for this particular term. + + |================================================================|- Dictionaries + + | | Postings + + | | DocValues + | Freq/Norm (chunked) | + | [~~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] | + | |->[ Freq | Norm (float32 under varint) ] | + | | [~~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] | + | | | + | |------------------------------------------------------------| | + | Location Details (chunked) | | + | [~~~~~~|~~~~~|~~~~~~~|~~~~~|~~~~~~|~~~~~~~~|~~~~~] | | + | |->[ Size | Pos | Start | End | Arr# | ArrPos | ... ] | | + | | [~~~~~~|~~~~~|~~~~~~~|~~~~~|~~~~~~|~~~~~~~~|~~~~~] | | + | | | | + | |----------------------| | | + | Postings List | | | + | |~~~~~~~~|~~~~~|~~|~~~~~~~~|-----------...--| | | + | |->| F/N | LD | Length | ROARING BITMAP | | | + | | |~~~~~|~~|~~~~~~~~|~~~~~~~~|-----------...--| | | + | | |----------------------------------------------| | + | |--------------------------------------| | + | Dictionary | | + | |~~~~~~~~|--------------------------|-...-| | + | |->| Length | VELLUM DATA : (TERM -> OFFSET) | | + | | |~~~~~~~~|----------------------------...-| | + | | | + |======|=========================================================|- DocValues Index + | | | + |======|=========================================================|- Fields + | | | + | |~~~~|~~~|~~~~~~~~|---...---| | + | | Dict | Length | Name | | + | |~~~~~~~~|~~~~~~~~|---...---| | + | | + |================================================================| + +## DocValues + +DocValues Index is `F#` pairs of varints, one pair per field. Each pair of varints indicates start and end point of DocValues slice. + + |================================================================| + | |------...--| | + | |->| DocValues |<-| | + | | |------...--| | | + |==|=================|===========================================|- DocValues Index + ||~|~~~~~~~~~|~~~~~~~|~~| |~~~~~~~~~~~~~~|~~~~~~~~~~~~|| + || DV1 START | DV1 STOP | . . . . . | DV(F#) START | DV(F#) END || + ||~~~~~~~~~~~|~~~~~~~~~~| |~~~~~~~~~~~~~~|~~~~~~~~~~~~|| + |================================================================| + +DocValues is chunked Snappy-compressed values for each document and field. + + [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-] + [ Doc# in Chunk | Doc1 | Offset1 | ... | DocN | OffsetN | SNAPPY COMPRESSED DATA ] + [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-] + +Last 16 bytes are description of chunks. + + |~~~~~~~~~~~~...~|----------------|----------------| + | Chunk Sizes | Chunk Size Arr | Chunk# | + |~~~~~~~~~~~~...~|----------------|----------------| diff --git a/vendor/github.com/blevesearch/zap/v14/.gitignore b/vendor/github.com/blevesearch/zap/v14/.gitignore new file mode 100644 index 000000000..46d1cfad5 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/.gitignore @@ -0,0 +1,12 @@ +#* +*.sublime-* +*~ +.#* +.project +.settings +**/.idea/ +**/*.iml +.DS_Store +/cmd/zap/zap +*.test +tags diff --git a/vendor/github.com/blevesearch/zap/v14/LICENSE b/vendor/github.com/blevesearch/zap/v14/LICENSE new file mode 100644 index 000000000..7a4a3ea24 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/vendor/github.com/blevesearch/zap/v14/README.md b/vendor/github.com/blevesearch/zap/v14/README.md new file mode 100644 index 000000000..0facb669f --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/README.md @@ -0,0 +1,158 @@ +# zap file format + +Advanced ZAP File Format Documentation is [here](zap.md). + +The file is written in the reverse order that we typically access data. This helps us write in one pass since later sections of the file require file offsets of things we've already written. + +Current usage: + +- mmap the entire file +- crc-32 bytes and version are in fixed position at end of the file +- reading remainder of footer could be version specific +- remainder of footer gives us: + - 3 important offsets (docValue , fields index and stored data index) + - 2 important values (number of docs and chunk factor) +- field data is processed once and memoized onto the heap so that we never have to go back to disk for it +- access to stored data by doc number means first navigating to the stored data index, then accessing a fixed position offset into that slice, which gives us the actual address of the data. the first bytes of that section tell us the size of data so that we know where it ends. +- access to all other indexed data follows the following pattern: + - first know the field name -> convert to id + - next navigate to term dictionary for that field + - some operations stop here and do dictionary ops + - next use dictionary to navigate to posting list for a specific term + - walk posting list + - if necessary, walk posting details as we go + - if location info is desired, consult location bitmap to see if it is there + +## stored fields section + +- for each document + - preparation phase: + - produce a slice of metadata bytes and data bytes + - produce these slices in field id order + - field value is appended to the data slice + - metadata slice is varint encoded with the following values for each field value + - field id (uint16) + - field type (byte) + - field value start offset in uncompressed data slice (uint64) + - field value length (uint64) + - field number of array positions (uint64) + - one additional value for each array position (uint64) + - compress the data slice using snappy + - file writing phase: + - remember the start offset for this document + - write out meta data length (varint uint64) + - write out compressed data length (varint uint64) + - write out the metadata bytes + - write out the compressed data bytes + +## stored fields idx + +- for each document + - write start offset (remembered from previous section) of stored data (big endian uint64) + +With this index and a known document number, we have direct access to all the stored field data. + +## posting details (freq/norm) section + +- for each posting list + - produce a slice containing multiple consecutive chunks (each chunk is varint stream) + - produce a slice remembering offsets of where each chunk starts + - preparation phase: + - for each hit in the posting list + - if this hit is in next chunk close out encoding of last chunk and record offset start of next + - encode term frequency (uint64) + - encode norm factor (float32) + - file writing phase: + - remember start position for this posting list details + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. + +## posting details (location) section + +- for each posting list + - produce a slice containing multiple consecutive chunks (each chunk is varint stream) + - produce a slice remembering offsets of where each chunk starts + - preparation phase: + - for each hit in the posting list + - if this hit is in next chunk close out encoding of last chunk and record offset start of next + - encode field (uint16) + - encode field pos (uint64) + - encode field start (uint64) + - encode field end (uint64) + - encode number of array positions to follow (uint64) + - encode each array position (each uint64) + - file writing phase: + - remember start position for this posting list details + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. + +## postings list section + +- for each posting list + - preparation phase: + - encode roaring bitmap posting list to bytes (so we know the length) + - file writing phase: + - remember the start position for this posting list + - write freq/norm details offset (remembered from previous, as varint uint64) + - write location details offset (remembered from previous, as varint uint64) + - write length of encoded roaring bitmap + - write the serialized roaring bitmap data + +## dictionary + +- for each field + - preparation phase: + - encode vellum FST with dictionary data pointing to file offset of posting list (remembered from previous) + - file writing phase: + - remember the start position of this persistDictionary + - write length of vellum data (varint uint64) + - write out vellum data + +## fields section + +- for each field + - file writing phase: + - remember start offset for each field + - write dictionary address (remembered from previous) (varint uint64) + - write length of field name (varint uint64) + - write field name bytes + +## fields idx + +- for each field + - file writing phase: + - write big endian uint64 of start offset for each field + +NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size. + +## fields DocValue + +- for each field + - preparation phase: + - produce a slice containing multiple consecutive chunks, where each chunk is composed of a meta section followed by compressed columnar field data + - produce a slice remembering the length of each chunk + - file writing phase: + - remember the start position of this first field DocValue offset in the footer + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +NOTE: currently the meta header inside each chunk gives clue to the location offsets and size of the data pertaining to a given docID and any +read operation leverage that meta information to extract the document specific data from the file. + +## footer + +- file writing phase + - write number of docs (big endian uint64) + - write stored field index location (big endian uint64) + - write field index location (big endian uint64) + - write field docValue location (big endian uint64) + - write out chunk factor (big endian uint32) + - write out version (big endian uint32) + - write out file CRC of everything preceding this (big endian uint32) diff --git a/vendor/github.com/blevesearch/zap/v14/build.go b/vendor/github.com/blevesearch/zap/v14/build.go new file mode 100644 index 000000000..7a8dce0dc --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/build.go @@ -0,0 +1,156 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bufio" + "math" + "os" + + "github.com/couchbase/vellum" +) + +const Version uint32 = 14 + +const Type string = "zap" + +const fieldNotUninverted = math.MaxUint64 + +func (sb *SegmentBase) Persist(path string) error { + return PersistSegmentBase(sb, path) +} + +// PersistSegmentBase persists SegmentBase in the zap file format. +func PersistSegmentBase(sb *SegmentBase, path string) error { + flag := os.O_RDWR | os.O_CREATE + + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return err + } + + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) + } + + br := bufio.NewWriter(f) + + _, err = br.Write(sb.mem) + if err != nil { + cleanup() + return err + } + + err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.docValueOffset, + sb.chunkMode, sb.memCRC, br) + if err != nil { + cleanup() + return err + } + + err = br.Flush() + if err != nil { + cleanup() + return err + } + + err = f.Sync() + if err != nil { + cleanup() + return err + } + + err = f.Close() + if err != nil { + cleanup() + return err + } + + return nil +} + +func persistStoredFieldValues(fieldID int, + storedFieldValues [][]byte, stf []byte, spf [][]uint64, + curr int, metaEncode varintEncoder, data []byte) ( + int, []byte, error) { + for i := 0; i < len(storedFieldValues); i++ { + // encode field + _, err := metaEncode(uint64(fieldID)) + if err != nil { + return 0, nil, err + } + // encode type + _, err = metaEncode(uint64(stf[i])) + if err != nil { + return 0, nil, err + } + // encode start offset + _, err = metaEncode(uint64(curr)) + if err != nil { + return 0, nil, err + } + // end len + _, err = metaEncode(uint64(len(storedFieldValues[i]))) + if err != nil { + return 0, nil, err + } + // encode number of array pos + _, err = metaEncode(uint64(len(spf[i]))) + if err != nil { + return 0, nil, err + } + // encode all array positions + for _, pos := range spf[i] { + _, err = metaEncode(pos) + if err != nil { + return 0, nil, err + } + } + + data = append(data, storedFieldValues[i]...) + curr += len(storedFieldValues[i]) + } + + return curr, data, nil +} + +func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, + fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, + storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, + dictLocs []uint64) (*SegmentBase, error) { + sb := &SegmentBase{ + mem: mem, + memCRC: memCRC, + chunkMode: chunkMode, + fieldsMap: fieldsMap, + fieldsInv: fieldsInv, + numDocs: numDocs, + storedIndexOffset: storedIndexOffset, + fieldsIndexOffset: fieldsIndexOffset, + docValueOffset: docValueOffset, + dictLocs: dictLocs, + fieldDvReaders: make(map[uint16]*docValueReader), + fieldFSTs: make(map[uint16]*vellum.FST), + } + sb.updateSize() + + err := sb.loadDvReaders() + if err != nil { + return nil, err + } + + return sb, nil +} diff --git a/vendor/github.com/blevesearch/zap/v14/chunk.go b/vendor/github.com/blevesearch/zap/v14/chunk.go new file mode 100644 index 000000000..4307d0ed2 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/chunk.go @@ -0,0 +1,67 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "fmt" +) + +// LegacyChunkMode was the original chunk mode (always chunk size 1024) +// this mode is still used for chunking doc values. +var LegacyChunkMode uint32 = 1024 + +// DefaultChunkMode is the most recent improvement to chunking and should +// be used by default. +var DefaultChunkMode uint32 = 1026 + +func getChunkSize(chunkMode uint32, cardinality uint64, maxDocs uint64) (uint64, error) { + switch { + // any chunkMode <= 1024 will always chunk with chunkSize=chunkMode + case chunkMode <= 1024: + // legacy chunk size + return uint64(chunkMode), nil + + case chunkMode == 1025: + // attempt at simple improvement + // theory - the point of chunking is to put a bound on the maximum number of + // calls to Next() needed to find a random document. ie, you should be able + // to do one jump to the correct chunk, and then walk through at most + // chunk-size items + // previously 1024 was chosen as the chunk size, but this is particularly + // wasteful for low cardinality terms. the observation is that if there + // are less than 1024 items, why not put them all in one chunk, + // this way you'll still achieve the same goal of visiting at most + // chunk-size items. + // no attempt is made to tweak any other case + if cardinality <= 1024 { + return maxDocs, nil + } + return 1024, nil + + case chunkMode == 1026: + // improve upon the ideas tested in chunkMode 1025 + // the observation that the fewest number of dense chunks is the most + // desirable layout, given the built-in assumptions of chunking + // (that we want to put an upper-bound on the number of items you must + // walk over without skipping, currently tuned to 1024) + // + // 1. compute the number of chunks needed (max 1024/chunk) + // 2. convert to chunkSize, dividing into maxDocs + numChunks := (cardinality / 1024) + 1 + chunkSize := maxDocs / numChunks + return chunkSize, nil + } + return 0, fmt.Errorf("unknown chunk mode %d", chunkMode) +} diff --git a/vendor/github.com/blevesearch/zap/v14/contentcoder.go b/vendor/github.com/blevesearch/zap/v14/contentcoder.go new file mode 100644 index 000000000..c145b5a11 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/contentcoder.go @@ -0,0 +1,243 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "io" + "reflect" + + "github.com/golang/snappy" +) + +var reflectStaticSizeMetaData int + +func init() { + var md MetaData + reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size()) +} + +var termSeparator byte = 0xff +var termSeparatorSplitSlice = []byte{termSeparator} + +type chunkedContentCoder struct { + final []byte + chunkSize uint64 + currChunk uint64 + chunkLens []uint64 + + w io.Writer + progressiveWrite bool + + chunkMetaBuf bytes.Buffer + chunkBuf bytes.Buffer + + chunkMeta []MetaData + + compressed []byte // temp buf for snappy compression +} + +// MetaData represents the data information inside a +// chunk. +type MetaData struct { + DocNum uint64 // docNum of the data inside the chunk + DocDvOffset uint64 // offset of data inside the chunk for the given docid +} + +// newChunkedContentCoder returns a new chunk content coder which +// packs data into chunks based on the provided chunkSize +func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64, + w io.Writer, progressiveWrite bool) *chunkedContentCoder { + total := maxDocNum/chunkSize + 1 + rv := &chunkedContentCoder{ + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + chunkMeta: make([]MetaData, 0, total), + w: w, + progressiveWrite: progressiveWrite, + } + + return rv +} + +// Reset lets you reuse this chunked content coder. Buffers are reset +// and re used. You cannot change the chunk size. +func (c *chunkedContentCoder) Reset() { + c.currChunk = 0 + c.final = c.final[:0] + c.chunkBuf.Reset() + c.chunkMetaBuf.Reset() + for i := range c.chunkLens { + c.chunkLens[i] = 0 + } + c.chunkMeta = c.chunkMeta[:0] +} + +func (c *chunkedContentCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) { + total := int(maxDocNum/chunkSize + 1) + c.chunkSize = chunkSize + if cap(c.chunkLens) < total { + c.chunkLens = make([]uint64, total) + } else { + c.chunkLens = c.chunkLens[:total] + } + if cap(c.chunkMeta) < total { + c.chunkMeta = make([]MetaData, 0, total) + } +} + +// Close indicates you are done calling Add() this allows +// the final chunk to be encoded. +func (c *chunkedContentCoder) Close() error { + return c.flushContents() +} + +func (c *chunkedContentCoder) flushContents() error { + // flush the contents, with meta information at first + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(buf, uint64(len(c.chunkMeta))) + _, err := c.chunkMetaBuf.Write(buf[:n]) + if err != nil { + return err + } + + // write out the metaData slice + for _, meta := range c.chunkMeta { + _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset) + if err != nil { + return err + } + } + + // write the metadata to final data + metaData := c.chunkMetaBuf.Bytes() + c.final = append(c.final, c.chunkMetaBuf.Bytes()...) + // write the compressed data to the final data + c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes()) + c.final = append(c.final, c.compressed...) + + c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData)) + + if c.progressiveWrite { + _, err := c.w.Write(c.final) + if err != nil { + return err + } + c.final = c.final[:0] + } + + return nil +} + +// Add encodes the provided byte slice into the correct chunk for the provided +// doc num. You MUST call Add() with increasing docNums. +func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // flush out the previous chunk details + err := c.flushContents() + if err != nil { + return err + } + // clearing the chunk specific meta for next chunk + c.chunkBuf.Reset() + c.chunkMetaBuf.Reset() + c.chunkMeta = c.chunkMeta[:0] + c.currChunk = chunk + } + + // get the starting offset for this doc + dvOffset := c.chunkBuf.Len() + dvSize, err := c.chunkBuf.Write(vals) + if err != nil { + return err + } + + c.chunkMeta = append(c.chunkMeta, MetaData{ + DocNum: docNum, + DocDvOffset: uint64(dvOffset + dvSize), + }) + return nil +} + +// Write commits all the encoded chunked contents to the provided writer. +// +// | ..... data ..... | chunk offsets (varints) +// | position of chunk offsets (uint64) | number of offsets (uint64) | +// +func (c *chunkedContentCoder) Write() (int, error) { + var tw int + + if c.final != nil { + // write out the data section first + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + } + + chunkOffsetsStart := uint64(tw) + + if cap(c.final) < binary.MaxVarintLen64 { + c.final = make([]byte, binary.MaxVarintLen64) + } else { + c.final = c.final[0:binary.MaxVarintLen64] + } + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + // write out the chunk offsets + for _, chunkOffset := range chunkOffsets { + n := binary.PutUvarint(c.final, chunkOffset) + nw, err := c.w.Write(c.final[:n]) + tw += nw + if err != nil { + return tw, err + } + } + + chunkOffsetsLen := uint64(tw) - chunkOffsetsStart + + c.final = c.final[0:8] + // write out the length of chunk offsets + binary.BigEndian.PutUint64(c.final, chunkOffsetsLen) + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + + // write out the number of chunks + binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens))) + nw, err = c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + + c.final = c.final[:0] + + return tw, nil +} + +// ReadDocValueBoundary elicits the start, end offsets from a +// metaData header slice +func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) { + var start uint64 + if chunk > 0 { + start = metaHeaders[chunk-1].DocDvOffset + } + return start, metaHeaders[chunk].DocDvOffset +} diff --git a/vendor/github.com/blevesearch/zap/v14/count.go b/vendor/github.com/blevesearch/zap/v14/count.go new file mode 100644 index 000000000..50290f888 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/count.go @@ -0,0 +1,61 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "hash/crc32" + "io" + + "github.com/blevesearch/bleve/index/scorch/segment" +) + +// CountHashWriter is a wrapper around a Writer which counts the number of +// bytes which have been written and computes a crc32 hash +type CountHashWriter struct { + w io.Writer + crc uint32 + n int + s segment.StatsReporter +} + +// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer +func NewCountHashWriter(w io.Writer) *CountHashWriter { + return &CountHashWriter{w: w} +} + +func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter { + return &CountHashWriter{w: w, s: s} +} + +// Write writes the provided bytes to the wrapped writer and counts the bytes +func (c *CountHashWriter) Write(b []byte) (int, error) { + n, err := c.w.Write(b) + c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) + c.n += n + if c.s != nil { + c.s.ReportBytesWritten(uint64(n)) + } + return n, err +} + +// Count returns the number of bytes written +func (c *CountHashWriter) Count() int { + return c.n +} + +// Sum32 returns the CRC-32 hash of the content written to this writer +func (c *CountHashWriter) Sum32() uint32 { + return c.crc +} diff --git a/vendor/github.com/blevesearch/zap/v14/dict.go b/vendor/github.com/blevesearch/zap/v14/dict.go new file mode 100644 index 000000000..ad4a8f8dc --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/dict.go @@ -0,0 +1,263 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "fmt" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/couchbase/vellum" +) + +// Dictionary is the zap representation of the term dictionary +type Dictionary struct { + sb *SegmentBase + field string + fieldID uint16 + fst *vellum.FST + fstReader *vellum.Reader +} + +// PostingsList returns the postings list for the specified term +func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, + prealloc segment.PostingsList) (segment.PostingsList, error) { + var preallocPL *PostingsList + pl, ok := prealloc.(*PostingsList) + if ok && pl != nil { + preallocPL = pl + } + return d.postingsList(term, except, preallocPL) +} + +func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { + if d.fstReader == nil { + if rv == nil || rv == emptyPostingsList { + return emptyPostingsList, nil + } + return d.postingsListInit(rv, except), nil + } + + postingsOffset, exists, err := d.fstReader.Get(term) + if err != nil { + return nil, fmt.Errorf("vellum err: %v", err) + } + if !exists { + if rv == nil || rv == emptyPostingsList { + return emptyPostingsList, nil + } + return d.postingsListInit(rv, except), nil + } + + return d.postingsListFromOffset(postingsOffset, except, rv) +} + +func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { + rv = d.postingsListInit(rv, except) + + err := rv.read(postingsOffset, d) + if err != nil { + return nil, err + } + + return rv, nil +} + +func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { + if rv == nil || rv == emptyPostingsList { + rv = &PostingsList{} + } else { + postings := rv.postings + if postings != nil { + postings.Clear() + } + + *rv = PostingsList{} // clear the struct + + rv.postings = postings + } + rv.sb = d.sb + rv.except = except + return rv +} + +func (d *Dictionary) Contains(key []byte) (bool, error) { + return d.fst.Contains(key) +} + +// Iterator returns an iterator for this dictionary +func (d *Dictionary) Iterator() segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + itr, err := d.fst.Iterator(nil, nil) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + +// PrefixIterator returns an iterator which only visits terms having the +// the specified prefix +func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + kBeg := []byte(prefix) + kEnd := segment.IncrementBytes(kBeg) + + if d.fst != nil { + itr, err := d.fst.Iterator(kBeg, kEnd) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + +// RangeIterator returns an iterator which only visits terms between the +// start and end terms. NOTE: bleve.index API specifies the end is inclusive. +func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + // need to increment the end position to be inclusive + var endBytes []byte + if len(end) > 0 { + endBytes = []byte(end) + if endBytes[len(endBytes)-1] < 0xff { + endBytes[len(endBytes)-1]++ + } else { + endBytes = append(endBytes, 0xff) + } + } + + if d.fst != nil { + itr, err := d.fst.Iterator([]byte(start), endBytes) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + +// AutomatonIterator returns an iterator which only visits terms +// having the the vellum automaton and start/end key range +func (d *Dictionary) AutomatonIterator(a vellum.Automaton, + startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + +func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, + includeCount bool) segment.DictionaryIterator { + + rv := &DictionaryIterator{ + d: d, + omitCount: !includeCount, + } + + var buf bytes.Buffer + builder, err := vellum.New(&buf, nil) + if err != nil { + rv.err = err + return rv + } + for _, term := range onlyTerms { + err = builder.Insert(term, 0) + if err != nil { + rv.err = err + return rv + } + } + err = builder.Close() + if err != nil { + rv.err = err + return rv + } + + onlyFST, err := vellum.Load(buf.Bytes()) + if err != nil { + rv.err = err + return rv + } + + itr, err := d.fst.Search(onlyFST, nil, nil) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + + return rv +} + +// DictionaryIterator is an iterator for term dictionary +type DictionaryIterator struct { + d *Dictionary + itr vellum.Iterator + err error + tmp PostingsList + entry index.DictEntry + omitCount bool +} + +// Next returns the next entry in the dictionary +func (i *DictionaryIterator) Next() (*index.DictEntry, error) { + if i.err != nil && i.err != vellum.ErrIteratorDone { + return nil, i.err + } else if i.itr == nil || i.err == vellum.ErrIteratorDone { + return nil, nil + } + term, postingsOffset := i.itr.Current() + i.entry.Term = string(term) + if !i.omitCount { + i.err = i.tmp.read(postingsOffset, i.d) + if i.err != nil { + return nil, i.err + } + i.entry.Count = i.tmp.Count() + } + i.err = i.itr.Next() + return &i.entry, nil +} diff --git a/vendor/github.com/blevesearch/zap/v14/docvalues.go b/vendor/github.com/blevesearch/zap/v14/docvalues.go new file mode 100644 index 000000000..793797bd8 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/docvalues.go @@ -0,0 +1,312 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "reflect" + "sort" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" + "github.com/golang/snappy" +) + +var reflectStaticSizedocValueReader int + +func init() { + var dvi docValueReader + reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) +} + +type docNumTermsVisitor func(docNum uint64, terms []byte) error + +type docVisitState struct { + dvrs map[uint16]*docValueReader + segment *SegmentBase +} + +type docValueReader struct { + field string + curChunkNum uint64 + chunkOffsets []uint64 + dvDataLoc uint64 + curChunkHeader []MetaData + curChunkData []byte // compressed data cache + uncompressed []byte // temp buf for snappy decompression +} + +func (di *docValueReader) size() int { + return reflectStaticSizedocValueReader + size.SizeOfPtr + + len(di.field) + + len(di.chunkOffsets)*size.SizeOfUint64 + + len(di.curChunkHeader)*reflectStaticSizeMetaData + + len(di.curChunkData) +} + +func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader { + if rv == nil { + rv = &docValueReader{} + } + + rv.field = di.field + rv.curChunkNum = math.MaxUint64 + rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable + rv.dvDataLoc = di.dvDataLoc + rv.curChunkHeader = rv.curChunkHeader[:0] + rv.curChunkData = nil + rv.uncompressed = rv.uncompressed[:0] + + return rv +} + +func (di *docValueReader) curChunkNumber() uint64 { + return di.curChunkNum +} + +func (s *SegmentBase) loadFieldDocValueReader(field string, + fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) { + // get the docValue offset for the given fields + if fieldDvLocStart == fieldNotUninverted { + // no docValues found, nothing to do + return nil, nil + } + + // read the number of chunks, and chunk offsets position + var numChunks, chunkOffsetsPosition uint64 + + if fieldDvLocEnd-fieldDvLocStart > 16 { + numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd]) + // read the length of chunk offsets + chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) + // acquire position of chunk offsets + chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen + } else { + return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart) + } + + fdvIter := &docValueReader{ + curChunkNum: math.MaxUint64, + field: field, + chunkOffsets: make([]uint64, int(numChunks)), + } + + // read the chunk offsets + var offset uint64 + for i := 0; i < int(numChunks); i++ { + loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64]) + if read <= 0 { + return nil, fmt.Errorf("corrupted chunk offset during segment load") + } + fdvIter.chunkOffsets[i] = loc + offset += uint64(read) + } + + // set the data offset + fdvIter.dvDataLoc = fieldDvLocStart + + return fdvIter, nil +} + +func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error { + // advance to the chunk where the docValues + // reside for the given docNum + destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc + start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) + if start >= end { + di.curChunkHeader = di.curChunkHeader[:0] + di.curChunkData = nil + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil + } + + destChunkDataLoc += start + curChunkEnd += end + + // read the number of docs reside in the chunk + numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) + if read <= 0 { + return fmt.Errorf("failed to read the chunk") + } + chunkMetaLoc := destChunkDataLoc + uint64(read) + + offset := uint64(0) + if cap(di.curChunkHeader) < int(numDocs) { + di.curChunkHeader = make([]MetaData, int(numDocs)) + } else { + di.curChunkHeader = di.curChunkHeader[:int(numDocs)] + } + for i := 0; i < int(numDocs); i++ { + di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(read) + di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(read) + } + + compressedDataLoc := chunkMetaLoc + offset + dataLength := curChunkEnd - compressedDataLoc + di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil +} + +func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { + for i := 0; i < len(di.chunkOffsets); i++ { + err := di.loadDvChunk(uint64(i), s) + if err != nil { + return err + } + if di.curChunkData == nil || len(di.curChunkHeader) == 0 { + continue + } + + // uncompress the already loaded data + uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed + + start := uint64(0) + for _, entry := range di.curChunkHeader { + err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) + if err != nil { + return err + } + + start = entry.DocDvOffset + } + } + + return nil +} + +func (di *docValueReader) visitDocValues(docNum uint64, + visitor index.DocumentFieldTermVisitor) error { + // binary search the term locations for the docNum + start, end := di.getDocValueLocs(docNum) + if start == math.MaxUint64 || end == math.MaxUint64 || start == end { + return nil + } + + var uncompressed []byte + var err error + // use the uncompressed copy if available + if len(di.uncompressed) > 0 { + uncompressed = di.uncompressed + } else { + // uncompress the already loaded data + uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed + } + + // pick the terms for the given docNum + uncompressed = uncompressed[start:end] + for { + i := bytes.Index(uncompressed, termSeparatorSplitSlice) + if i < 0 { + break + } + + visitor(di.field, uncompressed[0:i]) + uncompressed = uncompressed[i+1:] + } + + return nil +} + +func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { + i := sort.Search(len(di.curChunkHeader), func(i int) bool { + return di.curChunkHeader[i].DocNum >= docNum + }) + if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { + return ReadDocValueBoundary(i, di.curChunkHeader) + } + return math.MaxUint64, math.MaxUint64 +} + +// VisitDocumentFieldTerms is an implementation of the +// DocumentFieldTermVisitable interface +func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, + visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) ( + segment.DocVisitState, error) { + dvs, ok := dvsIn.(*docVisitState) + if !ok || dvs == nil { + dvs = &docVisitState{} + } else { + if dvs.segment != s { + dvs.segment = s + dvs.dvrs = nil + } + } + + var fieldIDPlus1 uint16 + if dvs.dvrs == nil { + dvs.dvrs = make(map[uint16]*docValueReader, len(fields)) + for _, field := range fields { + if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { + continue + } + fieldID := fieldIDPlus1 - 1 + if dvIter, exists := s.fieldDvReaders[fieldID]; exists && + dvIter != nil { + dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID]) + } + } + } + + // find the chunkNumber where the docValues are stored + // NOTE: doc values continue to use legacy chunk mode + chunkFactor, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return nil, err + } + docInChunk := localDocNum / chunkFactor + var dvr *docValueReader + for _, field := range fields { + if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { + continue + } + fieldID := fieldIDPlus1 - 1 + if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil { + // check if the chunk is already loaded + if docInChunk != dvr.curChunkNumber() { + err := dvr.loadDvChunk(docInChunk, s) + if err != nil { + return dvs, err + } + } + + _ = dvr.visitDocValues(localDocNum, visitor) + } + } + return dvs, nil +} + +// VisitableDocValueFields returns the list of fields with +// persisted doc value terms ready to be visitable using the +// VisitDocumentFieldTerms method. +func (s *SegmentBase) VisitableDocValueFields() ([]string, error) { + return s.fieldDvNames, nil +} diff --git a/vendor/github.com/blevesearch/zap/v14/enumerator.go b/vendor/github.com/blevesearch/zap/v14/enumerator.go new file mode 100644 index 000000000..bc5b7e627 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/enumerator.go @@ -0,0 +1,138 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + + "github.com/couchbase/vellum" +) + +// enumerator provides an ordered traversal of multiple vellum +// iterators. Like JOIN of iterators, the enumerator produces a +// sequence of (key, iteratorIndex, value) tuples, sorted by key ASC, +// then iteratorIndex ASC, where the same key might be seen or +// repeated across multiple child iterators. +type enumerator struct { + itrs []vellum.Iterator + currKs [][]byte + currVs []uint64 + + lowK []byte + lowIdxs []int + lowCurr int +} + +// newEnumerator returns a new enumerator over the vellum Iterators +func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) { + rv := &enumerator{ + itrs: itrs, + currKs: make([][]byte, len(itrs)), + currVs: make([]uint64, len(itrs)), + lowIdxs: make([]int, 0, len(itrs)), + } + for i, itr := range rv.itrs { + rv.currKs[i], rv.currVs[i] = itr.Current() + } + rv.updateMatches(false) + if rv.lowK == nil && len(rv.lowIdxs) == 0 { + return rv, vellum.ErrIteratorDone + } + return rv, nil +} + +// updateMatches maintains the low key matches based on the currKs +func (m *enumerator) updateMatches(skipEmptyKey bool) { + m.lowK = nil + m.lowIdxs = m.lowIdxs[:0] + m.lowCurr = 0 + + for i, key := range m.currKs { + if (key == nil && m.currVs[i] == 0) || // in case of empty iterator + (len(key) == 0 && skipEmptyKey) { // skip empty keys + continue + } + + cmp := bytes.Compare(key, m.lowK) + if cmp < 0 || len(m.lowIdxs) == 0 { + // reached a new low + m.lowK = key + m.lowIdxs = m.lowIdxs[:0] + m.lowIdxs = append(m.lowIdxs, i) + } else if cmp == 0 { + m.lowIdxs = append(m.lowIdxs, i) + } + } +} + +// Current returns the enumerator's current key, iterator-index, and +// value. If the enumerator is not pointing at a valid value (because +// Next returned an error previously), Current will return nil,0,0. +func (m *enumerator) Current() ([]byte, int, uint64) { + var i int + var v uint64 + if m.lowCurr < len(m.lowIdxs) { + i = m.lowIdxs[m.lowCurr] + v = m.currVs[i] + } + return m.lowK, i, v +} + +// GetLowIdxsAndValues will return all of the iterator indices +// which point to the current key, and their corresponding +// values. This can be used by advanced caller which may need +// to peek into these other sets of data before processing. +func (m *enumerator) GetLowIdxsAndValues() ([]int, []uint64) { + values := make([]uint64, 0, len(m.lowIdxs)) + for _, idx := range m.lowIdxs { + values = append(values, m.currVs[idx]) + } + return m.lowIdxs, values +} + +// Next advances the enumerator to the next key/iterator/value result, +// else vellum.ErrIteratorDone is returned. +func (m *enumerator) Next() error { + m.lowCurr += 1 + if m.lowCurr >= len(m.lowIdxs) { + // move all the current low iterators forwards + for _, vi := range m.lowIdxs { + err := m.itrs[vi].Next() + if err != nil && err != vellum.ErrIteratorDone { + return err + } + m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() + } + // can skip any empty keys encountered at this point + m.updateMatches(true) + } + if m.lowK == nil && len(m.lowIdxs) == 0 { + return vellum.ErrIteratorDone + } + return nil +} + +// Close all the underlying Iterators. The first error, if any, will +// be returned. +func (m *enumerator) Close() error { + var rv error + for _, itr := range m.itrs { + err := itr.Close() + if rv == nil { + rv = err + } + } + return rv +} diff --git a/vendor/github.com/blevesearch/zap/v14/go.mod b/vendor/github.com/blevesearch/zap/v14/go.mod new file mode 100644 index 000000000..fe25dbdc0 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/go.mod @@ -0,0 +1,12 @@ +module github.com/blevesearch/zap/v14 + +go 1.12 + +require ( + github.com/RoaringBitmap/roaring v0.4.23 + github.com/blevesearch/bleve v1.0.10 + github.com/blevesearch/mmap-go v1.0.2 + github.com/couchbase/vellum v1.0.2 + github.com/golang/snappy v0.0.1 + github.com/spf13/cobra v0.0.5 +) diff --git a/vendor/github.com/blevesearch/zap/v14/intDecoder.go b/vendor/github.com/blevesearch/zap/v14/intDecoder.go new file mode 100644 index 000000000..3baa0c2a6 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/intDecoder.go @@ -0,0 +1,118 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "fmt" + + "github.com/blevesearch/bleve/index/scorch/segment" +) + +type chunkedIntDecoder struct { + startOffset uint64 + dataStartOffset uint64 + chunkOffsets []uint64 + curChunkBytes []byte + data []byte + r *segment.MemUvarintReader +} + +// newChunkedIntDecoder expects an optional or reset chunkedIntDecoder for better reuse. +func newChunkedIntDecoder(buf []byte, offset uint64, rv *chunkedIntDecoder) *chunkedIntDecoder { + if rv == nil { + rv = &chunkedIntDecoder{startOffset: offset, data: buf} + } else { + rv.startOffset = offset + rv.data = buf + } + + var n, numChunks uint64 + var read int + if offset == termNotEncoded { + numChunks = 0 + } else { + numChunks, read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64]) + } + + n += uint64(read) + if cap(rv.chunkOffsets) >= int(numChunks) { + rv.chunkOffsets = rv.chunkOffsets[:int(numChunks)] + } else { + rv.chunkOffsets = make([]uint64, int(numChunks)) + } + for i := 0; i < int(numChunks); i++ { + rv.chunkOffsets[i], read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64]) + n += uint64(read) + } + rv.dataStartOffset = offset + n + return rv +} + +func (d *chunkedIntDecoder) loadChunk(chunk int) error { + if d.startOffset == termNotEncoded { + d.r = segment.NewMemUvarintReader([]byte(nil)) + return nil + } + + if chunk >= len(d.chunkOffsets) { + return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)", + chunk, len(d.chunkOffsets)) + } + + end, start := d.dataStartOffset, d.dataStartOffset + s, e := readChunkBoundary(chunk, d.chunkOffsets) + start += s + end += e + d.curChunkBytes = d.data[start:end] + if d.r == nil { + d.r = segment.NewMemUvarintReader(d.curChunkBytes) + } else { + d.r.Reset(d.curChunkBytes) + } + + return nil +} + +func (d *chunkedIntDecoder) reset() { + d.startOffset = 0 + d.dataStartOffset = 0 + d.chunkOffsets = d.chunkOffsets[:0] + d.curChunkBytes = d.curChunkBytes[:0] + d.data = d.data[:0] + if d.r != nil { + d.r.Reset([]byte(nil)) + } +} + +func (d *chunkedIntDecoder) isNil() bool { + return d.curChunkBytes == nil || len(d.curChunkBytes) == 0 +} + +func (d *chunkedIntDecoder) readUvarint() (uint64, error) { + return d.r.ReadUvarint() +} + +func (d *chunkedIntDecoder) SkipUvarint() { + d.r.SkipUvarint() +} + +func (d *chunkedIntDecoder) SkipBytes(count int) { + d.r.SkipBytes(count) +} + +func (d *chunkedIntDecoder) Len() int { + return d.r.Len() +} diff --git a/vendor/github.com/blevesearch/zap/v14/intcoder.go b/vendor/github.com/blevesearch/zap/v14/intcoder.go new file mode 100644 index 000000000..c3c488fb7 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/intcoder.go @@ -0,0 +1,206 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "io" +) + +// We can safely use 0 to represent termNotEncoded since 0 +// could never be a valid address for term location information. +// (stored field index is always non-empty and earlier in the +// file) +const termNotEncoded = 0 + +type chunkedIntCoder struct { + final []byte + chunkSize uint64 + chunkBuf bytes.Buffer + chunkLens []uint64 + currChunk uint64 + + buf []byte +} + +// newChunkedIntCoder returns a new chunk int coder which packs data into +// chunks based on the provided chunkSize and supports up to the specified +// maxDocNum +func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { + total := maxDocNum/chunkSize + 1 + rv := &chunkedIntCoder{ + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + final: make([]byte, 0, 64), + } + + return rv +} + +// Reset lets you reuse this chunked int coder. buffers are reset and reused +// from previous use. you cannot change the chunk size or max doc num. +func (c *chunkedIntCoder) Reset() { + c.final = c.final[:0] + c.chunkBuf.Reset() + c.currChunk = 0 + for i := range c.chunkLens { + c.chunkLens[i] = 0 + } +} + +// SetChunkSize changes the chunk size. It is only valid to do so +// with a new chunkedIntCoder, or immediately after calling Reset() +func (c *chunkedIntCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) { + total := int(maxDocNum/chunkSize + 1) + c.chunkSize = chunkSize + if cap(c.chunkLens) < total { + c.chunkLens = make([]uint64, total) + } else { + c.chunkLens = c.chunkLens[:total] + } +} + +// Add encodes the provided integers into the correct chunk for the provided +// doc num. You MUST call Add() with increasing docNums. +func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + if len(c.buf) < binary.MaxVarintLen64 { + c.buf = make([]byte, binary.MaxVarintLen64) + } + + for _, val := range vals { + wb := binary.PutUvarint(c.buf, val) + _, err := c.chunkBuf.Write(c.buf[:wb]) + if err != nil { + return err + } + } + + return nil +} + +func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + _, err := c.chunkBuf.Write(buf) + return err +} + +// Close indicates you are done calling Add() this allows the final chunk +// to be encoded. +func (c *chunkedIntCoder) Close() { + encodingBytes := c.chunkBuf.Bytes() + c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) + c.final = append(c.final, encodingBytes...) + c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close +} + +// Write commits all the encoded chunked integers to the provided writer. +func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { + bufNeeded := binary.MaxVarintLen64 * (1 + len(c.chunkLens)) + if len(c.buf) < bufNeeded { + c.buf = make([]byte, bufNeeded) + } + buf := c.buf + + // convert the chunk lengths into chunk offsets + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + + // write out the number of chunks & each chunk offsets + n := binary.PutUvarint(buf, uint64(len(chunkOffsets))) + for _, chunkOffset := range chunkOffsets { + n += binary.PutUvarint(buf[n:], chunkOffset) + } + + tw, err := w.Write(buf[:n]) + if err != nil { + return tw, err + } + + // write out the data + nw, err := w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + return tw, nil +} + +// writeAt commits all the encoded chunked integers to the provided writer +// and returns the starting offset, total bytes written and an error +func (c *chunkedIntCoder) writeAt(w io.Writer) (uint64, int, error) { + startOffset := uint64(termNotEncoded) + if len(c.final) <= 0 { + return startOffset, 0, nil + } + + if chw := w.(*CountHashWriter); chw != nil { + startOffset = uint64(chw.Count()) + } + + tw, err := c.Write(w) + return startOffset, tw, err +} + +func (c *chunkedIntCoder) FinalSize() int { + return len(c.final) +} + +// modifyLengthsToEndOffsets converts the chunk length array +// to a chunk offset array. The readChunkBoundary +// will figure out the start and end of every chunk from +// these offsets. Starting offset of i'th index is stored +// in i-1'th position except for 0'th index and ending offset +// is stored at i'th index position. +// For 0'th element, starting position is always zero. +// eg: +// Lens -> 5 5 5 5 => 5 10 15 20 +// Lens -> 0 5 0 5 => 0 5 5 10 +// Lens -> 0 0 0 5 => 0 0 0 5 +// Lens -> 5 0 0 0 => 5 5 5 5 +// Lens -> 0 5 0 0 => 0 5 5 5 +// Lens -> 0 0 5 0 => 0 0 5 5 +func modifyLengthsToEndOffsets(lengths []uint64) []uint64 { + var runningOffset uint64 + var index, i int + for i = 1; i <= len(lengths); i++ { + runningOffset += lengths[i-1] + lengths[index] = runningOffset + index++ + } + return lengths +} + +func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { + var start uint64 + if chunk > 0 { + start = offsets[chunk-1] + } + return start, offsets[chunk] +} diff --git a/vendor/github.com/blevesearch/zap/v14/merge.go b/vendor/github.com/blevesearch/zap/v14/merge.go new file mode 100644 index 000000000..805100fb5 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/merge.go @@ -0,0 +1,847 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bufio" + "bytes" + "encoding/binary" + "fmt" + "math" + "os" + "sort" + + "github.com/RoaringBitmap/roaring" + seg "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/couchbase/vellum" + "github.com/golang/snappy" +) + +var DefaultFileMergerBufferSize = 1024 * 1024 + +const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc + +// Merge takes a slice of segments and bit masks describing which +// documents may be dropped, and creates a new segment containing the +// remaining data. This new segment is built at the specified path. +func (*ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { + + segmentBases := make([]*SegmentBase, len(segments)) + for segmenti, segment := range segments { + switch segmentx := segment.(type) { + case *Segment: + segmentBases[segmenti] = &segmentx.SegmentBase + case *SegmentBase: + segmentBases[segmenti] = segmentx + default: + panic(fmt.Sprintf("oops, unexpected segment type: %T", segment)) + } + } + return mergeSegmentBases(segmentBases, drops, path, DefaultChunkMode, closeCh, s) +} + +func mergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, + chunkMode uint32, closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { + flag := os.O_RDWR | os.O_CREATE + + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return nil, 0, err + } + + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) + } + + // buffer the output + br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) + + // wrap it for counting (tracking offsets) + cr := NewCountHashWriterWithStatsReporter(br, s) + + newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err := + MergeToWriter(segmentBases, drops, chunkMode, cr, closeCh) + if err != nil { + cleanup() + return nil, 0, err + } + + err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, + docValueOffset, chunkMode, cr.Sum32(), cr) + if err != nil { + cleanup() + return nil, 0, err + } + + err = br.Flush() + if err != nil { + cleanup() + return nil, 0, err + } + + err = f.Sync() + if err != nil { + cleanup() + return nil, 0, err + } + + err = f.Close() + if err != nil { + cleanup() + return nil, 0, err + } + + return newDocNums, uint64(cr.Count()), nil +} + +func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, + chunkMode uint32, cr *CountHashWriter, closeCh chan struct{}) ( + newDocNums [][]uint64, + numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, + dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16, + err error) { + docValueOffset = uint64(fieldNotUninverted) + + var fieldsSame bool + fieldsSame, fieldsInv = mergeFields(segments) + fieldsMap = mapFields(fieldsInv) + + numDocs = computeNewDocCount(segments, drops) + + if isClosed(closeCh) { + return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed + } + + if numDocs > 0 { + storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, + fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh) + if err != nil { + return nil, 0, 0, 0, 0, nil, nil, nil, err + } + + dictLocs, docValueOffset, err = persistMergedRest(segments, drops, + fieldsInv, fieldsMap, fieldsSame, + newDocNums, numDocs, chunkMode, cr, closeCh) + if err != nil { + return nil, 0, 0, 0, 0, nil, nil, nil, err + } + } else { + dictLocs = make([]uint64, len(fieldsInv)) + } + + fieldsIndexOffset, err = persistFields(fieldsInv, cr, dictLocs) + if err != nil { + return nil, 0, 0, 0, 0, nil, nil, nil, err + } + + return newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, fieldsInv, fieldsMap, nil +} + +// mapFields takes the fieldsInv list and returns a map of fieldName +// to fieldID+1 +func mapFields(fields []string) map[string]uint16 { + rv := make(map[string]uint16, len(fields)) + for i, fieldName := range fields { + rv[fieldName] = uint16(i) + 1 + } + return rv +} + +// computeNewDocCount determines how many documents will be in the newly +// merged segment when obsoleted docs are dropped +func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 { + var newDocCount uint64 + for segI, segment := range segments { + newDocCount += segment.numDocs + if drops[segI] != nil { + newDocCount -= drops[segI].GetCardinality() + } + } + return newDocCount +} + +func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, + fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool, + newDocNumsIn [][]uint64, newSegDocCount uint64, chunkMode uint32, + w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) { + + var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) + var bufLoc []uint64 + + var postings *PostingsList + var postItr *PostingsIterator + + rv := make([]uint64, len(fieldsInv)) + fieldDvLocsStart := make([]uint64, len(fieldsInv)) + fieldDvLocsEnd := make([]uint64, len(fieldsInv)) + + // these int coders are initialized with chunk size 1024 + // however this will be reset to the correct chunk size + // while processing each individual field-term section + tfEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + locEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + + var vellumBuf bytes.Buffer + newVellum, err := vellum.New(&vellumBuf, nil) + if err != nil { + return nil, 0, err + } + + newRoaring := roaring.NewBitmap() + + // for each field + for fieldID, fieldName := range fieldsInv { + + // collect FST iterators from all active segments for this field + var newDocNums [][]uint64 + var drops []*roaring.Bitmap + var dicts []*Dictionary + var itrs []vellum.Iterator + + var segmentsInFocus []*SegmentBase + + for segmentI, segment := range segments { + + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + + dict, err2 := segment.dictionary(fieldName) + if err2 != nil { + return nil, 0, err2 + } + if dict != nil && dict.fst != nil { + itr, err2 := dict.fst.Iterator(nil, nil) + if err2 != nil && err2 != vellum.ErrIteratorDone { + return nil, 0, err2 + } + if itr != nil { + newDocNums = append(newDocNums, newDocNumsIn[segmentI]) + if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { + drops = append(drops, dropsIn[segmentI]) + } else { + drops = append(drops, nil) + } + dicts = append(dicts, dict) + itrs = append(itrs, itr) + segmentsInFocus = append(segmentsInFocus, segment) + } + } + } + + var prevTerm []byte + + newRoaring.Clear() + + var lastDocNum, lastFreq, lastNorm uint64 + + // determines whether to use "1-hit" encoding optimization + // when a term appears in only 1 doc, with no loc info, + // has freq of 1, and the docNum fits into 31-bits + use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) { + if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 { + docNum := uint64(newRoaring.Minimum()) + if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 { + return true, docNum, lastNorm + } + } + return false, 0, 0 + } + + finishTerm := func(term []byte) error { + tfEncoder.Close() + locEncoder.Close() + + postingsOffset, err := writePostings(newRoaring, + tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) + if err != nil { + return err + } + + if postingsOffset > 0 { + err = newVellum.Insert(term, postingsOffset) + if err != nil { + return err + } + } + + newRoaring.Clear() + + tfEncoder.Reset() + locEncoder.Reset() + + lastDocNum = 0 + lastFreq = 0 + lastNorm = 0 + + return nil + } + + enumerator, err := newEnumerator(itrs) + + for err == nil { + term, itrI, postingsOffset := enumerator.Current() + + if !bytes.Equal(prevTerm, term) { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + + // if the term changed, write out the info collected + // for the previous term + err = finishTerm(prevTerm) + if err != nil { + return nil, 0, err + } + } + if !bytes.Equal(prevTerm, term) || prevTerm == nil { + // compute cardinality of field-term in new seg + var newCard uint64 + lowItrIdxs, lowItrVals := enumerator.GetLowIdxsAndValues() + for i, idx := range lowItrIdxs { + pl, err := dicts[idx].postingsListFromOffset(lowItrVals[i], drops[idx], nil) + if err != nil { + return nil, 0, err + } + newCard += pl.Count() + } + // compute correct chunk size with this + chunkSize, err := getChunkSize(chunkMode, newCard, newSegDocCount) + if err != nil { + return nil, 0, err + } + // update encoders chunk + tfEncoder.SetChunkSize(chunkSize, newSegDocCount-1) + locEncoder.SetChunkSize(chunkSize, newSegDocCount-1) + } + + postings, err = dicts[itrI].postingsListFromOffset( + postingsOffset, drops[itrI], postings) + if err != nil { + return nil, 0, err + } + + postItr = postings.iterator(true, true, true, postItr) + + // can no longer optimize by copying, since chunk factor could have changed + lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( + fieldsMap, term, postItr, newDocNums[itrI], newRoaring, + tfEncoder, locEncoder, bufLoc) + + if err != nil { + return nil, 0, err + } + + prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem + prevTerm = append(prevTerm, term...) + + err = enumerator.Next() + } + if err != vellum.ErrIteratorDone { + return nil, 0, err + } + + err = finishTerm(prevTerm) + if err != nil { + return nil, 0, err + } + + dictOffset := uint64(w.Count()) + + err = newVellum.Close() + if err != nil { + return nil, 0, err + } + vellumData := vellumBuf.Bytes() + + // write out the length of the vellum data + n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData))) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, 0, err + } + + // write this vellum to disk + _, err = w.Write(vellumData) + if err != nil { + return nil, 0, err + } + + rv[fieldID] = dictOffset + + // get the field doc value offset (start) + fieldDvLocsStart[fieldID] = uint64(w.Count()) + + // update the field doc values + // NOTE: doc values continue to use legacy chunk mode + chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return nil, 0, err + } + fdvEncoder := newChunkedContentCoder(chunkSize, newSegDocCount-1, w, true) + + fdvReadersAvailable := false + var dvIterClone *docValueReader + for segmentI, segment := range segmentsInFocus { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + + fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) + if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists && + dvIter != nil { + fdvReadersAvailable = true + dvIterClone = dvIter.cloneInto(dvIterClone) + err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { + if newDocNums[segmentI][docNum] == docDropped { + return nil + } + err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms) + if err != nil { + return err + } + return nil + }) + if err != nil { + return nil, 0, err + } + } + } + + if fdvReadersAvailable { + err = fdvEncoder.Close() + if err != nil { + return nil, 0, err + } + + // persist the doc value details for this field + _, err = fdvEncoder.Write() + if err != nil { + return nil, 0, err + } + + // get the field doc value offset (end) + fieldDvLocsEnd[fieldID] = uint64(w.Count()) + } else { + fieldDvLocsStart[fieldID] = fieldNotUninverted + fieldDvLocsEnd[fieldID] = fieldNotUninverted + } + + // reset vellum buffer and vellum builder + vellumBuf.Reset() + err = newVellum.Reset(&vellumBuf) + if err != nil { + return nil, 0, err + } + } + + fieldDvLocsOffset := uint64(w.Count()) + + buf := bufMaxVarintLen64 + for i := 0; i < len(fieldDvLocsStart); i++ { + n := binary.PutUvarint(buf, fieldDvLocsStart[i]) + _, err := w.Write(buf[:n]) + if err != nil { + return nil, 0, err + } + n = binary.PutUvarint(buf, fieldDvLocsEnd[i]) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, 0, err + } + } + + return rv, fieldDvLocsOffset, nil +} + +func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { + next, err := postItr.Next() + for next != nil && err == nil { + hitNewDocNum := newDocNums[next.Number()] + if hitNewDocNum == docDropped { + return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum") + } + + newRoaring.Add(uint32(hitNewDocNum)) + + nextFreq := next.Frequency() + nextNorm := uint64(math.Float32bits(float32(next.Norm()))) + + locs := next.Locations() + + err = tfEncoder.Add(hitNewDocNum, + encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) + if err != nil { + return 0, 0, 0, nil, err + } + + if len(locs) > 0 { + numBytesLocs := 0 + for _, loc := range locs { + ap := loc.ArrayPositions() + numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1), + loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap) + } + + err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs)) + if err != nil { + return 0, 0, 0, nil, err + } + + for _, loc := range locs { + ap := loc.ArrayPositions() + if cap(bufLoc) < 5+len(ap) { + bufLoc = make([]uint64, 0, 5+len(ap)) + } + args := bufLoc[0:5] + args[0] = uint64(fieldsMap[loc.Field()] - 1) + args[1] = loc.Pos() + args[2] = loc.Start() + args[3] = loc.End() + args[4] = uint64(len(ap)) + args = append(args, ap...) + err = locEncoder.Add(hitNewDocNum, args...) + if err != nil { + return 0, 0, 0, nil, err + } + } + } + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + next, err = postItr.Next() + } + + return lastDocNum, lastFreq, lastNorm, bufLoc, err +} + +func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, + use1HitEncoding func(uint64) (bool, uint64, uint64), + w *CountHashWriter, bufMaxVarintLen64 []byte) ( + offset uint64, err error) { + termCardinality := postings.GetCardinality() + if termCardinality <= 0 { + return 0, nil + } + + if use1HitEncoding != nil { + encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) + if encodeAs1Hit { + return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil + } + } + + var tfOffset uint64 + tfOffset, _, err = tfEncoder.writeAt(w) + if err != nil { + return 0, err + } + + var locOffset uint64 + locOffset, _, err = locEncoder.writeAt(w) + if err != nil { + return 0, err + } + + postingsOffset := uint64(w.Count()) + + n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, locOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) + if err != nil { + return 0, err + } + + return postingsOffset, nil +} + +type varintEncoder func(uint64) (int, error) + +func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, + fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, + w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) { + var rv [][]uint64 // The remapped or newDocNums for each segment. + + var newDocNum uint64 + + var curr int + var data, compressed []byte + var metaBuf bytes.Buffer + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncode := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return metaBuf.Write(varBuf[:wb]) + } + + vals := make([][][]byte, len(fieldsInv)) + typs := make([][]byte, len(fieldsInv)) + poss := make([][][]uint64, len(fieldsInv)) + + var posBuf []uint64 + + docNumOffsets := make([]uint64, newSegDocCount) + + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + + // for each segment + for segI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return 0, nil, seg.ErrClosed + } + + segNewDocNums := make([]uint64, segment.numDocs) + + dropsI := drops[segI] + + // optimize when the field mapping is the same across all + // segments and there are no deletions, via byte-copying + // of stored docs bytes directly to the writer + if fieldsSame && (dropsI == nil || dropsI.GetCardinality() == 0) { + err := segment.copyStoredDocs(newDocNum, docNumOffsets, w) + if err != nil { + return 0, nil, err + } + + for i := uint64(0); i < segment.numDocs; i++ { + segNewDocNums[i] = newDocNum + newDocNum++ + } + rv = append(rv, segNewDocNums) + + continue + } + + // for each doc num + for docNum := uint64(0); docNum < segment.numDocs; docNum++ { + // TODO: roaring's API limits docNums to 32-bits? + if dropsI != nil && dropsI.Contains(uint32(docNum)) { + segNewDocNums[docNum] = docDropped + continue + } + + segNewDocNums[docNum] = newDocNum + + curr = 0 + metaBuf.Reset() + data = data[:0] + + posTemp := posBuf + + // collect all the data + for i := 0; i < len(fieldsInv); i++ { + vals[i] = vals[i][:0] + typs[i] = typs[i][:0] + poss[i] = poss[i][:0] + } + err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool { + fieldID := int(fieldsMap[field]) - 1 + vals[fieldID] = append(vals[fieldID], value) + typs[fieldID] = append(typs[fieldID], typ) + + // copy array positions to preserve them beyond the scope of this callback + var curPos []uint64 + if len(pos) > 0 { + if cap(posTemp) < len(pos) { + posBuf = make([]uint64, len(pos)*len(fieldsInv)) + posTemp = posBuf + } + curPos = posTemp[0:len(pos)] + copy(curPos, pos) + posTemp = posTemp[len(pos):] + } + poss[fieldID] = append(poss[fieldID], curPos) + + return true + }) + if err != nil { + return 0, nil, err + } + + // _id field special case optimizes ExternalID() lookups + idFieldVal := vals[uint16(0)][0] + _, err = metaEncode(uint64(len(idFieldVal))) + if err != nil { + return 0, nil, err + } + + // now walk the non-"_id" fields in order + for fieldID := 1; fieldID < len(fieldsInv); fieldID++ { + storedFieldValues := vals[fieldID] + + stf := typs[fieldID] + spf := poss[fieldID] + + var err2 error + curr, data, err2 = persistStoredFieldValues(fieldID, + storedFieldValues, stf, spf, curr, metaEncode, data) + if err2 != nil { + return 0, nil, err2 + } + } + + metaBytes := metaBuf.Bytes() + + compressed = snappy.Encode(compressed[:cap(compressed)], data) + + // record where we're about to start writing + docNumOffsets[newDocNum] = uint64(w.Count()) + + // write out the meta len and compressed data len + _, err = writeUvarints(w, + uint64(len(metaBytes)), + uint64(len(idFieldVal)+len(compressed))) + if err != nil { + return 0, nil, err + } + // now write the meta + _, err = w.Write(metaBytes) + if err != nil { + return 0, nil, err + } + // now write the _id field val (counted as part of the 'compressed' data) + _, err = w.Write(idFieldVal) + if err != nil { + return 0, nil, err + } + // now write the compressed data + _, err = w.Write(compressed) + if err != nil { + return 0, nil, err + } + + newDocNum++ + } + + rv = append(rv, segNewDocNums) + } + + // return value is the start of the stored index + storedIndexOffset := uint64(w.Count()) + + // now write out the stored doc index + for _, docNumOffset := range docNumOffsets { + err := binary.Write(w, binary.BigEndian, docNumOffset) + if err != nil { + return 0, nil, err + } + } + + return storedIndexOffset, rv, nil +} + +// copyStoredDocs writes out a segment's stored doc info, optimized by +// using a single Write() call for the entire set of bytes. The +// newDocNumOffsets is filled with the new offsets for each doc. +func (s *SegmentBase) copyStoredDocs(newDocNum uint64, newDocNumOffsets []uint64, + w *CountHashWriter) error { + if s.numDocs <= 0 { + return nil + } + + indexOffset0, storedOffset0, _, _, _ := + s.getDocStoredOffsets(0) // the segment's first doc + + indexOffsetN, storedOffsetN, readN, metaLenN, dataLenN := + s.getDocStoredOffsets(s.numDocs - 1) // the segment's last doc + + storedOffset0New := uint64(w.Count()) + + storedBytes := s.mem[storedOffset0 : storedOffsetN+readN+metaLenN+dataLenN] + _, err := w.Write(storedBytes) + if err != nil { + return err + } + + // remap the storedOffset's for the docs into new offsets relative + // to storedOffset0New, filling the given docNumOffsetsOut array + for indexOffset := indexOffset0; indexOffset <= indexOffsetN; indexOffset += 8 { + storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8]) + storedOffsetNew := storedOffset - storedOffset0 + storedOffset0New + newDocNumOffsets[newDocNum] = storedOffsetNew + newDocNum += 1 + } + + return nil +} + +// mergeFields builds a unified list of fields used across all the +// input segments, and computes whether the fields are the same across +// segments (which depends on fields to be sorted in the same way +// across segments) +func mergeFields(segments []*SegmentBase) (bool, []string) { + fieldsSame := true + + var segment0Fields []string + if len(segments) > 0 { + segment0Fields = segments[0].Fields() + } + + fieldsExist := map[string]struct{}{} + for _, segment := range segments { + fields := segment.Fields() + for fieldi, field := range fields { + fieldsExist[field] = struct{}{} + if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field { + fieldsSame = false + } + } + } + + rv := make([]string, 0, len(fieldsExist)) + // ensure _id stays first + rv = append(rv, "_id") + for k := range fieldsExist { + if k != "_id" { + rv = append(rv, k) + } + } + + sort.Strings(rv[1:]) // leave _id as first + + return fieldsSame, rv +} + +func isClosed(closeCh chan struct{}) bool { + select { + case <-closeCh: + return true + default: + return false + } +} diff --git a/vendor/github.com/blevesearch/zap/v14/new.go b/vendor/github.com/blevesearch/zap/v14/new.go new file mode 100644 index 000000000..98158186c --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/new.go @@ -0,0 +1,860 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "math" + "sort" + "sync" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/couchbase/vellum" + "github.com/golang/snappy" +) + +var NewSegmentBufferNumResultsBump int = 100 +var NewSegmentBufferNumResultsFactor float64 = 1.0 +var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 + +// ValidateDocFields can be set by applications to perform additional checks +// on fields in a document being added to a new segment, by default it does +// nothing. +// This API is experimental and may be removed at any time. +var ValidateDocFields = func(field document.Field) error { + return nil +} + +// AnalysisResultsToSegmentBase produces an in-memory zap-encoded +// SegmentBase from analysis results +func (z *ZapPlugin) New(results []*index.AnalysisResult) ( + segment.Segment, uint64, error) { + return z.newWithChunkMode(results, DefaultChunkMode) +} + +func (*ZapPlugin) newWithChunkMode(results []*index.AnalysisResult, + chunkMode uint32) (segment.Segment, uint64, error) { + s := interimPool.Get().(*interim) + + var br bytes.Buffer + if s.lastNumDocs > 0 { + // use previous results to initialize the buf with an estimate + // size, but note that the interim instance comes from a + // global interimPool, so multiple scorch instances indexing + // different docs can lead to low quality estimates + estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * + NewSegmentBufferNumResultsFactor) + estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * + NewSegmentBufferAvgBytesPerDocFactor) + br.Grow(estimateAvgBytesPerDoc * estimateNumResults) + } + + s.results = results + s.chunkMode = chunkMode + s.w = NewCountHashWriter(&br) + + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, + err := s.convert() + if err != nil { + return nil, uint64(0), err + } + + sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode, + s.FieldsMap, s.FieldsInv, uint64(len(results)), + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + + if err == nil && s.reset() == nil { + s.lastNumDocs = len(results) + s.lastOutSize = len(br.Bytes()) + interimPool.Put(s) + } + + return sb, uint64(len(br.Bytes())), err +} + +var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} + +// interim holds temporary working data used while converting from +// analysis results to a zap-encoded segment +type interim struct { + results []*index.AnalysisResult + + chunkMode uint32 + + w *CountHashWriter + + // FieldsMap adds 1 to field id to avoid zero value issues + // name -> field id + 1 + FieldsMap map[string]uint16 + + // FieldsInv is the inverse of FieldsMap + // field id -> name + FieldsInv []string + + // Term dictionaries for each field + // field id -> term -> postings list id + 1 + Dicts []map[string]uint64 + + // Terms for each field, where terms are sorted ascending + // field id -> []term + DictKeys [][]string + + // Fields whose IncludeDocValues is true + // field id -> bool + IncludeDocValues []bool + + // postings id -> bitmap of docNums + Postings []*roaring.Bitmap + + // postings id -> freq/norm's, one for each docNum in postings + FreqNorms [][]interimFreqNorm + freqNormsBacking []interimFreqNorm + + // postings id -> locs, one for each freq + Locs [][]interimLoc + locsBacking []interimLoc + + numTermsPerPostingsList []int // key is postings list id + numLocsPerPostingsList []int // key is postings list id + + builder *vellum.Builder + builderBuf bytes.Buffer + + metaBuf bytes.Buffer + + tmp0 []byte + tmp1 []byte + + lastNumDocs int + lastOutSize int +} + +func (s *interim) reset() (err error) { + s.results = nil + s.chunkMode = 0 + s.w = nil + s.FieldsMap = nil + s.FieldsInv = nil + for i := range s.Dicts { + s.Dicts[i] = nil + } + s.Dicts = s.Dicts[:0] + for i := range s.DictKeys { + s.DictKeys[i] = s.DictKeys[i][:0] + } + s.DictKeys = s.DictKeys[:0] + for i := range s.IncludeDocValues { + s.IncludeDocValues[i] = false + } + s.IncludeDocValues = s.IncludeDocValues[:0] + for _, idn := range s.Postings { + idn.Clear() + } + s.Postings = s.Postings[:0] + s.FreqNorms = s.FreqNorms[:0] + for i := range s.freqNormsBacking { + s.freqNormsBacking[i] = interimFreqNorm{} + } + s.freqNormsBacking = s.freqNormsBacking[:0] + s.Locs = s.Locs[:0] + for i := range s.locsBacking { + s.locsBacking[i] = interimLoc{} + } + s.locsBacking = s.locsBacking[:0] + s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0] + s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0] + s.builderBuf.Reset() + if s.builder != nil { + err = s.builder.Reset(&s.builderBuf) + } + s.metaBuf.Reset() + s.tmp0 = s.tmp0[:0] + s.tmp1 = s.tmp1[:0] + s.lastNumDocs = 0 + s.lastOutSize = 0 + + return err +} + +func (s *interim) grabBuf(size int) []byte { + buf := s.tmp0 + if cap(buf) < size { + buf = make([]byte, size) + s.tmp0 = buf + } + return buf[0:size] +} + +type interimStoredField struct { + vals [][]byte + typs []byte + arrayposs [][]uint64 // array positions +} + +type interimFreqNorm struct { + freq uint64 + norm float32 + numLocs int +} + +type interimLoc struct { + fieldID uint16 + pos uint64 + start uint64 + end uint64 + arrayposs []uint64 +} + +func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { + s.FieldsMap = map[string]uint16{} + + s.getOrDefineField("_id") // _id field is fieldID 0 + + for _, result := range s.results { + for _, field := range result.Document.CompositeFields { + s.getOrDefineField(field.Name()) + } + for _, field := range result.Document.Fields { + s.getOrDefineField(field.Name()) + } + } + + sort.Strings(s.FieldsInv[1:]) // keep _id as first field + + for fieldID, fieldName := range s.FieldsInv { + s.FieldsMap[fieldName] = uint16(fieldID + 1) + } + + if cap(s.IncludeDocValues) >= len(s.FieldsInv) { + s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)] + } else { + s.IncludeDocValues = make([]bool, len(s.FieldsInv)) + } + + s.prepareDicts() + + for _, dict := range s.DictKeys { + sort.Strings(dict) + } + + s.processDocuments() + + storedIndexOffset, err := s.writeStoredFields() + if err != nil { + return 0, 0, 0, nil, err + } + + var fdvIndexOffset uint64 + var dictOffsets []uint64 + + if len(s.results) > 0 { + fdvIndexOffset, dictOffsets, err = s.writeDicts() + if err != nil { + return 0, 0, 0, nil, err + } + } else { + dictOffsets = make([]uint64, len(s.FieldsInv)) + } + + fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets) + if err != nil { + return 0, 0, 0, nil, err + } + + return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil +} + +func (s *interim) getOrDefineField(fieldName string) int { + fieldIDPlus1, exists := s.FieldsMap[fieldName] + if !exists { + fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) + s.FieldsMap[fieldName] = fieldIDPlus1 + s.FieldsInv = append(s.FieldsInv, fieldName) + + s.Dicts = append(s.Dicts, make(map[string]uint64)) + + n := len(s.DictKeys) + if n < cap(s.DictKeys) { + s.DictKeys = s.DictKeys[:n+1] + s.DictKeys[n] = s.DictKeys[n][:0] + } else { + s.DictKeys = append(s.DictKeys, []string(nil)) + } + } + + return int(fieldIDPlus1 - 1) +} + +// fill Dicts and DictKeys from analysis results +func (s *interim) prepareDicts() { + var pidNext int + + var totTFs int + var totLocs int + + visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) { + dict := s.Dicts[fieldID] + dictKeys := s.DictKeys[fieldID] + + for term, tf := range tfs { + pidPlus1, exists := dict[term] + if !exists { + pidNext++ + pidPlus1 = uint64(pidNext) + + dict[term] = pidPlus1 + dictKeys = append(dictKeys, term) + + s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0) + s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0) + } + + pid := pidPlus1 - 1 + + s.numTermsPerPostingsList[pid] += 1 + s.numLocsPerPostingsList[pid] += len(tf.Locations) + + totLocs += len(tf.Locations) + } + + totTFs += len(tfs) + + s.DictKeys[fieldID] = dictKeys + } + + for _, result := range s.results { + // walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name())) + _, tf := field.Analyze() + visitField(fieldID, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + tf := result.Analyzed[i] + visitField(fieldID, tf) + } + } + + numPostingsLists := pidNext + + if cap(s.Postings) >= numPostingsLists { + s.Postings = s.Postings[:numPostingsLists] + } else { + postings := make([]*roaring.Bitmap, numPostingsLists) + copy(postings, s.Postings[:cap(s.Postings)]) + for i := 0; i < numPostingsLists; i++ { + if postings[i] == nil { + postings[i] = roaring.New() + } + } + s.Postings = postings + } + + if cap(s.FreqNorms) >= numPostingsLists { + s.FreqNorms = s.FreqNorms[:numPostingsLists] + } else { + s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) + } + + if cap(s.freqNormsBacking) >= totTFs { + s.freqNormsBacking = s.freqNormsBacking[:totTFs] + } else { + s.freqNormsBacking = make([]interimFreqNorm, totTFs) + } + + freqNormsBacking := s.freqNormsBacking + for pid, numTerms := range s.numTermsPerPostingsList { + s.FreqNorms[pid] = freqNormsBacking[0:0] + freqNormsBacking = freqNormsBacking[numTerms:] + } + + if cap(s.Locs) >= numPostingsLists { + s.Locs = s.Locs[:numPostingsLists] + } else { + s.Locs = make([][]interimLoc, numPostingsLists) + } + + if cap(s.locsBacking) >= totLocs { + s.locsBacking = s.locsBacking[:totLocs] + } else { + s.locsBacking = make([]interimLoc, totLocs) + } + + locsBacking := s.locsBacking + for pid, numLocs := range s.numLocsPerPostingsList { + s.Locs[pid] = locsBacking[0:0] + locsBacking = locsBacking[numLocs:] + } +} + +func (s *interim) processDocuments() { + numFields := len(s.FieldsInv) + reuseFieldLens := make([]int, numFields) + reuseFieldTFs := make([]analysis.TokenFrequencies, numFields) + + for docNum, result := range s.results { + for i := 0; i < numFields; i++ { // clear these for reuse + reuseFieldLens[i] = 0 + reuseFieldTFs[i] = nil + } + + s.processDocument(uint64(docNum), result, + reuseFieldLens, reuseFieldTFs) + } +} + +func (s *interim) processDocument(docNum uint64, + result *index.AnalysisResult, + fieldLens []int, fieldTFs []analysis.TokenFrequencies) { + visitField := func(fieldID uint16, fieldName string, + ln int, tf analysis.TokenFrequencies) { + fieldLens[fieldID] += ln + + existingFreqs := fieldTFs[fieldID] + if existingFreqs != nil { + existingFreqs.MergeAll(fieldName, tf) + } else { + fieldTFs[fieldID] = tf + } + } + + // walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name())) + ln, tf := field.Analyze() + visitField(fieldID, field.Name(), ln, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + ln := result.Length[i] + tf := result.Analyzed[i] + visitField(fieldID, field.Name(), ln, tf) + } + + // now that it's been rolled up into fieldTFs, walk that + for fieldID, tfs := range fieldTFs { + dict := s.Dicts[fieldID] + norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) + + for term, tf := range tfs { + pid := dict[term] - 1 + bs := s.Postings[pid] + bs.Add(uint32(docNum)) + + s.FreqNorms[pid] = append(s.FreqNorms[pid], + interimFreqNorm{ + freq: uint64(tf.Frequency()), + norm: norm, + numLocs: len(tf.Locations), + }) + + if len(tf.Locations) > 0 { + locs := s.Locs[pid] + + for _, loc := range tf.Locations { + var locf = uint16(fieldID) + if loc.Field != "" { + locf = uint16(s.getOrDefineField(loc.Field)) + } + var arrayposs []uint64 + if len(loc.ArrayPositions) > 0 { + arrayposs = loc.ArrayPositions + } + locs = append(locs, interimLoc{ + fieldID: locf, + pos: uint64(loc.Position), + start: uint64(loc.Start), + end: uint64(loc.End), + arrayposs: arrayposs, + }) + } + + s.Locs[pid] = locs + } + } + } +} + +func (s *interim) writeStoredFields() ( + storedIndexOffset uint64, err error) { + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncode := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return s.metaBuf.Write(varBuf[:wb]) + } + + data, compressed := s.tmp0[:0], s.tmp1[:0] + defer func() { s.tmp0, s.tmp1 = data, compressed }() + + // keyed by docNum + docStoredOffsets := make([]uint64, len(s.results)) + + // keyed by fieldID, for the current doc in the loop + docStoredFields := map[uint16]interimStoredField{} + + for docNum, result := range s.results { + for fieldID := range docStoredFields { // reset for next doc + delete(docStoredFields, fieldID) + } + + for _, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + + opts := field.Options() + + if opts.IsStored() { + isf := docStoredFields[fieldID] + isf.vals = append(isf.vals, field.Value()) + isf.typs = append(isf.typs, encodeFieldType(field)) + isf.arrayposs = append(isf.arrayposs, field.ArrayPositions()) + docStoredFields[fieldID] = isf + } + + if opts.IncludeDocValues() { + s.IncludeDocValues[fieldID] = true + } + + err := ValidateDocFields(field) + if err != nil { + return 0, err + } + } + + var curr int + + s.metaBuf.Reset() + data = data[:0] + + // _id field special case optimizes ExternalID() lookups + idFieldVal := docStoredFields[uint16(0)].vals[0] + _, err = metaEncode(uint64(len(idFieldVal))) + if err != nil { + return 0, err + } + + // handle non-"_id" fields + for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ { + isf, exists := docStoredFields[uint16(fieldID)] + if exists { + curr, data, err = persistStoredFieldValues( + fieldID, isf.vals, isf.typs, isf.arrayposs, + curr, metaEncode, data) + if err != nil { + return 0, err + } + } + } + + metaBytes := s.metaBuf.Bytes() + + compressed = snappy.Encode(compressed[:cap(compressed)], data) + + docStoredOffsets[docNum] = uint64(s.w.Count()) + + _, err := writeUvarints(s.w, + uint64(len(metaBytes)), + uint64(len(idFieldVal)+len(compressed))) + if err != nil { + return 0, err + } + + _, err = s.w.Write(metaBytes) + if err != nil { + return 0, err + } + + _, err = s.w.Write(idFieldVal) + if err != nil { + return 0, err + } + + _, err = s.w.Write(compressed) + if err != nil { + return 0, err + } + } + + storedIndexOffset = uint64(s.w.Count()) + + for _, docStoredOffset := range docStoredOffsets { + err = binary.Write(s.w, binary.BigEndian, docStoredOffset) + if err != nil { + return 0, err + } + } + + return storedIndexOffset, nil +} + +func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) { + dictOffsets = make([]uint64, len(s.FieldsInv)) + + fdvOffsetsStart := make([]uint64, len(s.FieldsInv)) + fdvOffsetsEnd := make([]uint64, len(s.FieldsInv)) + + buf := s.grabBuf(binary.MaxVarintLen64) + + // these int coders are initialized with chunk size 1024 + // however this will be reset to the correct chunk size + // while processing each individual field-term section + tfEncoder := newChunkedIntCoder(1024, uint64(len(s.results)-1)) + locEncoder := newChunkedIntCoder(1024, uint64(len(s.results)-1)) + + var docTermMap [][]byte + + if s.builder == nil { + s.builder, err = vellum.New(&s.builderBuf, nil) + if err != nil { + return 0, nil, err + } + } + + for fieldID, terms := range s.DictKeys { + if cap(docTermMap) < len(s.results) { + docTermMap = make([][]byte, len(s.results)) + } else { + docTermMap = docTermMap[0:len(s.results)] + for docNum := range docTermMap { // reset the docTermMap + docTermMap[docNum] = docTermMap[docNum][:0] + } + } + + dict := s.Dicts[fieldID] + + for _, term := range terms { // terms are already sorted + pid := dict[term] - 1 + + postingsBS := s.Postings[pid] + + freqNorms := s.FreqNorms[pid] + freqNormOffset := 0 + + locs := s.Locs[pid] + locOffset := 0 + + chunkSize, err := getChunkSize(s.chunkMode, postingsBS.GetCardinality(), uint64(len(s.results))) + if err != nil { + return 0, nil, err + } + tfEncoder.SetChunkSize(chunkSize, uint64(len(s.results)-1)) + locEncoder.SetChunkSize(chunkSize, uint64(len(s.results)-1)) + + postingsItr := postingsBS.Iterator() + for postingsItr.HasNext() { + docNum := uint64(postingsItr.Next()) + + freqNorm := freqNorms[freqNormOffset] + + err = tfEncoder.Add(docNum, + encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), + uint64(math.Float32bits(freqNorm.norm))) + if err != nil { + return 0, nil, err + } + + if freqNorm.numLocs > 0 { + numBytesLocs := 0 + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + numBytesLocs += totalUvarintBytes( + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs)), loc.arrayposs) + } + + err = locEncoder.Add(docNum, uint64(numBytesLocs)) + if err != nil { + return 0, nil, err + } + + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + err = locEncoder.Add(docNum, + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs))) + if err != nil { + return 0, nil, err + } + + err = locEncoder.Add(docNum, loc.arrayposs...) + if err != nil { + return 0, nil, err + } + } + + locOffset += freqNorm.numLocs + } + + freqNormOffset++ + + docTermMap[docNum] = append( + append(docTermMap[docNum], term...), + termSeparator) + } + + tfEncoder.Close() + locEncoder.Close() + + postingsOffset, err := + writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) + if err != nil { + return 0, nil, err + } + + if postingsOffset > uint64(0) { + err = s.builder.Insert([]byte(term), postingsOffset) + if err != nil { + return 0, nil, err + } + } + + tfEncoder.Reset() + locEncoder.Reset() + } + + err = s.builder.Close() + if err != nil { + return 0, nil, err + } + + // record where this dictionary starts + dictOffsets[fieldID] = uint64(s.w.Count()) + + vellumData := s.builderBuf.Bytes() + + // write out the length of the vellum data + n := binary.PutUvarint(buf, uint64(len(vellumData))) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + + // write this vellum to disk + _, err = s.w.Write(vellumData) + if err != nil { + return 0, nil, err + } + + // reset vellum for reuse + s.builderBuf.Reset() + + err = s.builder.Reset(&s.builderBuf) + if err != nil { + return 0, nil, err + } + + // write the field doc values + // NOTE: doc values continue to use legacy chunk mode + chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return 0, nil, err + } + fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(s.results)-1), s.w, false) + if s.IncludeDocValues[fieldID] { + for docNum, docTerms := range docTermMap { + if len(docTerms) > 0 { + err = fdvEncoder.Add(uint64(docNum), docTerms) + if err != nil { + return 0, nil, err + } + } + } + err = fdvEncoder.Close() + if err != nil { + return 0, nil, err + } + + fdvOffsetsStart[fieldID] = uint64(s.w.Count()) + + _, err = fdvEncoder.Write() + if err != nil { + return 0, nil, err + } + + fdvOffsetsEnd[fieldID] = uint64(s.w.Count()) + + fdvEncoder.Reset() + } else { + fdvOffsetsStart[fieldID] = fieldNotUninverted + fdvOffsetsEnd[fieldID] = fieldNotUninverted + } + } + + fdvIndexOffset = uint64(s.w.Count()) + + for i := 0; i < len(fdvOffsetsStart); i++ { + n := binary.PutUvarint(buf, fdvOffsetsStart[i]) + _, err := s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + n = binary.PutUvarint(buf, fdvOffsetsEnd[i]) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + } + + return fdvIndexOffset, dictOffsets, nil +} + +func encodeFieldType(f document.Field) byte { + fieldType := byte('x') + switch f.(type) { + case *document.TextField: + fieldType = 't' + case *document.NumericField: + fieldType = 'n' + case *document.DateTimeField: + fieldType = 'd' + case *document.BooleanField: + fieldType = 'b' + case *document.GeoPointField: + fieldType = 'g' + case *document.CompositeField: + fieldType = 'c' + } + return fieldType +} + +// returns the total # of bytes needed to encode the given uint64's +// into binary.PutUVarint() encoding +func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) { + n = numUvarintBytes(a) + n += numUvarintBytes(b) + n += numUvarintBytes(c) + n += numUvarintBytes(d) + n += numUvarintBytes(e) + for _, v := range more { + n += numUvarintBytes(v) + } + return n +} + +// returns # of bytes needed to encode x in binary.PutUvarint() encoding +func numUvarintBytes(x uint64) (n int) { + for x >= 0x80 { + x >>= 7 + n++ + } + return n + 1 +} diff --git a/vendor/github.com/blevesearch/zap/v14/plugin.go b/vendor/github.com/blevesearch/zap/v14/plugin.go new file mode 100644 index 000000000..38a0638d4 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/plugin.go @@ -0,0 +1,37 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "github.com/blevesearch/bleve/index/scorch/segment" +) + +// ZapPlugin implements the Plugin interface of +// the blevesearch/bleve/index/scorch/segment pkg +type ZapPlugin struct{} + +func (*ZapPlugin) Type() string { + return Type +} + +func (*ZapPlugin) Version() uint32 { + return Version +} + +// Plugin returns an instance segment.Plugin for use +// by the Scorch indexing scheme +func Plugin() segment.Plugin { + return &ZapPlugin{} +} diff --git a/vendor/github.com/blevesearch/zap/v14/posting.go b/vendor/github.com/blevesearch/zap/v14/posting.go new file mode 100644 index 000000000..88b2b9539 --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/posting.go @@ -0,0 +1,796 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "fmt" + "math" + "reflect" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizePostingsList int +var reflectStaticSizePostingsIterator int +var reflectStaticSizePosting int +var reflectStaticSizeLocation int + +func init() { + var pl PostingsList + reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) + var pi PostingsIterator + reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) + var p Posting + reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + +// FST or vellum value (uint64) encoding is determined by the top two +// highest-order or most significant bits... +// +// encoding : MSB +// name : 63 62 61...to...bit #0 (LSB) +// ----------+---+---+--------------------------------------------------- +// general : 0 | 0 | 62-bits of postingsOffset. +// ~ : 0 | 1 | reserved for future. +// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum. +// ~ : 1 | 1 | reserved for future. +// +// Encoding "general" is able to handle all cases, where the +// postingsOffset points to more information about the postings for +// the term. +// +// Encoding "1-hit" is used to optimize a commonly seen case when a +// term has only a single hit. For example, a term in the _id field +// will have only 1 hit. The "1-hit" encoding is used for a term +// in a field when... +// +// - term vector info is disabled for that field; +// - and, the term appears in only a single doc for that field; +// - and, the term's freq is exactly 1 in that single doc for that field; +// - and, the docNum must fit into 31-bits; +// +// Otherwise, the "general" encoding is used instead. +// +// In the "1-hit" encoding, the field in that single doc may have +// other terms, which is supported in the "1-hit" encoding by the +// positive float31 norm. + +const FSTValEncodingMask = uint64(0xc000000000000000) +const FSTValEncodingGeneral = uint64(0x0000000000000000) +const FSTValEncoding1Hit = uint64(0x8000000000000000) + +func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 { + return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum) +} + +func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) { + return (mask31Bits & v), (mask31Bits & (v >> 31)) +} + +const mask31Bits = uint64(0x000000007fffffff) + +func under32Bits(x uint64) bool { + return x <= mask31Bits +} + +const DocNum1HitFinished = math.MaxUint64 + +var NormBits1Hit = uint64(math.Float32bits(float32(1))) + +// PostingsList is an in-memory representation of a postings list +type PostingsList struct { + sb *SegmentBase + postingsOffset uint64 + freqOffset uint64 + locOffset uint64 + postings *roaring.Bitmap + except *roaring.Bitmap + + // when normBits1Hit != 0, then this postings list came from a + // 1-hit encoding, and only the docNum1Hit & normBits1Hit apply + docNum1Hit uint64 + normBits1Hit uint64 + + chunkSize uint64 +} + +// represents an immutable, empty postings list +var emptyPostingsList = &PostingsList{} + +func (p *PostingsList) Size() int { + sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + +func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { + if p.normBits1Hit != 0 { + receiver.Add(uint32(p.docNum1Hit)) + return + } + + if p.postings != nil { + receiver.Or(p.postings) + } +} + +// Iterator returns an iterator for this postings list +func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool, + prealloc segment.PostingsIterator) segment.PostingsIterator { + if p.normBits1Hit == 0 && p.postings == nil { + return emptyPostingsIterator + } + + var preallocPI *PostingsIterator + pi, ok := prealloc.(*PostingsIterator) + if ok && pi != nil { + preallocPI = pi + } + if preallocPI == emptyPostingsIterator { + preallocPI = nil + } + + return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI) +} + +func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, + rv *PostingsIterator) *PostingsIterator { + if rv == nil { + rv = &PostingsIterator{} + } else { + freqNormReader := rv.freqNormReader + if freqNormReader != nil { + freqNormReader.reset() + } + + locReader := rv.locReader + if locReader != nil { + locReader.reset() + } + + nextLocs := rv.nextLocs[:0] + nextSegmentLocs := rv.nextSegmentLocs[:0] + + buf := rv.buf + + *rv = PostingsIterator{} // clear the struct + + rv.freqNormReader = freqNormReader + rv.locReader = locReader + + rv.nextLocs = nextLocs + rv.nextSegmentLocs = nextSegmentLocs + + rv.buf = buf + } + + rv.postings = p + rv.includeFreqNorm = includeFreq || includeNorm || includeLocs + rv.includeLocs = includeLocs + + if p.normBits1Hit != 0 { + // "1-hit" encoding + rv.docNum1Hit = p.docNum1Hit + rv.normBits1Hit = p.normBits1Hit + + if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) { + rv.docNum1Hit = DocNum1HitFinished + } + + return rv + } + + // "general" encoding, check if empty + if p.postings == nil { + return rv + } + + // initialize freq chunk reader + if rv.includeFreqNorm { + rv.freqNormReader = newChunkedIntDecoder(p.sb.mem, p.freqOffset, rv.freqNormReader) + } + + // initialize the loc chunk reader + if rv.includeLocs { + rv.locReader = newChunkedIntDecoder(p.sb.mem, p.locOffset, rv.locReader) + } + + rv.all = p.postings.Iterator() + if p.except != nil { + rv.ActualBM = roaring.AndNot(p.postings, p.except) + rv.Actual = rv.ActualBM.Iterator() + } else { + rv.ActualBM = p.postings + rv.Actual = rv.all // Optimize to use same iterator for all & Actual. + } + + return rv +} + +// Count returns the number of items on this postings list +func (p *PostingsList) Count() uint64 { + var n, e uint64 + if p.normBits1Hit != 0 { + n = 1 + if p.except != nil && p.except.Contains(uint32(p.docNum1Hit)) { + e = 1 + } + } else if p.postings != nil { + n = p.postings.GetCardinality() + if p.except != nil { + e = p.postings.AndCardinality(p.except) + } + } + return n - e +} + +func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { + rv.postingsOffset = postingsOffset + + // handle "1-hit" encoding special case + if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit { + return rv.init1Hit(postingsOffset) + } + + // read the location of the freq/norm details + var n uint64 + var read int + + rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) + n += uint64(read) + + rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + var postingsLen uint64 + postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] + + if rv.postings == nil { + rv.postings = roaring.NewBitmap() + } + _, err := rv.postings.FromBuffer(roaringBytes) + if err != nil { + return fmt.Errorf("error loading roaring bitmap: %v", err) + } + + rv.chunkSize, err = getChunkSize(d.sb.chunkMode, + rv.postings.GetCardinality(), d.sb.numDocs) + if err != nil { + return err + } + + return nil +} + +func (rv *PostingsList) init1Hit(fstVal uint64) error { + docNum, normBits := FSTValDecode1Hit(fstVal) + + rv.docNum1Hit = docNum + rv.normBits1Hit = normBits + + return nil +} + +// PostingsIterator provides a way to iterate through the postings list +type PostingsIterator struct { + postings *PostingsList + all roaring.IntPeekable + Actual roaring.IntPeekable + ActualBM *roaring.Bitmap + + currChunk uint32 + freqNormReader *chunkedIntDecoder + locReader *chunkedIntDecoder + + next Posting // reused across Next() calls + nextLocs []Location // reused across Next() calls + nextSegmentLocs []segment.Location // reused across Next() calls + + docNum1Hit uint64 + normBits1Hit uint64 + + buf []byte + + includeFreqNorm bool + includeLocs bool +} + +var emptyPostingsIterator = &PostingsIterator{} + +func (i *PostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + + i.next.Size() + // account for freqNormReader, locReader if we start using this. + for _, entry := range i.nextLocs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + +func (i *PostingsIterator) loadChunk(chunk int) error { + if i.includeFreqNorm { + err := i.freqNormReader.loadChunk(chunk) + if err != nil { + return err + } + } + + if i.includeLocs { + err := i.locReader.loadChunk(chunk) + if err != nil { + return err + } + } + + i.currChunk = uint32(chunk) + return nil +} + +func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { + if i.normBits1Hit != 0 { + return 1, i.normBits1Hit, false, nil + } + + freqHasLocs, err := i.freqNormReader.readUvarint() + if err != nil { + return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) + } + + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) + + normBits, err := i.freqNormReader.readUvarint() + if err != nil { + return 0, 0, false, fmt.Errorf("error reading norm: %v", err) + } + + return freq, normBits, hasLocs, nil +} + +func (i *PostingsIterator) skipFreqNormReadHasLocs() (bool, error) { + if i.normBits1Hit != 0 { + return false, nil + } + + freqHasLocs, err := i.freqNormReader.readUvarint() + if err != nil { + return false, fmt.Errorf("error reading freqHasLocs: %v", err) + } + + i.freqNormReader.SkipUvarint() // Skip normBits. + + return freqHasLocs&0x01 != 0, nil // See decodeFreqHasLocs() / hasLocs. +} + +func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { + rv := freq << 1 + if hasLocs { + rv = rv | 0x01 // 0'th LSB encodes whether there are locations + } + return rv +} + +func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { + freq := freqHasLocs >> 1 + hasLocs := freqHasLocs&0x01 != 0 + return freq, hasLocs +} + +// readLocation processes all the integers on the stream representing a single +// location. +func (i *PostingsIterator) readLocation(l *Location) error { + // read off field + fieldID, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location field: %v", err) + } + // read off pos + pos, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location pos: %v", err) + } + // read off start + start, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location start: %v", err) + } + // read off end + end, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location end: %v", err) + } + // read off num array pos + numArrayPos, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location num array pos: %v", err) + } + + l.field = i.postings.sb.fieldsInv[fieldID] + l.pos = pos + l.start = start + l.end = end + + if cap(l.ap) < int(numArrayPos) { + l.ap = make([]uint64, int(numArrayPos)) + } else { + l.ap = l.ap[:int(numArrayPos)] + } + + // read off array positions + for k := 0; k < int(numArrayPos); k++ { + ap, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading array position: %v", err) + } + + l.ap[k] = ap + } + + return nil +} + +// Next returns the next posting on the postings list, or nil at the end +func (i *PostingsIterator) Next() (segment.Posting, error) { + return i.nextAtOrAfter(0) +} + +// Advance returns the posting at the specified docNum or it is not present +// the next posting, or if the end is reached, nil +func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) { + return i.nextAtOrAfter(docNum) +} + +// Next returns the next posting on the postings list, or nil at the end +func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) { + docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter) + if err != nil || !exists { + return nil, err + } + + i.next = Posting{} // clear the struct + rv := &i.next + rv.docNum = docNum + + if !i.includeFreqNorm { + return rv, nil + } + + var normBits uint64 + var hasLocs bool + + rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + if err != nil { + return nil, err + } + + rv.norm = math.Float32frombits(uint32(normBits)) + + if i.includeLocs && hasLocs { + // prepare locations into reused slices, where we assume + // rv.freq >= "number of locs", since in a composite field, + // some component fields might have their IncludeTermVector + // flags disabled while other component fields are enabled + if cap(i.nextLocs) >= int(rv.freq) { + i.nextLocs = i.nextLocs[0:rv.freq] + } else { + i.nextLocs = make([]Location, rv.freq, rv.freq*2) + } + if cap(i.nextSegmentLocs) < int(rv.freq) { + i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2) + } + rv.locs = i.nextSegmentLocs[:0] + + numLocsBytes, err := i.locReader.readUvarint() + if err != nil { + return nil, fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + j := 0 + startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader + for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) { + err := i.readLocation(&i.nextLocs[j]) + if err != nil { + return nil, err + } + rv.locs = append(rv.locs, &i.nextLocs[j]) + j++ + } + } + + return rv, nil +} + +// nextDocNum returns the next docNum on the postings list, and also +// sets up the currChunk / loc related fields of the iterator. +func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) { + if i.normBits1Hit != 0 { + if i.docNum1Hit == DocNum1HitFinished { + return 0, false, nil + } + if i.docNum1Hit < atOrAfter { + // advanced past our 1-hit + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum + return 0, false, nil + } + docNum := i.docNum1Hit + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum + return docNum, true, nil + } + + if i.Actual == nil || !i.Actual.HasNext() { + return 0, false, nil + } + + if i.postings == nil || i.postings.postings == i.ActualBM { + return i.nextDocNumAtOrAfterClean(atOrAfter) + } + + i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) + + if !i.Actual.HasNext() { + // couldn't find anything + return 0, false, nil + } + + n := i.Actual.Next() + allN := i.all.Next() + nChunk := n / uint32(i.postings.chunkSize) + + // when allN becomes >= to here, then allN is in the same chunk as nChunk. + allNReachesNChunk := nChunk * uint32(i.postings.chunkSize) + + // n is the next actual hit (excluding some postings), and + // allN is the next hit in the full postings, and + // if they don't match, move 'all' forwards until they do + for allN != n { + // we've reached same chunk, so move the freq/norm/loc decoders forward + if i.includeFreqNorm && allN >= allNReachesNChunk { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, err + } + } + + allN = i.all.Next() + } + + if i.includeFreqNorm && (i.currChunk != nChunk || i.freqNormReader.isNil()) { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + return uint64(n), true, nil +} + +// optimization when the postings list is "clean" (e.g., no updates & +// no deletions) where the all bitmap is the same as the actual bitmap +func (i *PostingsIterator) nextDocNumAtOrAfterClean( + atOrAfter uint64) (uint64, bool, error) { + + if !i.includeFreqNorm { + i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) + + if !i.Actual.HasNext() { + return 0, false, nil // couldn't find anything + } + + return uint64(i.Actual.Next()), true, nil + } + + // freq-norm's needed, so maintain freq-norm chunk reader + sameChunkNexts := 0 // # of times we called Next() in the same chunk + n := i.Actual.Next() + nChunk := n / uint32(i.postings.chunkSize) + + for uint64(n) < atOrAfter && i.Actual.HasNext() { + n = i.Actual.Next() + + nChunkPrev := nChunk + nChunk = n / uint32(i.postings.chunkSize) + + if nChunk != nChunkPrev { + sameChunkNexts = 0 + } else { + sameChunkNexts += 1 + } + } + + if uint64(n) < atOrAfter { + // couldn't find anything + return 0, false, nil + } + + for j := 0; j < sameChunkNexts; j++ { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err) + } + } + + if i.currChunk != nChunk || i.freqNormReader.isNil() { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + return uint64(n), true, nil +} + +func (i *PostingsIterator) currChunkNext(nChunk uint32) error { + if i.currChunk != nChunk || i.freqNormReader.isNil() { + err := i.loadChunk(int(nChunk)) + if err != nil { + return fmt.Errorf("error loading chunk: %v", err) + } + } + + // read off freq/offsets even though we don't care about them + hasLocs, err := i.skipFreqNormReadHasLocs() + if err != nil { + return err + } + + if i.includeLocs && hasLocs { + numLocsBytes, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + // skip over all the location bytes + i.locReader.SkipBytes(int(numLocsBytes)) + } + + return nil +} + +// DocNum1Hit returns the docNum and true if this is "1-hit" optimized +// and the docNum is available. +func (p *PostingsIterator) DocNum1Hit() (uint64, bool) { + if p.normBits1Hit != 0 && p.docNum1Hit != DocNum1HitFinished { + return p.docNum1Hit, true + } + return 0, false +} + +// ActualBitmap returns the underlying actual bitmap +// which can be used up the stack for optimizations +func (p *PostingsIterator) ActualBitmap() *roaring.Bitmap { + return p.ActualBM +} + +// ReplaceActual replaces the ActualBM with the provided +// bitmap +func (p *PostingsIterator) ReplaceActual(abm *roaring.Bitmap) { + p.ActualBM = abm + p.Actual = abm.Iterator() +} + +// PostingsIteratorFromBitmap constructs a PostingsIterator given an +// "actual" bitmap. +func PostingsIteratorFromBitmap(bm *roaring.Bitmap, + includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) { + return &PostingsIterator{ + ActualBM: bm, + Actual: bm.Iterator(), + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + +// PostingsIteratorFrom1Hit constructs a PostingsIterator given a +// 1-hit docNum. +func PostingsIteratorFrom1Hit(docNum1Hit uint64, + includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) { + return &PostingsIterator{ + docNum1Hit: docNum1Hit, + normBits1Hit: NormBits1Hit, + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + +// Posting is a single entry in a postings list +type Posting struct { + docNum uint64 + freq uint64 + norm float32 + locs []segment.Location +} + +func (p *Posting) Size() int { + sizeInBytes := reflectStaticSizePosting + + for _, entry := range p.locs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + +// Number returns the document number of this posting in this segment +func (p *Posting) Number() uint64 { + return p.docNum +} + +// Frequency returns the frequencies of occurrence of this term in this doc/field +func (p *Posting) Frequency() uint64 { + return p.freq +} + +// Norm returns the normalization factor for this posting +func (p *Posting) Norm() float64 { + return float64(p.norm) +} + +// Locations returns the location information for each occurrence +func (p *Posting) Locations() []segment.Location { + return p.locs +} + +// Location represents the location of a single occurrence +type Location struct { + field string + pos uint64 + start uint64 + end uint64 + ap []uint64 +} + +func (l *Location) Size() int { + return reflectStaticSizeLocation + + len(l.field) + + len(l.ap)*size.SizeOfUint64 +} + +// Field returns the name of the field (useful in composite fields to know +// which original field the value came from) +func (l *Location) Field() string { + return l.field +} + +// Start returns the start byte offset of this occurrence +func (l *Location) Start() uint64 { + return l.start +} + +// End returns the end byte offset of this occurrence +func (l *Location) End() uint64 { + return l.end +} + +// Pos returns the 1-based phrase position of this occurrence +func (l *Location) Pos() uint64 { + return l.pos +} + +// ArrayPositions returns the array position vector associated with this occurrence +func (l *Location) ArrayPositions() []uint64 { + return l.ap +} diff --git a/vendor/github.com/blevesearch/zap/v14/read.go b/vendor/github.com/blevesearch/zap/v14/read.go new file mode 100644 index 000000000..e47d4c6ab --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/read.go @@ -0,0 +1,43 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import "encoding/binary" + +func (s *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) { + _, storedOffset, n, metaLen, dataLen := s.getDocStoredOffsets(docNum) + + meta := s.mem[storedOffset+n : storedOffset+n+metaLen] + data := s.mem[storedOffset+n+metaLen : storedOffset+n+metaLen+dataLen] + + return meta, data +} + +func (s *SegmentBase) getDocStoredOffsets(docNum uint64) ( + uint64, uint64, uint64, uint64, uint64) { + indexOffset := s.storedIndexOffset + (8 * docNum) + + storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8]) + + var n uint64 + + metaLen, read := binary.Uvarint(s.mem[storedOffset : storedOffset+binary.MaxVarintLen64]) + n += uint64(read) + + dataLen, read := binary.Uvarint(s.mem[storedOffset+n : storedOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + return indexOffset, storedOffset, n, metaLen, dataLen +} diff --git a/vendor/github.com/blevesearch/zap/v14/segment.go b/vendor/github.com/blevesearch/zap/v14/segment.go new file mode 100644 index 000000000..e8b1f067f --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/segment.go @@ -0,0 +1,572 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "os" + "sync" + "unsafe" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" + "github.com/couchbase/vellum" + mmap "github.com/blevesearch/mmap-go" + "github.com/golang/snappy" +) + +var reflectStaticSizeSegmentBase int + +func init() { + var sb SegmentBase + reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) +} + +// Open returns a zap impl of a segment +func (*ZapPlugin) Open(path string) (segment.Segment, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + mm, err := mmap.Map(f, mmap.RDONLY, 0) + if err != nil { + // mmap failed, try to close the file + _ = f.Close() + return nil, err + } + + rv := &Segment{ + SegmentBase: SegmentBase{ + mem: mm[0 : len(mm)-FooterSize], + fieldsMap: make(map[string]uint16), + fieldDvReaders: make(map[uint16]*docValueReader), + fieldFSTs: make(map[uint16]*vellum.FST), + }, + f: f, + mm: mm, + path: path, + refs: 1, + } + rv.SegmentBase.updateSize() + + err = rv.loadConfig() + if err != nil { + _ = rv.Close() + return nil, err + } + + err = rv.loadFields() + if err != nil { + _ = rv.Close() + return nil, err + } + + err = rv.loadDvReaders() + if err != nil { + _ = rv.Close() + return nil, err + } + + return rv, nil +} + +// SegmentBase is a memory only, read-only implementation of the +// segment.Segment interface, using zap's data representation. +type SegmentBase struct { + mem []byte + memCRC uint32 + chunkMode uint32 + fieldsMap map[string]uint16 // fieldName -> fieldID+1 + fieldsInv []string // fieldID -> fieldName + numDocs uint64 + storedIndexOffset uint64 + fieldsIndexOffset uint64 + docValueOffset uint64 + dictLocs []uint64 + fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field + fieldDvNames []string // field names cached in fieldDvReaders + size uint64 + + m sync.Mutex + fieldFSTs map[uint16]*vellum.FST +} + +func (sb *SegmentBase) Size() int { + return int(sb.size) +} + +func (sb *SegmentBase) updateSize() { + sizeInBytes := reflectStaticSizeSegmentBase + + cap(sb.mem) + + // fieldsMap + for k := range sb.fieldsMap { + sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16 + } + + // fieldsInv, dictLocs + for _, entry := range sb.fieldsInv { + sizeInBytes += len(entry) + size.SizeOfString + } + sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64 + + // fieldDvReaders + for _, v := range sb.fieldDvReaders { + sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr + if v != nil { + sizeInBytes += v.size() + } + } + + sb.size = uint64(sizeInBytes) +} + +func (sb *SegmentBase) AddRef() {} +func (sb *SegmentBase) DecRef() (err error) { return nil } +func (sb *SegmentBase) Close() (err error) { return nil } + +// Segment implements a persisted segment.Segment interface, by +// embedding an mmap()'ed SegmentBase. +type Segment struct { + SegmentBase + + f *os.File + mm mmap.MMap + path string + version uint32 + crc uint32 + + m sync.Mutex // Protects the fields that follow. + refs int64 +} + +func (s *Segment) Size() int { + // 8 /* size of file pointer */ + // 4 /* size of version -> uint32 */ + // 4 /* size of crc -> uint32 */ + sizeOfUints := 16 + + sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints + + // mutex, refs -> int64 + sizeInBytes += 16 + + // do not include the mmap'ed part + return sizeInBytes + s.SegmentBase.Size() - cap(s.mem) +} + +func (s *Segment) AddRef() { + s.m.Lock() + s.refs++ + s.m.Unlock() +} + +func (s *Segment) DecRef() (err error) { + s.m.Lock() + s.refs-- + if s.refs == 0 { + err = s.closeActual() + } + s.m.Unlock() + return err +} + +func (s *Segment) loadConfig() error { + crcOffset := len(s.mm) - 4 + s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4]) + + verOffset := crcOffset - 4 + s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) + if s.version != Version { + return fmt.Errorf("unsupported version %d", s.version) + } + + chunkOffset := verOffset - 4 + s.chunkMode = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4]) + + docValueOffset := chunkOffset - 8 + s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8]) + + fieldsIndexOffset := docValueOffset - 8 + s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8]) + + storedIndexOffset := fieldsIndexOffset - 8 + s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8]) + + numDocsOffset := storedIndexOffset - 8 + s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8]) + return nil +} + +func (s *SegmentBase) loadFields() error { + // NOTE for now we assume the fields index immediately precedes + // the footer, and if this changes, need to adjust accordingly (or + // store explicit length), where s.mem was sliced from s.mm in Open(). + fieldsIndexEnd := uint64(len(s.mem)) + + // iterate through fields index + var fieldID uint64 + for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { + addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8]) + + dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd]) + n := uint64(read) + s.dictLocs = append(s.dictLocs, dictLoc) + + var nameLen uint64 + nameLen, read = binary.Uvarint(s.mem[addr+n : fieldsIndexEnd]) + n += uint64(read) + + name := string(s.mem[addr+n : addr+n+nameLen]) + s.fieldsInv = append(s.fieldsInv, name) + s.fieldsMap[name] = uint16(fieldID + 1) + + fieldID++ + } + return nil +} + +// Dictionary returns the term dictionary for the specified field +func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) { + dict, err := s.dictionary(field) + if err == nil && dict == nil { + return &segment.EmptyDictionary{}, nil + } + return dict, err +} + +func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { + fieldIDPlus1 := sb.fieldsMap[field] + if fieldIDPlus1 > 0 { + rv = &Dictionary{ + sb: sb, + field: field, + fieldID: fieldIDPlus1 - 1, + } + + dictStart := sb.dictLocs[rv.fieldID] + if dictStart > 0 { + var ok bool + sb.m.Lock() + if rv.fst, ok = sb.fieldFSTs[rv.fieldID]; !ok { + // read the length of the vellum data + vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64]) + fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] + rv.fst, err = vellum.Load(fstBytes) + if err != nil { + sb.m.Unlock() + return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) + } + + sb.fieldFSTs[rv.fieldID] = rv.fst + } + + sb.m.Unlock() + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) + } + + } + } + + return rv, nil +} + +// visitDocumentCtx holds data structures that are reusable across +// multiple VisitDocument() calls to avoid memory allocations +type visitDocumentCtx struct { + buf []byte + reader bytes.Reader + arrayPos []uint64 +} + +var visitDocumentCtxPool = sync.Pool{ + New: func() interface{} { + reuse := &visitDocumentCtx{} + return reuse + }, +} + +// VisitDocument invokes the DocFieldValueVistor for each stored field +// for the specified doc number +func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + return s.visitDocument(vdc, num, visitor) +} + +func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64, + visitor segment.DocumentFieldValueVisitor) error { + // first make sure this is a valid number in this segment + if num < s.numDocs { + meta, compressed := s.getDocStoredMetaAndCompressed(num) + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + idFieldVal := compressed[:idFieldValLen] + + keepGoing := visitor("_id", byte('t'), idFieldVal, nil) + if !keepGoing { + visitDocumentCtxPool.Put(vdc) + return nil + } + + // handle non-"_id" fields + compressed = compressed[idFieldValLen:] + + uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) + if err != nil { + return err + } + + for keepGoing { + field, err := binary.ReadUvarint(&vdc.reader) + if err == io.EOF { + break + } + if err != nil { + return err + } + typ, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + offset, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + l, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + numap, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + var arrayPos []uint64 + if numap > 0 { + if cap(vdc.arrayPos) < int(numap) { + vdc.arrayPos = make([]uint64, numap) + } + arrayPos = vdc.arrayPos[:numap] + for i := 0; i < int(numap); i++ { + ap, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + arrayPos[i] = ap + } + } + + value := uncompressed[offset : offset+l] + keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) + } + + vdc.buf = uncompressed + } + return nil +} + +// DocID returns the value of the _id field for the given docNum +func (s *SegmentBase) DocID(num uint64) ([]byte, error) { + if num >= s.numDocs { + return nil, nil + } + + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + + meta, compressed := s.getDocStoredMetaAndCompressed(num) + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return nil, err + } + idFieldVal := compressed[:idFieldValLen] + + visitDocumentCtxPool.Put(vdc) + + return idFieldVal, nil +} + +// Count returns the number of documents in this segment. +func (s *SegmentBase) Count() uint64 { + return s.numDocs +} + +// DocNumbers returns a bitset corresponding to the doc numbers of all the +// provided _id strings +func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { + rv := roaring.New() + + if len(s.fieldsMap) > 0 { + idDict, err := s.dictionary("_id") + if err != nil { + return nil, err + } + + postingsList := emptyPostingsList + + sMax, err := idDict.fst.GetMaxKey() + if err != nil { + return nil, err + } + sMaxStr := string(sMax) + filteredIds := make([]string, 0, len(ids)) + for _, id := range ids { + if id <= sMaxStr { + filteredIds = append(filteredIds, id) + } + } + + for _, id := range filteredIds { + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) + if err != nil { + return nil, err + } + postingsList.OrInto(rv) + } + } + + return rv, nil +} + +// Fields returns the field names used in this segment +func (s *SegmentBase) Fields() []string { + return s.fieldsInv +} + +// Path returns the path of this segment on disk +func (s *Segment) Path() string { + return s.path +} + +// Close releases all resources associated with this segment +func (s *Segment) Close() (err error) { + return s.DecRef() +} + +func (s *Segment) closeActual() (err error) { + if s.mm != nil { + err = s.mm.Unmap() + } + // try to close file even if unmap failed + if s.f != nil { + err2 := s.f.Close() + if err == nil { + // try to return first error + err = err2 + } + } + return +} + +// some helpers i started adding for the command-line utility + +// Data returns the underlying mmaped data slice +func (s *Segment) Data() []byte { + return s.mm +} + +// CRC returns the CRC value stored in the file footer +func (s *Segment) CRC() uint32 { + return s.crc +} + +// Version returns the file version in the file footer +func (s *Segment) Version() uint32 { + return s.version +} + +// ChunkFactor returns the chunk factor in the file footer +func (s *Segment) ChunkMode() uint32 { + return s.chunkMode +} + +// FieldsIndexOffset returns the fields index offset in the file footer +func (s *Segment) FieldsIndexOffset() uint64 { + return s.fieldsIndexOffset +} + +// StoredIndexOffset returns the stored value index offset in the file footer +func (s *Segment) StoredIndexOffset() uint64 { + return s.storedIndexOffset +} + +// DocValueOffset returns the docValue offset in the file footer +func (s *Segment) DocValueOffset() uint64 { + return s.docValueOffset +} + +// NumDocs returns the number of documents in the file footer +func (s *Segment) NumDocs() uint64 { + return s.numDocs +} + +// DictAddr is a helper function to compute the file offset where the +// dictionary is stored for the specified field. +func (s *Segment) DictAddr(field string) (uint64, error) { + fieldIDPlus1, ok := s.fieldsMap[field] + if !ok { + return 0, fmt.Errorf("no such field '%s'", field) + } + + return s.dictLocs[fieldIDPlus1-1], nil +} + +func (s *SegmentBase) loadDvReaders() error { + if s.docValueOffset == fieldNotUninverted || s.numDocs == 0 { + return nil + } + + var read uint64 + for fieldID, field := range s.fieldsInv { + var fieldLocStart, fieldLocEnd uint64 + var n int + fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID) + } + read += uint64(n) + fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID) + } + read += uint64(n) + + fieldDvReader, err := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) + if err != nil { + return err + } + if fieldDvReader != nil { + s.fieldDvReaders[uint16(fieldID)] = fieldDvReader + s.fieldDvNames = append(s.fieldDvNames, field) + } + } + + return nil +} diff --git a/vendor/github.com/blevesearch/zap/v14/write.go b/vendor/github.com/blevesearch/zap/v14/write.go new file mode 100644 index 000000000..77aefdbfc --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/write.go @@ -0,0 +1,145 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "io" + + "github.com/RoaringBitmap/roaring" +) + +// writes out the length of the roaring bitmap in bytes as varint +// then writes out the roaring bitmap itself +func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, + reuseBufVarint []byte) (int, error) { + buf, err := r.ToBytes() + if err != nil { + return 0, err + } + + var tw int + + // write out the length + n := binary.PutUvarint(reuseBufVarint, uint64(len(buf))) + nw, err := w.Write(reuseBufVarint[:n]) + tw += nw + if err != nil { + return tw, err + } + + // write out the roaring bytes + nw, err = w.Write(buf) + tw += nw + if err != nil { + return tw, err + } + + return tw, nil +} + +func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) { + var rv uint64 + var fieldsOffsets []uint64 + + for fieldID, fieldName := range fieldsInv { + // record start of this field + fieldsOffsets = append(fieldsOffsets, uint64(w.Count())) + + // write out the dict location and field name length + _, err := writeUvarints(w, dictLocs[fieldID], uint64(len(fieldName))) + if err != nil { + return 0, err + } + + // write out the field name + _, err = w.Write([]byte(fieldName)) + if err != nil { + return 0, err + } + } + + // now write out the fields index + rv = uint64(w.Count()) + for fieldID := range fieldsInv { + err := binary.Write(w, binary.BigEndian, fieldsOffsets[fieldID]) + if err != nil { + return 0, err + } + } + + return rv, nil +} + +// FooterSize is the size of the footer record in bytes +// crc + ver + chunk + field offset + stored offset + num docs + docValueOffset +const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + 8 + +func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, + chunkMode uint32, crcBeforeFooter uint32, writerIn io.Writer) error { + w := NewCountHashWriter(writerIn) + w.crc = crcBeforeFooter + + // write out the number of docs + err := binary.Write(w, binary.BigEndian, numDocs) + if err != nil { + return err + } + // write out the stored field index location: + err = binary.Write(w, binary.BigEndian, storedIndexOffset) + if err != nil { + return err + } + // write out the field index location + err = binary.Write(w, binary.BigEndian, fieldsIndexOffset) + if err != nil { + return err + } + // write out the fieldDocValue location + err = binary.Write(w, binary.BigEndian, docValueOffset) + if err != nil { + return err + } + // write out 32-bit chunk factor + err = binary.Write(w, binary.BigEndian, chunkMode) + if err != nil { + return err + } + // write out 32-bit version + err = binary.Write(w, binary.BigEndian, Version) + if err != nil { + return err + } + // write out CRC-32 of everything upto but not including this CRC + err = binary.Write(w, binary.BigEndian, w.crc) + if err != nil { + return err + } + return nil +} + +func writeUvarints(w io.Writer, vals ...uint64) (tw int, err error) { + buf := make([]byte, binary.MaxVarintLen64) + for _, val := range vals { + n := binary.PutUvarint(buf, val) + var nw int + nw, err = w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + } + return tw, err +} diff --git a/vendor/github.com/blevesearch/zap/v14/zap.md b/vendor/github.com/blevesearch/zap/v14/zap.md new file mode 100644 index 000000000..d74dc548b --- /dev/null +++ b/vendor/github.com/blevesearch/zap/v14/zap.md @@ -0,0 +1,177 @@ +# ZAP File Format + +## Legend + +### Sections + + |========| + | | section + |========| + +### Fixed-size fields + + |--------| |----| |--| |-| + | | uint64 | | uint32 | | uint16 | | uint8 + |--------| |----| |--| |-| + +### Varints + + |~~~~~~~~| + | | varint(up to uint64) + |~~~~~~~~| + +### Arbitrary-length fields + + |--------...---| + | | arbitrary-length field (string, vellum, roaring bitmap) + |--------...---| + +### Chunked data + + [--------] + [ ] + [--------] + +## Overview + +Footer section describes the configuration of particular ZAP file. The format of footer is version-dependent, so it is necessary to check `V` field before the parsing. + + |==================================================| + | Stored Fields | + |==================================================| + |-----> | Stored Fields Index | + | |==================================================| + | | Dictionaries + Postings + DocValues | + | |==================================================| + | |---> | DocValues Index | + | | |==================================================| + | | | Fields | + | | |==================================================| + | | |-> | Fields Index | + | | | |========|========|========|========|====|====|====| + | | | | D# | SF | F | FDV | CF | V | CC | (Footer) + | | | |========|====|===|====|===|====|===|====|====|====| + | | | | | | + |-+-+-----------------| | | + | |--------------------------| | + |-------------------------------------| + + D#. Number of Docs. + SF. Stored Fields Index Offset. + F. Field Index Offset. + FDV. Field DocValue Offset. + CF. Chunk Factor. + V. Version. + CC. CRC32. + +## Stored Fields + +Stored Fields Index is `D#` consecutive 64-bit unsigned integers - offsets, where relevant Stored Fields Data records are located. + + 0 [SF] [SF + D# * 8] + | Stored Fields | Stored Fields Index | + |================================|==================================| + | | | + | |--------------------| ||--------|--------|. . .|--------|| + | |-> | Stored Fields Data | || 0 | 1 | | D# - 1 || + | | |--------------------| ||--------|----|---|. . .|--------|| + | | | | | + |===|============================|==============|===================| + | | + |-------------------------------------------| + +Stored Fields Data is an arbitrary size record, which consists of metadata and [Snappy](https://github.com/golang/snappy)-compressed data. + + Stored Fields Data + |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| + | MDS | CDS | MD | CD | + |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| + + MDS. Metadata size. + CDS. Compressed data size. + MD. Metadata. + CD. Snappy-compressed data. + +## Fields + +Fields Index section located between addresses `F` and `len(file) - len(footer)` and consist of `uint64` values (`F1`, `F2`, ...) which are offsets to records in Fields section. We have `F# = (len(file) - len(footer) - F) / sizeof(uint64)` fields. + + + (...) [F] [F + F#] + | Fields | Fields Index. | + |================================|================================| + | | | + | |~~~~~~~~|~~~~~~~~|---...---|||--------|--------|...|--------|| + ||->| Dict | Length | Name ||| 0 | 1 | | F# - 1 || + || |~~~~~~~~|~~~~~~~~|---...---|||--------|----|---|...|--------|| + || | | | + ||===============================|==============|=================| + | | + |----------------------------------------------| + + +## Dictionaries + Postings + +Each of fields has its own dictionary, encoded in [Vellum](https://github.com/couchbase/vellum) format. Dictionary consists of pairs `(term, offset)`, where `offset` indicates the position of postings (list of documents) for this particular term. + + |================================================================|- Dictionaries + + | | Postings + + | | DocValues + | Freq/Norm (chunked) | + | [~~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] | + | |->[ Freq | Norm (float32 under varint) ] | + | | [~~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] | + | | | + | |------------------------------------------------------------| | + | Location Details (chunked) | | + | [~~~~~~|~~~~~|~~~~~~~|~~~~~|~~~~~~|~~~~~~~~|~~~~~] | | + | |->[ Size | Pos | Start | End | Arr# | ArrPos | ... ] | | + | | [~~~~~~|~~~~~|~~~~~~~|~~~~~|~~~~~~|~~~~~~~~|~~~~~] | | + | | | | + | |----------------------| | | + | Postings List | | | + | |~~~~~~~~|~~~~~|~~|~~~~~~~~|-----------...--| | | + | |->| F/N | LD | Length | ROARING BITMAP | | | + | | |~~~~~|~~|~~~~~~~~|~~~~~~~~|-----------...--| | | + | | |----------------------------------------------| | + | |--------------------------------------| | + | Dictionary | | + | |~~~~~~~~|--------------------------|-...-| | + | |->| Length | VELLUM DATA : (TERM -> OFFSET) | | + | | |~~~~~~~~|----------------------------...-| | + | | | + |======|=========================================================|- DocValues Index + | | | + |======|=========================================================|- Fields + | | | + | |~~~~|~~~|~~~~~~~~|---...---| | + | | Dict | Length | Name | | + | |~~~~~~~~|~~~~~~~~|---...---| | + | | + |================================================================| + +## DocValues + +DocValues Index is `F#` pairs of varints, one pair per field. Each pair of varints indicates start and end point of DocValues slice. + + |================================================================| + | |------...--| | + | |->| DocValues |<-| | + | | |------...--| | | + |==|=================|===========================================|- DocValues Index + ||~|~~~~~~~~~|~~~~~~~|~~| |~~~~~~~~~~~~~~|~~~~~~~~~~~~|| + || DV1 START | DV1 STOP | . . . . . | DV(F#) START | DV(F#) END || + ||~~~~~~~~~~~|~~~~~~~~~~| |~~~~~~~~~~~~~~|~~~~~~~~~~~~|| + |================================================================| + +DocValues is chunked Snappy-compressed values for each document and field. + + [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-] + [ Doc# in Chunk | Doc1 | Offset1 | ... | DocN | OffsetN | SNAPPY COMPRESSED DATA ] + [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-] + +Last 16 bytes are description of chunks. + + |~~~~~~~~~~~~...~|----------------|----------------| + | Chunk Sizes | Chunk Size Arr | Chunk# | + |~~~~~~~~~~~~...~|----------------|----------------| diff --git a/vendor/github.com/couchbase/vellum/README.md b/vendor/github.com/couchbase/vellum/README.md index 907b8b3a8..e5c4a8bce 100644 --- a/vendor/github.com/couchbase/vellum/README.md +++ b/vendor/github.com/couchbase/vellum/README.md @@ -1,6 +1,6 @@ # ![vellum](docs/logo.png) vellum -[![Build Status](https://travis-ci.org/couchbase/vellum.svg?branch=master)](https://travis-ci.org/couchbase/vellum) +[![Tests](https://github.com/couchbase/vellum/workflows/Tests/badge.svg?branch=master&event=push)](https://github.com/couchbase/vellum/actions?query=workflow%3ATests+event%3Apush+branch%3Amaster) [![Coverage Status](https://coveralls.io/repos/github/couchbase/vellum/badge.svg?branch=master)](https://coveralls.io/github/couchbase/vellum?branch=master) [![GoDoc](https://godoc.org/github.com/couchbase/vellum?status.svg)](https://godoc.org/github.com/couchbase/vellum) [![Go Report Card](https://goreportcard.com/badge/github.com/couchbase/vellum)](https://goreportcard.com/report/github.com/couchbase/vellum) diff --git a/vendor/github.com/couchbase/vellum/fst_iterator.go b/vendor/github.com/couchbase/vellum/fst_iterator.go index d04ad63fb..2c6b0d68e 100644 --- a/vendor/github.com/couchbase/vellum/fst_iterator.go +++ b/vendor/github.com/couchbase/vellum/fst_iterator.go @@ -265,7 +265,7 @@ OUTER: // going back all the way to the OUTER loop var popNum int for j := len(i.statesStack) - 1; j > 0; j-- { - if i.statesStack[j].NumTransitions() != 1 { + if j == 1 || i.statesStack[j].NumTransitions() != 1 { popNum = len(i.statesStack) - 1 - j break } diff --git a/vendor/go.etcd.io/bbolt/README.md b/vendor/go.etcd.io/bbolt/README.md index 2dff3761d..c9e64b1a6 100644 --- a/vendor/go.etcd.io/bbolt/README.md +++ b/vendor/go.etcd.io/bbolt/README.md @@ -152,11 +152,12 @@ are not thread safe. To work with data in multiple goroutines you must start a transaction for each one or use locking to ensure only one goroutine accesses a transaction at a time. Creating transaction from the `DB` is thread safe. -Read-only transactions and read-write transactions should not depend on one -another and generally shouldn't be opened simultaneously in the same goroutine. -This can cause a deadlock as the read-write transaction needs to periodically -re-map the data file but it cannot do so while a read-only transaction is open. - +Transactions should not depend on one another and generally shouldn't be opened +simultaneously in the same goroutine. This can cause a deadlock as the read-write +transaction needs to periodically re-map the data file but it cannot do so while +any read-only transaction is open. Even a nested read-only transaction can cause +a deadlock, as the child transaction can block the parent transaction from releasing +its resources. #### Read-write transactions diff --git a/vendor/go.etcd.io/bbolt/freelist.go b/vendor/go.etcd.io/bbolt/freelist.go index d441b6925..697a46968 100644 --- a/vendor/go.etcd.io/bbolt/freelist.go +++ b/vendor/go.etcd.io/bbolt/freelist.go @@ -2,7 +2,6 @@ package bbolt import ( "fmt" - "reflect" "sort" "unsafe" ) @@ -94,24 +93,8 @@ func (f *freelist) pending_count() int { return count } -// copyallunsafe copies a list of all free ids and all pending ids in one sorted list. +// copyall copies a list of all free ids and all pending ids in one sorted list. // f.count returns the minimum length required for dst. -func (f *freelist) copyallunsafe(dstptr unsafe.Pointer) { // dstptr is []pgid data pointer - m := make(pgids, 0, f.pending_count()) - for _, txp := range f.pending { - m = append(m, txp.ids...) - } - sort.Sort(m) - fpgids := f.getFreePageIDs() - sz := len(fpgids) + len(m) - dst := *(*[]pgid)(unsafe.Pointer(&reflect.SliceHeader{ - Data: uintptr(dstptr), - Len: sz, - Cap: sz, - })) - mergepgids(dst, fpgids, m) -} - func (f *freelist) copyall(dst []pgid) { m := make(pgids, 0, f.pending_count()) for _, txp := range f.pending { @@ -284,21 +267,23 @@ func (f *freelist) read(p *page) { } // If the page.count is at the max uint16 value (64k) then it's considered // an overflow and the size of the freelist is stored as the first element. - var idx, count uintptr = 0, uintptr(p.count) + var idx, count = 0, int(p.count) if count == 0xFFFF { idx = 1 - count = uintptr(*(*pgid)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p)))) + c := *(*pgid)(unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))) + count = int(c) + if count < 0 { + panic(fmt.Sprintf("leading element count %d overflows int", c)) + } } // Copy the list of page ids from the freelist. if count == 0 { f.ids = nil } else { - ids := *(*[]pgid)(unsafe.Pointer(&reflect.SliceHeader{ - Data: uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p) + idx*unsafe.Sizeof(pgid(0)), - Len: int(count), - Cap: int(count), - })) + var ids []pgid + data := unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), unsafe.Sizeof(ids[0]), idx) + unsafeSlice(unsafe.Pointer(&ids), data, count) // copy the ids, so we don't modify on the freelist page directly idsCopy := make([]pgid, count) @@ -331,16 +316,22 @@ func (f *freelist) write(p *page) error { // The page.count can only hold up to 64k elements so if we overflow that // number then we handle it by putting the size in the first element. - lenids := f.count() - if lenids == 0 { - p.count = uint16(lenids) - } else if lenids < 0xFFFF { - p.count = uint16(lenids) - f.copyallunsafe(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p))) + l := f.count() + if l == 0 { + p.count = uint16(l) + } else if l < 0xFFFF { + p.count = uint16(l) + var ids []pgid + data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) + unsafeSlice(unsafe.Pointer(&ids), data, l) + f.copyall(ids) } else { p.count = 0xFFFF - *(*pgid)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p))) = pgid(lenids) - f.copyallunsafe(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p) + unsafe.Sizeof(pgid(0)))) + var ids []pgid + data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) + unsafeSlice(unsafe.Pointer(&ids), data, l+1) + ids[0] = pgid(l) + f.copyall(ids[1:]) } return nil diff --git a/vendor/go.etcd.io/bbolt/node.go b/vendor/go.etcd.io/bbolt/node.go index 1690eef3f..73988b5c4 100644 --- a/vendor/go.etcd.io/bbolt/node.go +++ b/vendor/go.etcd.io/bbolt/node.go @@ -3,7 +3,6 @@ package bbolt import ( "bytes" "fmt" - "reflect" "sort" "unsafe" ) @@ -208,36 +207,32 @@ func (n *node) write(p *page) { } // Loop over each item and write it to the page. - bp := uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes)) + // off tracks the offset into p of the start of the next data. + off := unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes)) for i, item := range n.inodes { _assert(len(item.key) > 0, "write: zero-length inode key") + // Create a slice to write into of needed size and advance + // byte pointer for next iteration. + sz := len(item.key) + len(item.value) + b := unsafeByteSlice(unsafe.Pointer(p), off, 0, sz) + off += uintptr(sz) + // Write the page element. if n.isLeaf { elem := p.leafPageElement(uint16(i)) - elem.pos = uint32(bp - uintptr(unsafe.Pointer(elem))) + elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) elem.flags = item.flags elem.ksize = uint32(len(item.key)) elem.vsize = uint32(len(item.value)) } else { elem := p.branchPageElement(uint16(i)) - elem.pos = uint32(bp - uintptr(unsafe.Pointer(elem))) + elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) elem.ksize = uint32(len(item.key)) elem.pgid = item.pgid _assert(elem.pgid != p.id, "write: circular dependency occurred") } - // Create a slice to write into of needed size and advance - // byte pointer for next iteration. - klen, vlen := len(item.key), len(item.value) - sz := klen + vlen - b := *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{ - Data: bp, - Len: sz, - Cap: sz, - })) - bp += uintptr(sz) - // Write data for the element to the end of the page. l := copy(b, item.key) copy(b[l:], item.value) diff --git a/vendor/go.etcd.io/bbolt/page.go b/vendor/go.etcd.io/bbolt/page.go index b5c169978..c9a158fb0 100644 --- a/vendor/go.etcd.io/bbolt/page.go +++ b/vendor/go.etcd.io/bbolt/page.go @@ -3,7 +3,6 @@ package bbolt import ( "fmt" "os" - "reflect" "sort" "unsafe" ) @@ -51,13 +50,13 @@ func (p *page) typ() string { // meta returns a pointer to the metadata section of the page. func (p *page) meta() *meta { - return (*meta)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p))) + return (*meta)(unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))) } // leafPageElement retrieves the leaf node by index func (p *page) leafPageElement(index uint16) *leafPageElement { - off := uintptr(index) * unsafe.Sizeof(leafPageElement{}) - return (*leafPageElement)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p) + off)) + return (*leafPageElement)(unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), + leafPageElementSize, int(index))) } // leafPageElements retrieves a list of leaf nodes. @@ -65,17 +64,16 @@ func (p *page) leafPageElements() []leafPageElement { if p.count == 0 { return nil } - return *(*[]leafPageElement)(unsafe.Pointer(&reflect.SliceHeader{ - Data: uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p), - Len: int(p.count), - Cap: int(p.count), - })) + var elems []leafPageElement + data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) + unsafeSlice(unsafe.Pointer(&elems), data, int(p.count)) + return elems } // branchPageElement retrieves the branch node by index func (p *page) branchPageElement(index uint16) *branchPageElement { - off := uintptr(index) * unsafe.Sizeof(branchPageElement{}) - return (*branchPageElement)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p) + off)) + return (*branchPageElement)(unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), + unsafe.Sizeof(branchPageElement{}), int(index))) } // branchPageElements retrieves a list of branch nodes. @@ -83,20 +81,15 @@ func (p *page) branchPageElements() []branchPageElement { if p.count == 0 { return nil } - return *(*[]branchPageElement)(unsafe.Pointer(&reflect.SliceHeader{ - Data: uintptr(unsafe.Pointer(p)) + unsafe.Sizeof(*p), - Len: int(p.count), - Cap: int(p.count), - })) + var elems []branchPageElement + data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) + unsafeSlice(unsafe.Pointer(&elems), data, int(p.count)) + return elems } // dump writes n bytes of the page to STDERR as hex output. func (p *page) hexdump(n int) { - buf := *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{ - Data: uintptr(unsafe.Pointer(p)), - Len: n, - Cap: n, - })) + buf := unsafeByteSlice(unsafe.Pointer(p), 0, 0, n) fmt.Fprintf(os.Stderr, "%x\n", buf) } @@ -115,11 +108,7 @@ type branchPageElement struct { // key returns a byte slice of the node key. func (n *branchPageElement) key() []byte { - return *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{ - Data: uintptr(unsafe.Pointer(n)) + uintptr(n.pos), - Len: int(n.ksize), - Cap: int(n.ksize), - })) + return unsafeByteSlice(unsafe.Pointer(n), 0, int(n.pos), int(n.pos)+int(n.ksize)) } // leafPageElement represents a node on a leaf page. @@ -132,20 +121,16 @@ type leafPageElement struct { // key returns a byte slice of the node key. func (n *leafPageElement) key() []byte { - return *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{ - Data: uintptr(unsafe.Pointer(n)) + uintptr(n.pos), - Len: int(n.ksize), - Cap: int(n.ksize), - })) + i := int(n.pos) + j := i + int(n.ksize) + return unsafeByteSlice(unsafe.Pointer(n), 0, i, j) } // value returns a byte slice of the node value. func (n *leafPageElement) value() []byte { - return *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{ - Data: uintptr(unsafe.Pointer(n)) + uintptr(n.pos) + uintptr(n.ksize), - Len: int(n.vsize), - Cap: int(n.vsize), - })) + i := int(n.pos) + int(n.ksize) + j := i + int(n.vsize) + return unsafeByteSlice(unsafe.Pointer(n), 0, i, j) } // PageInfo represents human readable information about a page. diff --git a/vendor/go.etcd.io/bbolt/tx.go b/vendor/go.etcd.io/bbolt/tx.go index 13937cdbf..4b1a64a8b 100644 --- a/vendor/go.etcd.io/bbolt/tx.go +++ b/vendor/go.etcd.io/bbolt/tx.go @@ -4,7 +4,6 @@ import ( "fmt" "io" "os" - "reflect" "sort" "strings" "time" @@ -524,24 +523,18 @@ func (tx *Tx) write() error { // Write pages to disk in order. for _, p := range pages { - size := (int(p.overflow) + 1) * tx.db.pageSize + rem := (uint64(p.overflow) + 1) * uint64(tx.db.pageSize) offset := int64(p.id) * int64(tx.db.pageSize) + var written uintptr // Write out page in "max allocation" sized chunks. - ptr := uintptr(unsafe.Pointer(p)) for { - // Limit our write to our max allocation size. - sz := size + sz := rem if sz > maxAllocSize-1 { sz = maxAllocSize - 1 } + buf := unsafeByteSlice(unsafe.Pointer(p), written, 0, int(sz)) - // Write chunk to disk. - buf := *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{ - Data: ptr, - Len: sz, - Cap: sz, - })) if _, err := tx.db.ops.writeAt(buf, offset); err != nil { return err } @@ -550,14 +543,14 @@ func (tx *Tx) write() error { tx.stats.Write++ // Exit inner for loop if we've written all the chunks. - size -= sz - if size == 0 { + rem -= sz + if rem == 0 { break } // Otherwise move offset forward and move pointer to next chunk. offset += int64(sz) - ptr += uintptr(sz) + written += uintptr(sz) } } @@ -576,11 +569,7 @@ func (tx *Tx) write() error { continue } - buf := *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{ - Data: uintptr(unsafe.Pointer(p)), - Len: tx.db.pageSize, - Cap: tx.db.pageSize, - })) + buf := unsafeByteSlice(unsafe.Pointer(p), 0, 0, tx.db.pageSize) // See https://go.googlesource.com/go/+/f03c9202c43e0abb130669852082117ca50aa9b1 for i := range buf { diff --git a/vendor/go.etcd.io/bbolt/unsafe.go b/vendor/go.etcd.io/bbolt/unsafe.go new file mode 100644 index 000000000..c0e503750 --- /dev/null +++ b/vendor/go.etcd.io/bbolt/unsafe.go @@ -0,0 +1,39 @@ +package bbolt + +import ( + "reflect" + "unsafe" +) + +func unsafeAdd(base unsafe.Pointer, offset uintptr) unsafe.Pointer { + return unsafe.Pointer(uintptr(base) + offset) +} + +func unsafeIndex(base unsafe.Pointer, offset uintptr, elemsz uintptr, n int) unsafe.Pointer { + return unsafe.Pointer(uintptr(base) + offset + uintptr(n)*elemsz) +} + +func unsafeByteSlice(base unsafe.Pointer, offset uintptr, i, j int) []byte { + // See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices + // + // This memory is not allocated from C, but it is unmanaged by Go's + // garbage collector and should behave similarly, and the compiler + // should produce similar code. Note that this conversion allows a + // subslice to begin after the base address, with an optional offset, + // while the URL above does not cover this case and only slices from + // index 0. However, the wiki never says that the address must be to + // the beginning of a C allocation (or even that malloc was used at + // all), so this is believed to be correct. + return (*[maxAllocSize]byte)(unsafeAdd(base, offset))[i:j:j] +} + +// unsafeSlice modifies the data, len, and cap of a slice variable pointed to by +// the slice parameter. This helper should be used over other direct +// manipulation of reflect.SliceHeader to prevent misuse, namely, converting +// from reflect.SliceHeader to a Go slice type. +func unsafeSlice(slice, data unsafe.Pointer, len int) { + s := (*reflect.SliceHeader)(slice) + s.Data = uintptr(data) + s.Cap = len + s.Len = len +} diff --git a/vendor/modules.txt b/vendor/modules.txt index fc4448bbf..3a09a263b 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -59,7 +59,6 @@ github.com/PuerkitoBio/purell # github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 github.com/PuerkitoBio/urlesc # github.com/RoaringBitmap/roaring v0.4.23 -## explicit github.com/RoaringBitmap/roaring # github.com/alecthomas/chroma v0.8.0 ## explicit @@ -106,7 +105,7 @@ github.com/asaskevich/govalidator github.com/aymerick/douceur/css # github.com/beorn7/perks v1.0.1 github.com/beorn7/perks/quantile -# github.com/blevesearch/bleve v1.0.7 +# github.com/blevesearch/bleve v1.0.10 ## explicit github.com/blevesearch/bleve github.com/blevesearch/bleve/analysis @@ -156,10 +155,14 @@ github.com/blevesearch/segment # github.com/blevesearch/snowballstem v0.9.0 github.com/blevesearch/snowballstem github.com/blevesearch/snowballstem/english -# github.com/blevesearch/zap/v11 v11.0.7 +# github.com/blevesearch/zap/v11 v11.0.10 github.com/blevesearch/zap/v11 -# github.com/blevesearch/zap/v12 v12.0.7 +# github.com/blevesearch/zap/v12 v12.0.10 github.com/blevesearch/zap/v12 +# github.com/blevesearch/zap/v13 v13.0.2 +github.com/blevesearch/zap/v13 +# github.com/blevesearch/zap/v14 v14.0.1 +github.com/blevesearch/zap/v14 # github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc github.com/boombuler/barcode github.com/boombuler/barcode/qr @@ -175,7 +178,7 @@ github.com/couchbase/gomemcached/client # github.com/couchbase/goutils v0.0.0-20191018232750-b49639060d85 github.com/couchbase/goutils/logging github.com/couchbase/goutils/scramsha -# github.com/couchbase/vellum v1.0.1 +# github.com/couchbase/vellum v1.0.2 github.com/couchbase/vellum github.com/couchbase/vellum/levenshtein github.com/couchbase/vellum/regexp @@ -755,7 +758,7 @@ github.com/yuin/goldmark-highlighting # github.com/yuin/goldmark-meta v0.0.0-20191126180153-f0638e958b60 ## explicit github.com/yuin/goldmark-meta -# go.etcd.io/bbolt v1.3.4 +# go.etcd.io/bbolt v1.3.5 go.etcd.io/bbolt # go.mongodb.org/mongo-driver v1.3.5 go.mongodb.org/mongo-driver/bson