Update bleve dependency to latest master revision (#6100)
* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2 * remove unused pkg from dep file * change bleve from master to recent revisionrelease/v1.8
parent
11e316654e
commit
a380cfd8e0
@ -1,22 +0,0 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Stephen Merity
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
@ -1,229 +0,0 @@
|
||||
package govarint
|
||||
|
||||
import "encoding/binary"
|
||||
import "io"
|
||||
|
||||
type U32VarintEncoder interface {
|
||||
PutU32(x uint32) int
|
||||
Close()
|
||||
}
|
||||
|
||||
type U32VarintDecoder interface {
|
||||
GetU32() (uint32, error)
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
type U64VarintEncoder interface {
|
||||
PutU64(x uint64) int
|
||||
Close()
|
||||
}
|
||||
|
||||
type U64VarintDecoder interface {
|
||||
GetU64() (uint64, error)
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
type U32GroupVarintEncoder struct {
|
||||
w io.Writer
|
||||
index int
|
||||
store [4]uint32
|
||||
temp [17]byte
|
||||
}
|
||||
|
||||
func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder { return &U32GroupVarintEncoder{w: w} }
|
||||
|
||||
func (b *U32GroupVarintEncoder) Flush() (int, error) {
|
||||
// TODO: Is it more efficient to have a tailored version that's called only in Close()?
|
||||
// If index is zero, there are no integers to flush
|
||||
if b.index == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
// In the case we're flushing (the group isn't of size four), the non-values should be zero
|
||||
// This ensures the unused entries are all zero in the sizeByte
|
||||
for i := b.index; i < 4; i++ {
|
||||
b.store[i] = 0
|
||||
}
|
||||
length := 1
|
||||
// We need to reset the size byte to zero as we only bitwise OR into it, we don't overwrite it
|
||||
b.temp[0] = 0
|
||||
for i, x := range b.store {
|
||||
size := byte(0)
|
||||
shifts := []byte{24, 16, 8, 0}
|
||||
for _, shift := range shifts {
|
||||
// Always writes at least one byte -- the first one (shift = 0)
|
||||
// Will write more bytes until the rest of the integer is all zeroes
|
||||
if (x>>shift) != 0 || shift == 0 {
|
||||
size += 1
|
||||
b.temp[length] = byte(x >> shift)
|
||||
length += 1
|
||||
}
|
||||
}
|
||||
// We store the size in two of the eight bits in the first byte (sizeByte)
|
||||
// 0 means there is one byte in total, hence why we subtract one from size
|
||||
b.temp[0] |= (size - 1) << (uint8(3-i) * 2)
|
||||
}
|
||||
// If we're flushing without a full group of four, remove the unused bytes we computed
|
||||
// This enables us to realize it's a partial group on decoding thanks to EOF
|
||||
if b.index != 4 {
|
||||
length -= 4 - b.index
|
||||
}
|
||||
_, err := b.w.Write(b.temp[:length])
|
||||
return length, err
|
||||
}
|
||||
|
||||
func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) {
|
||||
bytesWritten := 0
|
||||
b.store[b.index] = x
|
||||
b.index += 1
|
||||
if b.index == 4 {
|
||||
n, err := b.Flush()
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
bytesWritten += n
|
||||
b.index = 0
|
||||
}
|
||||
return bytesWritten, nil
|
||||
}
|
||||
|
||||
func (b *U32GroupVarintEncoder) Close() {
|
||||
// On Close, we flush any remaining values that might not have been in a full group
|
||||
b.Flush()
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
type U32GroupVarintDecoder struct {
|
||||
r io.ByteReader
|
||||
group [4]uint32
|
||||
pos int
|
||||
finished bool
|
||||
capacity int
|
||||
}
|
||||
|
||||
func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder {
|
||||
return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4}
|
||||
}
|
||||
|
||||
func (b *U32GroupVarintDecoder) getGroup() error {
|
||||
// We should always receive a sizeByte if there are more values to read
|
||||
sizeByte, err := b.r.ReadByte()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// Calculate the size of the four incoming 32 bit integers
|
||||
// 0b00 means 1 byte to read, 0b01 = 2, etc
|
||||
b.group[0] = uint32((sizeByte >> 6) & 3)
|
||||
b.group[1] = uint32((sizeByte >> 4) & 3)
|
||||
b.group[2] = uint32((sizeByte >> 2) & 3)
|
||||
b.group[3] = uint32(sizeByte & 3)
|
||||
//
|
||||
for index, size := range b.group {
|
||||
b.group[index] = 0
|
||||
// Any error that occurs in earlier byte reads should be repeated at the end one
|
||||
// Hence we only catch and report the final ReadByte's error
|
||||
var err error
|
||||
switch size {
|
||||
case 0:
|
||||
var x byte
|
||||
x, err = b.r.ReadByte()
|
||||
b.group[index] = uint32(x)
|
||||
case 1:
|
||||
var x, y byte
|
||||
x, _ = b.r.ReadByte()
|
||||
y, err = b.r.ReadByte()
|
||||
b.group[index] = uint32(x)<<8 | uint32(y)
|
||||
case 2:
|
||||
var x, y, z byte
|
||||
x, _ = b.r.ReadByte()
|
||||
y, _ = b.r.ReadByte()
|
||||
z, err = b.r.ReadByte()
|
||||
b.group[index] = uint32(x)<<16 | uint32(y)<<8 | uint32(z)
|
||||
case 3:
|
||||
var x, y, z, zz byte
|
||||
x, _ = b.r.ReadByte()
|
||||
y, _ = b.r.ReadByte()
|
||||
z, _ = b.r.ReadByte()
|
||||
zz, err = b.r.ReadByte()
|
||||
b.group[index] = uint32(x)<<24 | uint32(y)<<16 | uint32(z)<<8 | uint32(zz)
|
||||
}
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
// If we hit EOF here, we have found a partial group
|
||||
// We've return any valid entries we have read and return EOF once we run out
|
||||
b.capacity = index
|
||||
b.finished = true
|
||||
break
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
// Reset the pos pointer to the beginning of the read values
|
||||
b.pos = 0
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *U32GroupVarintDecoder) GetU32() (uint32, error) {
|
||||
// Check if we have any more values to give out - if not, let's get them
|
||||
if b.pos == b.capacity {
|
||||
// If finished is set, there is nothing else to do
|
||||
if b.finished {
|
||||
return 0, io.EOF
|
||||
}
|
||||
err := b.getGroup()
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
// Increment pointer and return the value stored at that point
|
||||
b.pos += 1
|
||||
return b.group[b.pos-1], nil
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
type Base128Encoder struct {
|
||||
w io.Writer
|
||||
tmpBytes []byte
|
||||
}
|
||||
|
||||
func NewU32Base128Encoder(w io.Writer) *Base128Encoder {
|
||||
return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen32)}
|
||||
}
|
||||
func NewU64Base128Encoder(w io.Writer) *Base128Encoder {
|
||||
return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)}
|
||||
}
|
||||
|
||||
func (b *Base128Encoder) PutU32(x uint32) (int, error) {
|
||||
writtenBytes := binary.PutUvarint(b.tmpBytes, uint64(x))
|
||||
return b.w.Write(b.tmpBytes[:writtenBytes])
|
||||
}
|
||||
|
||||
func (b *Base128Encoder) PutU64(x uint64) (int, error) {
|
||||
writtenBytes := binary.PutUvarint(b.tmpBytes, x)
|
||||
return b.w.Write(b.tmpBytes[:writtenBytes])
|
||||
}
|
||||
|
||||
func (b *Base128Encoder) Close() {
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
type Base128Decoder struct {
|
||||
r io.ByteReader
|
||||
}
|
||||
|
||||
func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} }
|
||||
func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} }
|
||||
|
||||
func (b *Base128Decoder) GetU32() (uint32, error) {
|
||||
v, err := binary.ReadUvarint(b.r)
|
||||
return uint32(v), err
|
||||
}
|
||||
|
||||
func (b *Base128Decoder) GetU64() (uint64, error) {
|
||||
return binary.ReadUvarint(b.r)
|
||||
}
|
@ -0,0 +1,174 @@
|
||||
// The code here was obtained from:
|
||||
// https://github.com/mmcloughlin/geohash
|
||||
|
||||
// The MIT License (MIT)
|
||||
// Copyright (c) 2015 Michael McLoughlin
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
package geo
|
||||
|
||||
import (
|
||||
"math"
|
||||
)
|
||||
|
||||
// encoding encapsulates an encoding defined by a given base32 alphabet.
|
||||
type encoding struct {
|
||||
enc string
|
||||
dec [256]byte
|
||||
}
|
||||
|
||||
// newEncoding constructs a new encoding defined by the given alphabet,
|
||||
// which must be a 32-byte string.
|
||||
func newEncoding(encoder string) *encoding {
|
||||
e := new(encoding)
|
||||
e.enc = encoder
|
||||
for i := 0; i < len(e.dec); i++ {
|
||||
e.dec[i] = 0xff
|
||||
}
|
||||
for i := 0; i < len(encoder); i++ {
|
||||
e.dec[encoder[i]] = byte(i)
|
||||
}
|
||||
return e
|
||||
}
|
||||
|
||||
// Decode string into bits of a 64-bit word. The string s may be at most 12
|
||||
// characters.
|
||||
func (e *encoding) decode(s string) uint64 {
|
||||
x := uint64(0)
|
||||
for i := 0; i < len(s); i++ {
|
||||
x = (x << 5) | uint64(e.dec[s[i]])
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
// Encode bits of 64-bit word into a string.
|
||||
func (e *encoding) encode(x uint64) string {
|
||||
b := [12]byte{}
|
||||
for i := 0; i < 12; i++ {
|
||||
b[11-i] = e.enc[x&0x1f]
|
||||
x >>= 5
|
||||
}
|
||||
return string(b[:])
|
||||
}
|
||||
|
||||
// Base32Encoding with the Geohash alphabet.
|
||||
var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz")
|
||||
|
||||
// BoundingBox returns the region encoded by the given string geohash.
|
||||
func geoBoundingBox(hash string) geoBox {
|
||||
bits := uint(5 * len(hash))
|
||||
inthash := base32encoding.decode(hash)
|
||||
return geoBoundingBoxIntWithPrecision(inthash, bits)
|
||||
}
|
||||
|
||||
// Box represents a rectangle in latitude/longitude space.
|
||||
type geoBox struct {
|
||||
minLat float64
|
||||
maxLat float64
|
||||
minLng float64
|
||||
maxLng float64
|
||||
}
|
||||
|
||||
// Round returns a point inside the box, making an effort to round to minimal
|
||||
// precision.
|
||||
func (b geoBox) round() (lat, lng float64) {
|
||||
x := maxDecimalPower(b.maxLat - b.minLat)
|
||||
lat = math.Ceil(b.minLat/x) * x
|
||||
x = maxDecimalPower(b.maxLng - b.minLng)
|
||||
lng = math.Ceil(b.minLng/x) * x
|
||||
return
|
||||
}
|
||||
|
||||
// precalculated for performance
|
||||
var exp232 = math.Exp2(32)
|
||||
|
||||
// errorWithPrecision returns the error range in latitude and longitude for in
|
||||
// integer geohash with bits of precision.
|
||||
func errorWithPrecision(bits uint) (latErr, lngErr float64) {
|
||||
b := int(bits)
|
||||
latBits := b / 2
|
||||
lngBits := b - latBits
|
||||
latErr = math.Ldexp(180.0, -latBits)
|
||||
lngErr = math.Ldexp(360.0, -lngBits)
|
||||
return
|
||||
}
|
||||
|
||||
// minDecimalPlaces returns the minimum number of decimal places such that
|
||||
// there must exist an number with that many places within any range of width
|
||||
// r. This is intended for returning minimal precision coordinates inside a
|
||||
// box.
|
||||
func maxDecimalPower(r float64) float64 {
|
||||
m := int(math.Floor(math.Log10(r)))
|
||||
return math.Pow10(m)
|
||||
}
|
||||
|
||||
// Encode the position of x within the range -r to +r as a 32-bit integer.
|
||||
func encodeRange(x, r float64) uint32 {
|
||||
p := (x + r) / (2 * r)
|
||||
return uint32(p * exp232)
|
||||
}
|
||||
|
||||
// Decode the 32-bit range encoding X back to a value in the range -r to +r.
|
||||
func decodeRange(X uint32, r float64) float64 {
|
||||
p := float64(X) / exp232
|
||||
x := 2*r*p - r
|
||||
return x
|
||||
}
|
||||
|
||||
// Squash the even bitlevels of X into a 32-bit word. Odd bitlevels of X are
|
||||
// ignored, and may take any value.
|
||||
func squash(X uint64) uint32 {
|
||||
X &= 0x5555555555555555
|
||||
X = (X | (X >> 1)) & 0x3333333333333333
|
||||
X = (X | (X >> 2)) & 0x0f0f0f0f0f0f0f0f
|
||||
X = (X | (X >> 4)) & 0x00ff00ff00ff00ff
|
||||
X = (X | (X >> 8)) & 0x0000ffff0000ffff
|
||||
X = (X | (X >> 16)) & 0x00000000ffffffff
|
||||
return uint32(X)
|
||||
}
|
||||
|
||||
// Deinterleave the bits of X into 32-bit words containing the even and odd
|
||||
// bitlevels of X, respectively.
|
||||
func deinterleave(X uint64) (uint32, uint32) {
|
||||
return squash(X), squash(X >> 1)
|
||||
}
|
||||
|
||||
// BoundingBoxIntWithPrecision returns the region encoded by the integer
|
||||
// geohash with the specified precision.
|
||||
func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox {
|
||||
fullHash := hash << (64 - bits)
|
||||
latInt, lngInt := deinterleave(fullHash)
|
||||
lat := decodeRange(latInt, 90)
|
||||
lng := decodeRange(lngInt, 180)
|
||||
latErr, lngErr := errorWithPrecision(bits)
|
||||
return geoBox{
|
||||
minLat: lat,
|
||||
maxLat: lat + latErr,
|
||||
minLng: lng,
|
||||
maxLng: lng + lngErr,
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
// Decode the string geohash to a (lat, lng) point.
|
||||
func GeoHashDecode(hash string) (lat, lng float64) {
|
||||
box := geoBoundingBox(hash)
|
||||
return box.round()
|
||||
}
|
@ -0,0 +1,420 @@
|
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package scorch
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
)
|
||||
|
||||
var OptimizeConjunction = true
|
||||
var OptimizeConjunctionUnadorned = true
|
||||
var OptimizeDisjunctionUnadorned = true
|
||||
|
||||
func (s *IndexSnapshotTermFieldReader) Optimize(kind string,
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if OptimizeConjunction && kind == "conjunction" {
|
||||
return s.optimizeConjunction(octx)
|
||||
}
|
||||
|
||||
if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" {
|
||||
return s.optimizeConjunctionUnadorned(octx)
|
||||
}
|
||||
|
||||
if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" {
|
||||
return s.optimizeDisjunctionUnadorned(octx)
|
||||
}
|
||||
|
||||
return octx, nil
|
||||
}
|
||||
|
||||
var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
func (s *IndexSnapshotTermFieldReader) optimizeConjunction(
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if octx == nil {
|
||||
octx = &OptimizeTFRConjunction{snapshot: s.snapshot}
|
||||
}
|
||||
|
||||
o, ok := octx.(*OptimizeTFRConjunction)
|
||||
if !ok {
|
||||
return octx, nil
|
||||
}
|
||||
|
||||
if o.snapshot != s.snapshot {
|
||||
return nil, fmt.Errorf("tried to optimize conjunction across different snapshots")
|
||||
}
|
||||
|
||||
o.tfrs = append(o.tfrs, s)
|
||||
|
||||
return o, nil
|
||||
}
|
||||
|
||||
type OptimizeTFRConjunction struct {
|
||||
snapshot *IndexSnapshot
|
||||
|
||||
tfrs []*IndexSnapshotTermFieldReader
|
||||
}
|
||||
|
||||
func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) {
|
||||
if len(o.tfrs) <= 1 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
for i := range o.snapshot.segment {
|
||||
itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator)
|
||||
if !ok || itr0.ActualBM == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator)
|
||||
if !ok || itr1.ActualBM == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
bm := roaring.And(itr0.ActualBM, itr1.ActualBM)
|
||||
|
||||
for _, tfr := range o.tfrs[2:] {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok || itr.ActualBM == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
bm.And(itr.ActualBM)
|
||||
}
|
||||
|
||||
// in this conjunction optimization, the postings iterators
|
||||
// will all share the same AND'ed together actual bitmap. The
|
||||
// regular conjunction searcher machinery will still be used,
|
||||
// but the underlying bitmap will be smaller.
|
||||
for _, tfr := range o.tfrs {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if ok && itr.ActualBM != nil {
|
||||
itr.ActualBM = bm
|
||||
itr.Actual = bm.Iterator()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// An "unadorned" conjunction optimization is appropriate when
|
||||
// additional or subsidiary information like freq-norm's and
|
||||
// term-vectors are not required, and instead only the internal-id's
|
||||
// are needed.
|
||||
func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned(
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if octx == nil {
|
||||
octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot}
|
||||
}
|
||||
|
||||
o, ok := octx.(*OptimizeTFRConjunctionUnadorned)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if o.snapshot != s.snapshot {
|
||||
return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots")
|
||||
}
|
||||
|
||||
o.tfrs = append(o.tfrs, s)
|
||||
|
||||
return o, nil
|
||||
}
|
||||
|
||||
type OptimizeTFRConjunctionUnadorned struct {
|
||||
snapshot *IndexSnapshot
|
||||
|
||||
tfrs []*IndexSnapshotTermFieldReader
|
||||
}
|
||||
|
||||
var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>")
|
||||
var OptimizeTFRConjunctionUnadornedField = "*"
|
||||
|
||||
// Finish of an unadorned conjunction optimization will compute a
|
||||
// termFieldReader with an "actual" bitmap that represents the
|
||||
// constituent bitmaps AND'ed together. This termFieldReader cannot
|
||||
// provide any freq-norm or termVector associated information.
|
||||
func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) {
|
||||
if len(o.tfrs) <= 1 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// We use an artificial term and field because the optimized
|
||||
// termFieldReader can represent multiple terms and fields.
|
||||
oTFR := &IndexSnapshotTermFieldReader{
|
||||
term: OptimizeTFRConjunctionUnadornedTerm,
|
||||
field: OptimizeTFRConjunctionUnadornedField,
|
||||
snapshot: o.snapshot,
|
||||
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
|
||||
segmentOffset: 0,
|
||||
includeFreq: false,
|
||||
includeNorm: false,
|
||||
includeTermVectors: false,
|
||||
}
|
||||
|
||||
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
|
||||
|
||||
OUTER:
|
||||
for i := range o.snapshot.segment {
|
||||
actualBMs = actualBMs[:0]
|
||||
|
||||
var docNum1HitLast uint64
|
||||
var docNum1HitLastOk bool
|
||||
|
||||
for _, tfr := range o.tfrs {
|
||||
if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok {
|
||||
// An empty postings iterator means the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok {
|
||||
// We optimize zap postings iterators only.
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// If the postings iterator is "1-hit" optimized, then we
|
||||
// can perform several optimizations up-front here.
|
||||
docNum1Hit, ok := itr.DocNum1Hit()
|
||||
if ok {
|
||||
if docNum1Hit == zap.DocNum1HitFinished {
|
||||
// An empty docNum here means the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
if docNum1HitLastOk && docNum1HitLast != docNum1Hit {
|
||||
// The docNum1Hit doesn't match the previous
|
||||
// docNum1HitLast, so the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
docNum1HitLast = docNum1Hit
|
||||
docNum1HitLastOk = true
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
if itr.ActualBM == nil {
|
||||
// An empty actual bitmap means the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
// Collect the actual bitmap for more processing later.
|
||||
actualBMs = append(actualBMs, itr.ActualBM)
|
||||
}
|
||||
|
||||
if docNum1HitLastOk {
|
||||
// We reach here if all the 1-hit optimized posting
|
||||
// iterators had the same 1-hit docNum, so we can check if
|
||||
// our collected actual bitmaps also have that docNum.
|
||||
for _, bm := range actualBMs {
|
||||
if !bm.Contains(uint32(docNum1HitLast)) {
|
||||
// The docNum1Hit isn't in one of our actual
|
||||
// bitmaps, so the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
}
|
||||
|
||||
// The actual bitmaps and docNum1Hits all contain or have
|
||||
// the same 1-hit docNum, so that's our AND'ed result.
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit(
|
||||
docNum1HitLast, zap.NormBits1Hit, false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
if len(actualBMs) == 0 {
|
||||
// If we've collected no actual bitmaps at this point,
|
||||
// then the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
if len(actualBMs) == 1 {
|
||||
// If we've only 1 actual bitmap, then that's our result.
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
|
||||
actualBMs[0], false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
// Else, AND together our collected bitmaps as our result.
|
||||
bm := roaring.And(actualBMs[0], actualBMs[1])
|
||||
|
||||
for _, actualBM := range actualBMs[2:] {
|
||||
bm.And(actualBM)
|
||||
}
|
||||
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
|
||||
bm, false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
return oTFR, nil
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// An "unadorned" disjunction optimization is appropriate when
|
||||
// additional or subsidiary information like freq-norm's and
|
||||
// term-vectors are not required, and instead only the internal-id's
|
||||
// are needed.
|
||||
func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned(
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if octx == nil {
|
||||
octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot}
|
||||
}
|
||||
|
||||
o, ok := octx.(*OptimizeTFRDisjunctionUnadorned)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if o.snapshot != s.snapshot {
|
||||
return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots")
|
||||
}
|
||||
|
||||
o.tfrs = append(o.tfrs, s)
|
||||
|
||||
return o, nil
|
||||
}
|
||||
|
||||
type OptimizeTFRDisjunctionUnadorned struct {
|
||||
snapshot *IndexSnapshot
|
||||
|
||||
tfrs []*IndexSnapshotTermFieldReader
|
||||
}
|
||||
|
||||
var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>")
|
||||
var OptimizeTFRDisjunctionUnadornedField = "*"
|
||||
|
||||
// Finish of an unadorned disjunction optimization will compute a
|
||||
// termFieldReader with an "actual" bitmap that represents the
|
||||
// constituent bitmaps OR'ed together. This termFieldReader cannot
|
||||
// provide any freq-norm or termVector associated information.
|
||||
func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) {
|
||||
if len(o.tfrs) <= 1 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
for i := range o.snapshot.segment {
|
||||
var cMax uint64
|
||||
|
||||
for _, tfr := range o.tfrs {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if itr.ActualBM != nil {
|
||||
c := itr.ActualBM.GetCardinality()
|
||||
if cMax < c {
|
||||
cMax = c
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Heuristic to skip the optimization if all the constituent
|
||||
// bitmaps are too small, where the processing & resource
|
||||
// overhead to create the OR'ed bitmap outweighs the benefit.
|
||||
if cMax < OptimizeDisjunctionUnadornedMinChildCardinality {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
// We use an artificial term and field because the optimized
|
||||
// termFieldReader can represent multiple terms and fields.
|
||||
oTFR := &IndexSnapshotTermFieldReader{
|
||||
term: OptimizeTFRDisjunctionUnadornedTerm,
|
||||
field: OptimizeTFRDisjunctionUnadornedField,
|
||||
snapshot: o.snapshot,
|
||||
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
|
||||
segmentOffset: 0,
|
||||
includeFreq: false,
|
||||
includeNorm: false,
|
||||
includeTermVectors: false,
|
||||
}
|
||||
|
||||
var docNums []uint32 // Collected docNum's from 1-hit posting lists.
|
||||
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
|
||||
|
||||
for i := range o.snapshot.segment {
|
||||
docNums = docNums[:0]
|
||||
actualBMs = actualBMs[:0]
|
||||
|
||||
for _, tfr := range o.tfrs {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
docNum, ok := itr.DocNum1Hit()
|
||||
if ok {
|
||||
docNums = append(docNums, uint32(docNum))
|
||||
continue
|
||||
}
|
||||
|
||||
if itr.ActualBM != nil {
|
||||
actualBMs = append(actualBMs, itr.ActualBM)
|
||||
}
|
||||
}
|
||||
|
||||
var bm *roaring.Bitmap
|
||||
if len(actualBMs) > 2 {
|
||||
bm = roaring.HeapOr(actualBMs...)
|
||||
} else if len(actualBMs) == 2 {
|
||||
bm = roaring.Or(actualBMs[0], actualBMs[1])
|
||||
} else if len(actualBMs) == 1 {
|
||||
bm = actualBMs[0].Clone()
|
||||
}
|
||||
|
||||
if bm == nil {
|
||||
bm = roaring.New()
|
||||
}
|
||||
|
||||
bm.AddMany(docNums)
|
||||
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
return oTFR, nil
|
||||
}
|
@ -1,110 +0,0 @@
|
||||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package scorch
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
)
|
||||
|
||||
type Reader struct {
|
||||
root *IndexSnapshot // Owns 1 ref-count on the index snapshot.
|
||||
}
|
||||
|
||||
func (r *Reader) TermFieldReader(term []byte, field string, includeFreq,
|
||||
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
|
||||
return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors)
|
||||
}
|
||||
|
||||
// DocIDReader returns an iterator over all doc ids
|
||||
// The caller must close returned instance to release associated resources.
|
||||
func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) {
|
||||
return r.root.DocIDReaderAll()
|
||||
}
|
||||
|
||||
func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
|
||||
return r.root.DocIDReaderOnly(ids)
|
||||
}
|
||||
|
||||
func (r *Reader) FieldDict(field string) (index.FieldDict, error) {
|
||||
return r.root.FieldDict(field)
|
||||
}
|
||||
|
||||
// FieldDictRange is currently defined to include the start and end terms
|
||||
func (r *Reader) FieldDictRange(field string, startTerm []byte,
|
||||
endTerm []byte) (index.FieldDict, error) {
|
||||
return r.root.FieldDictRange(field, startTerm, endTerm)
|
||||
}
|
||||
|
||||
func (r *Reader) FieldDictPrefix(field string,
|
||||
termPrefix []byte) (index.FieldDict, error) {
|
||||
return r.root.FieldDictPrefix(field, termPrefix)
|
||||
}
|
||||
|
||||
func (r *Reader) Document(id string) (*document.Document, error) {
|
||||
return r.root.Document(id)
|
||||
}
|
||||
func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string,
|
||||
visitor index.DocumentFieldTermVisitor) error {
|
||||
return r.root.DocumentVisitFieldTerms(id, fields, visitor)
|
||||
}
|
||||
|
||||
func (r *Reader) Fields() ([]string, error) {
|
||||
return r.root.Fields()
|
||||
}
|
||||
|
||||
func (r *Reader) GetInternal(key []byte) ([]byte, error) {
|
||||
return r.root.GetInternal(key)
|
||||
}
|
||||
|
||||
func (r *Reader) DocCount() (uint64, error) {
|
||||
return r.root.DocCount()
|
||||
}
|
||||
|
||||
func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) {
|
||||
return r.root.ExternalID(id)
|
||||
}
|
||||
|
||||
func (r *Reader) InternalID(id string) (index.IndexInternalID, error) {
|
||||
return r.root.InternalID(id)
|
||||
}
|
||||
|
||||
func (r *Reader) DumpAll() chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (r *Reader) DumpDoc(id string) chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (r *Reader) DumpFields() chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (r *Reader) Close() error {
|
||||
return r.root.DecRef()
|
||||
}
|
@ -1,321 +0,0 @@
|
||||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
)
|
||||
|
||||
// NewFromAnalyzedDocs places the analyzed document mutations into a new segment
|
||||
func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
|
||||
s := New()
|
||||
|
||||
// ensure that _id field get fieldID 0
|
||||
s.getOrDefineField("_id")
|
||||
|
||||
// fill Dicts/DictKeys and preallocate memory
|
||||
s.initializeDict(results)
|
||||
|
||||
// walk each doc
|
||||
for _, result := range results {
|
||||
s.processDocument(result)
|
||||
}
|
||||
|
||||
// go back and sort the dictKeys
|
||||
for _, dict := range s.DictKeys {
|
||||
sort.Strings(dict)
|
||||
}
|
||||
|
||||
// compute memory usage of segment
|
||||
s.updateSizeInBytes()
|
||||
|
||||
// professional debugging
|
||||
//
|
||||
// log.Printf("fields: %v\n", s.FieldsMap)
|
||||
// log.Printf("fieldsInv: %v\n", s.FieldsInv)
|
||||
// log.Printf("fieldsLoc: %v\n", s.FieldsLoc)
|
||||
// log.Printf("dicts: %v\n", s.Dicts)
|
||||
// log.Printf("dict keys: %v\n", s.DictKeys)
|
||||
// for i, posting := range s.Postings {
|
||||
// log.Printf("posting %d: %v\n", i, posting)
|
||||
// }
|
||||
// for i, freq := range s.Freqs {
|
||||
// log.Printf("freq %d: %v\n", i, freq)
|
||||
// }
|
||||
// for i, norm := range s.Norms {
|
||||
// log.Printf("norm %d: %v\n", i, norm)
|
||||
// }
|
||||
// for i, field := range s.Locfields {
|
||||
// log.Printf("field %d: %v\n", i, field)
|
||||
// }
|
||||
// for i, start := range s.Locstarts {
|
||||
// log.Printf("start %d: %v\n", i, start)
|
||||
// }
|
||||
// for i, end := range s.Locends {
|
||||
// log.Printf("end %d: %v\n", i, end)
|
||||
// }
|
||||
// for i, pos := range s.Locpos {
|
||||
// log.Printf("pos %d: %v\n", i, pos)
|
||||
// }
|
||||
// for i, apos := range s.Locarraypos {
|
||||
// log.Printf("apos %d: %v\n", i, apos)
|
||||
// }
|
||||
// log.Printf("stored: %v\n", s.Stored)
|
||||
// log.Printf("stored types: %v\n", s.StoredTypes)
|
||||
// log.Printf("stored pos: %v\n", s.StoredPos)
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// fill Dicts/DictKeys and preallocate memory for postings
|
||||
func (s *Segment) initializeDict(results []*index.AnalysisResult) {
|
||||
var numPostingsLists int
|
||||
|
||||
numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
|
||||
numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
|
||||
|
||||
var numTokenFrequencies int
|
||||
var totLocs int
|
||||
|
||||
// initial scan for all fieldID's to sort them
|
||||
for _, result := range results {
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
for _, field := range result.Document.Fields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
}
|
||||
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||
s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
|
||||
for fieldID, fieldName := range s.FieldsInv {
|
||||
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
||||
}
|
||||
|
||||
processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
|
||||
for term, tf := range tfs {
|
||||
pidPlus1, exists := s.Dicts[fieldID][term]
|
||||
if !exists {
|
||||
numPostingsLists++
|
||||
pidPlus1 = uint64(numPostingsLists)
|
||||
s.Dicts[fieldID][term] = pidPlus1
|
||||
s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
|
||||
numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
|
||||
numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
|
||||
}
|
||||
pid := pidPlus1 - 1
|
||||
numTermsPerPostingsList[pid] += 1
|
||||
numLocsPerPostingsList[pid] += len(tf.Locations)
|
||||
totLocs += len(tf.Locations)
|
||||
}
|
||||
numTokenFrequencies += len(tfs)
|
||||
}
|
||||
|
||||
for _, result := range results {
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
_, tf := field.Analyze()
|
||||
processField(fieldID, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
tf := result.Analyzed[i]
|
||||
processField(fieldID, tf)
|
||||
}
|
||||
}
|
||||
|
||||
s.Postings = make([]*roaring.Bitmap, numPostingsLists)
|
||||
for i := 0; i < numPostingsLists; i++ {
|
||||
s.Postings[i] = roaring.New()
|
||||
}
|
||||
s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists)
|
||||
for i := 0; i < numPostingsLists; i++ {
|
||||
s.PostingsLocs[i] = roaring.New()
|
||||
}
|
||||
|
||||
// Preallocate big, contiguous backing arrays.
|
||||
auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos.
|
||||
uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos.
|
||||
float32Backing := make([]float32, numTokenFrequencies) // For sub-Norms.
|
||||
uint16Backing := make([]uint16, totLocs) // For sub-Locfields.
|
||||
|
||||
// Point top-level slices to the backing arrays.
|
||||
s.Freqs = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Norms = make([][]float32, numPostingsLists)
|
||||
|
||||
s.Locfields = make([][]uint16, numPostingsLists)
|
||||
|
||||
s.Locstarts = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Locends = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Locpos = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Locarraypos = make([][][]uint64, numPostingsLists)
|
||||
|
||||
// Point sub-slices to the backing arrays.
|
||||
for pid, numTerms := range numTermsPerPostingsList {
|
||||
s.Freqs[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numTerms:]
|
||||
|
||||
s.Norms[pid] = float32Backing[0:0]
|
||||
float32Backing = float32Backing[numTerms:]
|
||||
}
|
||||
|
||||
for pid, numLocs := range numLocsPerPostingsList {
|
||||
s.Locfields[pid] = uint16Backing[0:0]
|
||||
uint16Backing = uint16Backing[numLocs:]
|
||||
|
||||
s.Locstarts[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numLocs:]
|
||||
|
||||
s.Locends[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numLocs:]
|
||||
|
||||
s.Locpos[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numLocs:]
|
||||
|
||||
s.Locarraypos[pid] = auint64Backing[0:0]
|
||||
auint64Backing = auint64Backing[numLocs:]
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Segment) processDocument(result *index.AnalysisResult) {
|
||||
// used to collate information across fields
|
||||
docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap))
|
||||
fieldLens := make(map[uint16]int, len(s.FieldsMap))
|
||||
|
||||
docNum := uint64(s.addDocument())
|
||||
|
||||
processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) {
|
||||
fieldLens[field] += l
|
||||
if existingFreqs, ok := docMap[field]; ok {
|
||||
existingFreqs.MergeAll(name, tf)
|
||||
} else {
|
||||
docMap[field] = tf
|
||||
}
|
||||
}
|
||||
|
||||
storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) {
|
||||
s.Stored[docNum][field] = append(s.Stored[docNum][field], val)
|
||||
s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ)
|
||||
s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos)
|
||||
}
|
||||
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
l, tf := field.Analyze()
|
||||
processField(fieldID, field.Name(), l, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
l := result.Length[i]
|
||||
tf := result.Analyzed[i]
|
||||
processField(fieldID, field.Name(), l, tf)
|
||||
if field.Options().IsStored() {
|
||||
storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions())
|
||||
}
|
||||
|
||||
if field.Options().IncludeDocValues() {
|
||||
s.DocValueFields[fieldID] = true
|
||||
}
|
||||
}
|
||||
|
||||
// now that its been rolled up into docMap, walk that
|
||||
for fieldID, tokenFrequencies := range docMap {
|
||||
for term, tokenFreq := range tokenFrequencies {
|
||||
pid := s.Dicts[fieldID][term] - 1
|
||||
bs := s.Postings[pid]
|
||||
bs.AddInt(int(docNum))
|
||||
s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))
|
||||
s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
|
||||
locationBS := s.PostingsLocs[pid]
|
||||
if len(tokenFreq.Locations) > 0 {
|
||||
locationBS.AddInt(int(docNum))
|
||||
for _, loc := range tokenFreq.Locations {
|
||||
var locf = fieldID
|
||||
if loc.Field != "" {
|
||||
locf = uint16(s.getOrDefineField(loc.Field))
|
||||
}
|
||||
s.Locfields[pid] = append(s.Locfields[pid], locf)
|
||||
s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start))
|
||||
s.Locends[pid] = append(s.Locends[pid], uint64(loc.End))
|
||||
s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position))
|
||||
if len(loc.ArrayPositions) > 0 {
|
||||
s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions)
|
||||
} else {
|
||||
s.Locarraypos[pid] = append(s.Locarraypos[pid], nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Segment) getOrDefineField(name string) int {
|
||||
fieldIDPlus1, ok := s.FieldsMap[name]
|
||||
if !ok {
|
||||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
|
||||
s.FieldsMap[name] = fieldIDPlus1
|
||||
s.FieldsInv = append(s.FieldsInv, name)
|
||||
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
||||
s.DictKeys = append(s.DictKeys, make([]string, 0))
|
||||
}
|
||||
return int(fieldIDPlus1 - 1)
|
||||
}
|
||||
|
||||
func (s *Segment) addDocument() int {
|
||||
docNum := len(s.Stored)
|
||||
s.Stored = append(s.Stored, map[uint16][][]byte{})
|
||||
s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{})
|
||||
s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{})
|
||||
return docNum
|
||||
}
|
||||
|
||||
func encodeFieldType(f document.Field) byte {
|
||||
fieldType := byte('x')
|
||||
switch f.(type) {
|
||||
case *document.TextField:
|
||||
fieldType = 't'
|
||||
case *document.NumericField:
|
||||
fieldType = 'n'
|
||||
case *document.DateTimeField:
|
||||
fieldType = 'd'
|
||||
case *document.BooleanField:
|
||||
fieldType = 'b'
|
||||
case *document.GeoPointField:
|
||||
fieldType = 'g'
|
||||
case *document.CompositeField:
|
||||
fieldType = 'c'
|
||||
}
|
||||
return fieldType
|
||||
}
|
@ -1,103 +0,0 @@
|
||||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// Dictionary is the in-memory representation of the term dictionary
|
||||
type Dictionary struct {
|
||||
segment *Segment
|
||||
field string
|
||||
fieldID uint16
|
||||
}
|
||||
|
||||
// PostingsList returns the postings list for the specified term
|
||||
func (d *Dictionary) PostingsList(term string,
|
||||
except *roaring.Bitmap) (segment.PostingsList, error) {
|
||||
return &PostingsList{
|
||||
dictionary: d,
|
||||
term: term,
|
||||
postingsID: d.segment.Dicts[d.fieldID][term],
|
||||
except: except,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Iterator returns an iterator for this dictionary
|
||||
func (d *Dictionary) Iterator() segment.DictionaryIterator {
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
}
|
||||
}
|
||||
|
||||
// PrefixIterator returns an iterator which only visits terms having the
|
||||
// the specified prefix
|
||||
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
|
||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix)
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
prefix: prefix,
|
||||
offset: offset,
|
||||
}
|
||||
}
|
||||
|
||||
// RangeIterator returns an iterator which only visits terms between the
|
||||
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
|
||||
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
|
||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start)
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
offset: offset,
|
||||
end: end,
|
||||
}
|
||||
}
|
||||
|
||||
// DictionaryIterator is an iterator for term dictionary
|
||||
type DictionaryIterator struct {
|
||||
d *Dictionary
|
||||
prefix string
|
||||
end string
|
||||
offset int
|
||||
|
||||
dictEntry index.DictEntry // reused across Next()'s
|
||||
}
|
||||
|
||||
// Next returns the next entry in the dictionary
|
||||
func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
|
||||
if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 {
|
||||
return nil, nil
|
||||
}
|
||||
next := d.d.segment.DictKeys[d.d.fieldID][d.offset]
|
||||
// check prefix
|
||||
if d.prefix != "" && !strings.HasPrefix(next, d.prefix) {
|
||||
return nil, nil
|
||||
}
|
||||
// check end (bleve.index API demands inclusive end)
|
||||
if d.end != "" && next > d.end {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
d.offset++
|
||||
postingID := d.d.segment.Dicts[d.d.fieldID][next]
|
||||
d.dictEntry.Term = next
|
||||
d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality()
|
||||
return &d.dictEntry, nil
|
||||
}
|
@ -1,178 +0,0 @@
|
||||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// PostingsList is an in-memory represenation of a postings list
|
||||
type PostingsList struct {
|
||||
dictionary *Dictionary
|
||||
term string
|
||||
postingsID uint64
|
||||
except *roaring.Bitmap
|
||||
}
|
||||
|
||||
// Count returns the number of items on this postings list
|
||||
func (p *PostingsList) Count() uint64 {
|
||||
var rv uint64
|
||||
if p.postingsID > 0 {
|
||||
rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality()
|
||||
if p.except != nil {
|
||||
except := p.except.GetCardinality()
|
||||
if except > rv {
|
||||
// avoid underflow
|
||||
except = rv
|
||||
}
|
||||
rv -= except
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// Iterator returns an iterator for this postings list
|
||||
func (p *PostingsList) Iterator() segment.PostingsIterator {
|
||||
rv := &PostingsIterator{
|
||||
postings: p,
|
||||
}
|
||||
if p.postingsID > 0 {
|
||||
allbits := p.dictionary.segment.Postings[p.postingsID-1]
|
||||
rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1]
|
||||
rv.all = allbits.Iterator()
|
||||
if p.except != nil {
|
||||
allExcept := allbits.Clone()
|
||||
allExcept.AndNot(p.except)
|
||||
rv.actual = allExcept.Iterator()
|
||||
} else {
|
||||
rv.actual = allbits.Iterator()
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// PostingsIterator provides a way to iterate through the postings list
|
||||
type PostingsIterator struct {
|
||||
postings *PostingsList
|
||||
all roaring.IntIterable
|
||||
locations *roaring.Bitmap
|
||||
offset int
|
||||
locoffset int
|
||||
actual roaring.IntIterable
|
||||
}
|
||||
|
||||
// Next returns the next posting on the postings list, or nil at the end
|
||||
func (i *PostingsIterator) Next() (segment.Posting, error) {
|
||||
if i.actual == nil || !i.actual.HasNext() {
|
||||
return nil, nil
|
||||
}
|
||||
n := i.actual.Next()
|
||||
allN := i.all.Next()
|
||||
|
||||
// n is the next actual hit (excluding some postings)
|
||||
// allN is the next hit in the full postings
|
||||
// if they don't match, adjust offsets to factor in item we're skipping over
|
||||
// incr the all iterator, and check again
|
||||
for allN != n {
|
||||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
|
||||
i.offset++
|
||||
allN = i.all.Next()
|
||||
}
|
||||
rv := &Posting{
|
||||
iterator: i,
|
||||
docNum: uint64(n),
|
||||
offset: i.offset,
|
||||
locoffset: i.locoffset,
|
||||
hasLoc: i.locations.Contains(n),
|
||||
}
|
||||
|
||||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
|
||||
i.offset++
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// Posting is a single entry in a postings list
|
||||
type Posting struct {
|
||||
iterator *PostingsIterator
|
||||
docNum uint64
|
||||
offset int
|
||||
locoffset int
|
||||
hasLoc bool
|
||||
}
|
||||
|
||||
// Number returns the document number of this posting in this segment
|
||||
func (p *Posting) Number() uint64 {
|
||||
return p.docNum
|
||||
}
|
||||
|
||||
// Frequency returns the frequence of occurance of this term in this doc/field
|
||||
func (p *Posting) Frequency() uint64 {
|
||||
return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset]
|
||||
}
|
||||
|
||||
// Norm returns the normalization factor for this posting
|
||||
func (p *Posting) Norm() float64 {
|
||||
return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset])
|
||||
}
|
||||
|
||||
// Locations returns the location information for each occurance
|
||||
func (p *Posting) Locations() []segment.Location {
|
||||
if !p.hasLoc {
|
||||
return nil
|
||||
}
|
||||
freq := int(p.Frequency())
|
||||
rv := make([]segment.Location, freq)
|
||||
for i := 0; i < freq; i++ {
|
||||
rv[i] = &Location{
|
||||
p: p,
|
||||
offset: p.locoffset + i,
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// Location represents the location of a single occurance
|
||||
type Location struct {
|
||||
p *Posting
|
||||
offset int
|
||||
}
|
||||
|
||||
// Field returns the name of the field (useful in composite fields to know
|
||||
// which original field the value came from)
|
||||
func (l *Location) Field() string {
|
||||
return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]]
|
||||
}
|
||||
|
||||
// Start returns the start byte offset of this occurance
|
||||
func (l *Location) Start() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// End returns the end byte offset of this occurance
|
||||
func (l *Location) End() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// Pos returns the 1-based phrase position of this occurance
|
||||
func (l *Location) Pos() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// ArrayPositions returns the array position vector associated with this occurance
|
||||
func (l *Location) ArrayPositions() []uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
@ -1,289 +0,0 @@
|
||||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// _id field is always guaranteed to have fieldID of 0
|
||||
const idFieldID uint16 = 0
|
||||
|
||||
// KNOWN ISSUES
|
||||
// - LIMITATION - we decided whether or not to store term vectors for a field
|
||||
// at the segment level, based on the first definition of a
|
||||
// field we see. in normal bleve usage this is fine, all
|
||||
// instances of a field definition will be the same. however,
|
||||
// advanced users may violate this and provide unique field
|
||||
// definitions with each document. this segment does not
|
||||
// support this usage.
|
||||
|
||||
// TODO
|
||||
// - need better testing of multiple docs, iterating freqs, locations and
|
||||
// and verifying the correct results are returned
|
||||
|
||||
// Segment is an in memory implementation of scorch.Segment
|
||||
type Segment struct {
|
||||
|
||||
// FieldsMap adds 1 to field id to avoid zero value issues
|
||||
// name -> field id + 1
|
||||
FieldsMap map[string]uint16
|
||||
|
||||
// FieldsInv is the inverse of FieldsMap
|
||||
// field id -> name
|
||||
FieldsInv []string
|
||||
|
||||
// Term dictionaries for each field
|
||||
// field id -> term -> postings list id + 1
|
||||
Dicts []map[string]uint64
|
||||
|
||||
// Terms for each field, where terms are sorted ascending
|
||||
// field id -> []term
|
||||
DictKeys [][]string
|
||||
|
||||
// Postings list
|
||||
// postings list id -> bitmap by docNum
|
||||
Postings []*roaring.Bitmap
|
||||
|
||||
// Postings list has locations
|
||||
PostingsLocs []*roaring.Bitmap
|
||||
|
||||
// Term frequencies
|
||||
// postings list id -> Freqs (one for each hit in bitmap)
|
||||
Freqs [][]uint64
|
||||
|
||||
// Field norms
|
||||
// postings list id -> Norms (one for each hit in bitmap)
|
||||
Norms [][]float32
|
||||
|
||||
// Field/start/end/pos/locarraypos
|
||||
// postings list id -> start/end/pos/locarraypos (one for each freq)
|
||||
Locfields [][]uint16
|
||||
Locstarts [][]uint64
|
||||
Locends [][]uint64
|
||||
Locpos [][]uint64
|
||||
Locarraypos [][][]uint64
|
||||
|
||||
// Stored field values
|
||||
// docNum -> field id -> slice of values (each value []byte)
|
||||
Stored []map[uint16][][]byte
|
||||
|
||||
// Stored field types
|
||||
// docNum -> field id -> slice of types (each type byte)
|
||||
StoredTypes []map[uint16][]byte
|
||||
|
||||
// Stored field array positions
|
||||
// docNum -> field id -> slice of array positions (each is []uint64)
|
||||
StoredPos []map[uint16][][]uint64
|
||||
|
||||
// For storing the docValue persisted fields
|
||||
DocValueFields map[uint16]bool
|
||||
|
||||
// Footprint of the segment, updated when analyzed document mutations
|
||||
// are added into the segment
|
||||
sizeInBytes uint64
|
||||
}
|
||||
|
||||
// New builds a new empty Segment
|
||||
func New() *Segment {
|
||||
return &Segment{
|
||||
FieldsMap: map[string]uint16{},
|
||||
DocValueFields: map[uint16]bool{},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Segment) updateSizeInBytes() {
|
||||
var sizeInBytes uint64
|
||||
|
||||
// FieldsMap, FieldsInv
|
||||
for k, _ := range s.FieldsMap {
|
||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
|
||||
2 /* size of uint16 */)
|
||||
}
|
||||
// overhead from the data structures
|
||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
|
||||
|
||||
// Dicts, DictKeys
|
||||
for _, entry := range s.Dicts {
|
||||
for k, _ := range entry {
|
||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
|
||||
8 /* size of uint64 */)
|
||||
}
|
||||
// overhead from the data structures
|
||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
||||
|
||||
// Postings, PostingsLocs
|
||||
for i := 0; i < len(s.Postings); i++ {
|
||||
sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) +
|
||||
(s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
||||
|
||||
// Freqs, Norms
|
||||
for i := 0; i < len(s.Freqs); i++ {
|
||||
sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ +
|
||||
len(s.Norms[i])*4 /* size of float32 */) +
|
||||
(segment.SizeOfSlice * 2)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
||||
|
||||
// Location data
|
||||
for i := 0; i < len(s.Locfields); i++ {
|
||||
sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ +
|
||||
len(s.Locstarts[i])*8 /* size of uint64 */ +
|
||||
len(s.Locends[i])*8 /* size of uint64 */ +
|
||||
len(s.Locpos[i])*8 /* size of uint64 */)
|
||||
|
||||
for j := 0; j < len(s.Locarraypos[i]); j++ {
|
||||
sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) +
|
||||
segment.SizeOfSlice
|
||||
}
|
||||
|
||||
sizeInBytes += (segment.SizeOfSlice * 5)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 5)
|
||||
|
||||
// Stored data
|
||||
for i := 0; i < len(s.Stored); i++ {
|
||||
for _, v := range s.Stored[i] {
|
||||
sizeInBytes += uint64(2 /* size of uint16 */)
|
||||
for _, arr := range v {
|
||||
sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice
|
||||
}
|
||||
sizeInBytes += segment.SizeOfSlice
|
||||
}
|
||||
|
||||
for _, v := range s.StoredTypes[i] {
|
||||
sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice
|
||||
}
|
||||
|
||||
for _, v := range s.StoredPos[i] {
|
||||
sizeInBytes += uint64(2 /* size of uint16 */)
|
||||
for _, arr := range v {
|
||||
sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) +
|
||||
segment.SizeOfSlice
|
||||
}
|
||||
sizeInBytes += segment.SizeOfSlice
|
||||
}
|
||||
|
||||
// overhead from map(s) within Stored, StoredTypes, StoredPos
|
||||
sizeInBytes += (segment.SizeOfMap * 3)
|
||||
}
|
||||
// overhead from data structures: Stored, StoredTypes, StoredPos
|
||||
sizeInBytes += (segment.SizeOfSlice * 3)
|
||||
|
||||
// DocValueFields
|
||||
sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) +
|
||||
segment.SizeOfMap
|
||||
|
||||
// SizeInBytes
|
||||
sizeInBytes += uint64(8)
|
||||
|
||||
s.sizeInBytes = sizeInBytes
|
||||
}
|
||||
|
||||
func (s *Segment) SizeInBytes() uint64 {
|
||||
return s.sizeInBytes
|
||||
}
|
||||
|
||||
func (s *Segment) AddRef() {
|
||||
}
|
||||
|
||||
func (s *Segment) DecRef() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Fields returns the field names used in this segment
|
||||
func (s *Segment) Fields() []string {
|
||||
return s.FieldsInv
|
||||
}
|
||||
|
||||
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
||||
// for the specified doc number
|
||||
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
||||
// ensure document number exists
|
||||
if int(num) > len(s.Stored)-1 {
|
||||
return nil
|
||||
}
|
||||
docFields := s.Stored[int(num)]
|
||||
st := s.StoredTypes[int(num)]
|
||||
sp := s.StoredPos[int(num)]
|
||||
for field, values := range docFields {
|
||||
for i, value := range values {
|
||||
keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i])
|
||||
if !keepGoing {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Segment) getField(name string) (int, error) {
|
||||
fieldID, ok := s.FieldsMap[name]
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("no field named %s", name)
|
||||
}
|
||||
return int(fieldID - 1), nil
|
||||
}
|
||||
|
||||
// Dictionary returns the term dictionary for the specified field
|
||||
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
|
||||
fieldID, err := s.getField(field)
|
||||
if err != nil {
|
||||
// no such field, return empty dictionary
|
||||
return &segment.EmptyDictionary{}, nil
|
||||
}
|
||||
return &Dictionary{
|
||||
segment: s,
|
||||
field: field,
|
||||
fieldID: uint16(fieldID),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Count returns the number of documents in this segment
|
||||
// (this has no notion of deleted docs)
|
||||
func (s *Segment) Count() uint64 {
|
||||
return uint64(len(s.Stored))
|
||||
}
|
||||
|
||||
// DocNumbers returns a bitset corresponding to the doc numbers of all the
|
||||
// provided _id strings
|
||||
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
|
||||
rv := roaring.New()
|
||||
|
||||
// guard against empty segment
|
||||
if len(s.FieldsMap) > 0 {
|
||||
idDictionary := s.Dicts[idFieldID]
|
||||
|
||||
for _, id := range ids {
|
||||
postingID := idDictionary[id]
|
||||
if postingID > 0 {
|
||||
rv.Or(s.Postings[postingID-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// Close releases all resources associated with this segment
|
||||
func (s *Segment) Close() error {
|
||||
return nil
|
||||
}
|
@ -0,0 +1,75 @@
|
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package segment
|
||||
|
||||
import (
|
||||
"regexp/syntax"
|
||||
|
||||
"github.com/couchbase/vellum/regexp"
|
||||
)
|
||||
|
||||
func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) {
|
||||
// TODO: potential optimization where syntax.Regexp supports a Simplify() API?
|
||||
|
||||
parsed, err := syntax.Parse(pattern, syntax.Perl)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
prefix := LiteralPrefix(parsed)
|
||||
if prefix != "" {
|
||||
prefixBeg := []byte(prefix)
|
||||
prefixEnd := IncrementBytes(prefixBeg)
|
||||
return re, prefixBeg, prefixEnd, nil
|
||||
}
|
||||
|
||||
return re, nil, nil, nil
|
||||
}
|
||||
|
||||
// Returns the literal prefix given the parse tree for a regexp
|
||||
func LiteralPrefix(s *syntax.Regexp) string {
|
||||
// traverse the left-most branch in the parse tree as long as the
|
||||
// node represents a concatenation
|
||||
for s != nil && s.Op == syntax.OpConcat {
|
||||
if len(s.Sub) < 1 {
|
||||
return ""
|
||||
}
|
||||
|
||||
s = s.Sub[0]
|
||||
}
|
||||
|
||||
if s.Op == syntax.OpLiteral {
|
||||
return string(s.Rune)
|
||||
}
|
||||
|
||||
return "" // no literal prefix
|
||||
}
|
||||
|
||||
func IncrementBytes(in []byte) []byte {
|
||||
rv := make([]byte, len(in))
|
||||
copy(rv, in)
|
||||
for i := len(rv) - 1; i >= 0; i-- {
|
||||
rv[i] = rv[i] + 1
|
||||
if rv[i] != 0 {
|
||||
return rv // didn't overflow, so stop
|
||||
}
|
||||
}
|
||||
return nil // overflowed
|
||||
}
|
@ -0,0 +1,826 @@
|
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"math"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
var NewSegmentBufferNumResultsBump int = 100
|
||||
var NewSegmentBufferNumResultsFactor float64 = 1.0
|
||||
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0
|
||||
|
||||
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
|
||||
// SegmentBase from analysis results
|
||||
func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
|
||||
chunkFactor uint32) (*SegmentBase, uint64, error) {
|
||||
s := interimPool.Get().(*interim)
|
||||
|
||||
var br bytes.Buffer
|
||||
if s.lastNumDocs > 0 {
|
||||
// use previous results to initialize the buf with an estimate
|
||||
// size, but note that the interim instance comes from a
|
||||
// global interimPool, so multiple scorch instances indexing
|
||||
// different docs can lead to low quality estimates
|
||||
estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
|
||||
NewSegmentBufferNumResultsFactor)
|
||||
estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
|
||||
NewSegmentBufferAvgBytesPerDocFactor)
|
||||
br.Grow(estimateAvgBytesPerDoc * estimateNumResults)
|
||||
}
|
||||
|
||||
s.results = results
|
||||
s.chunkFactor = chunkFactor
|
||||
s.w = NewCountHashWriter(&br)
|
||||
|
||||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
|
||||
err := s.convert()
|
||||
if err != nil {
|
||||
return nil, uint64(0), err
|
||||
}
|
||||
|
||||
sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor,
|
||||
s.FieldsMap, s.FieldsInv, uint64(len(results)),
|
||||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)
|
||||
|
||||
if err == nil && s.reset() == nil {
|
||||
s.lastNumDocs = len(results)
|
||||
s.lastOutSize = len(br.Bytes())
|
||||
interimPool.Put(s)
|
||||
}
|
||||
|
||||
return sb, uint64(len(br.Bytes())), err
|
||||
}
|
||||
|
||||
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}
|
||||
|
||||
// interim holds temporary working data used while converting from
|
||||
// analysis results to a zap-encoded segment
|
||||
type interim struct {
|
||||
results []*index.AnalysisResult
|
||||
|
||||
chunkFactor uint32
|
||||
|
||||
w *CountHashWriter
|
||||
|
||||
// FieldsMap adds 1 to field id to avoid zero value issues
|
||||
// name -> field id + 1
|
||||
FieldsMap map[string]uint16
|
||||
|
||||
// FieldsInv is the inverse of FieldsMap
|
||||
// field id -> name
|
||||
FieldsInv []string
|
||||
|
||||
// Term dictionaries for each field
|
||||
// field id -> term -> postings list id + 1
|
||||
Dicts []map[string]uint64
|
||||
|
||||
// Terms for each field, where terms are sorted ascending
|
||||
// field id -> []term
|
||||
DictKeys [][]string
|
||||
|
||||
// Fields whose IncludeDocValues is true
|
||||
// field id -> bool
|
||||
IncludeDocValues []bool
|
||||
|
||||
// postings id -> bitmap of docNums
|
||||
Postings []*roaring.Bitmap
|
||||
|
||||
// postings id -> freq/norm's, one for each docNum in postings
|
||||
FreqNorms [][]interimFreqNorm
|
||||
freqNormsBacking []interimFreqNorm
|
||||
|
||||
// postings id -> locs, one for each freq
|
||||
Locs [][]interimLoc
|
||||
locsBacking []interimLoc
|
||||
|
||||
numTermsPerPostingsList []int // key is postings list id
|
||||
numLocsPerPostingsList []int // key is postings list id
|
||||
|
||||
builder *vellum.Builder
|
||||
builderBuf bytes.Buffer
|
||||
|
||||
metaBuf bytes.Buffer
|
||||
|
||||
tmp0 []byte
|
||||
tmp1 []byte
|
||||
|
||||
lastNumDocs int
|
||||
lastOutSize int
|
||||
}
|
||||
|
||||
func (s *interim) reset() (err error) {
|
||||
s.results = nil
|
||||
s.chunkFactor = 0
|
||||
s.w = nil
|
||||
s.FieldsMap = nil
|
||||
s.FieldsInv = nil
|
||||
for i := range s.Dicts {
|
||||
s.Dicts[i] = nil
|
||||
}
|
||||
s.Dicts = s.Dicts[:0]
|
||||
for i := range s.DictKeys {
|
||||
s.DictKeys[i] = s.DictKeys[i][:0]
|
||||
}
|
||||
s.DictKeys = s.DictKeys[:0]
|
||||
for i := range s.IncludeDocValues {
|
||||
s.IncludeDocValues[i] = false
|
||||
}
|
||||
s.IncludeDocValues = s.IncludeDocValues[:0]
|
||||
for _, idn := range s.Postings {
|
||||
idn.Clear()
|
||||
}
|
||||
s.Postings = s.Postings[:0]
|
||||
s.FreqNorms = s.FreqNorms[:0]
|
||||
for i := range s.freqNormsBacking {
|
||||
s.freqNormsBacking[i] = interimFreqNorm{}
|
||||
}
|
||||
s.freqNormsBacking = s.freqNormsBacking[:0]
|
||||
s.Locs = s.Locs[:0]
|
||||
for i := range s.locsBacking {
|
||||
s.locsBacking[i] = interimLoc{}
|
||||
}
|
||||
s.locsBacking = s.locsBacking[:0]
|
||||
s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
|
||||
s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
|
||||
s.builderBuf.Reset()
|
||||
if s.builder != nil {
|
||||
err = s.builder.Reset(&s.builderBuf)
|
||||
}
|
||||
s.metaBuf.Reset()
|
||||
s.tmp0 = s.tmp0[:0]
|
||||
s.tmp1 = s.tmp1[:0]
|
||||
s.lastNumDocs = 0
|
||||
s.lastOutSize = 0
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *interim) grabBuf(size int) []byte {
|
||||
buf := s.tmp0
|
||||
if cap(buf) < size {
|
||||
buf = make([]byte, size)
|
||||
s.tmp0 = buf
|
||||
}
|
||||
return buf[0:size]
|
||||
}
|
||||
|
||||
type interimStoredField struct {
|
||||
vals [][]byte
|
||||
typs []byte
|
||||
arrayposs [][]uint64 // array positions
|
||||
}
|
||||
|
||||
type interimFreqNorm struct {
|
||||
freq uint64
|
||||
norm float32
|
||||
numLocs int
|
||||
}
|
||||
|
||||
type interimLoc struct {
|
||||
fieldID uint16
|
||||
pos uint64
|
||||
start uint64
|
||||
end uint64
|
||||
arrayposs []uint64
|
||||
}
|
||||
|
||||
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
|
||||
s.FieldsMap = map[string]uint16{}
|
||||
|
||||
s.getOrDefineField("_id") // _id field is fieldID 0
|
||||
|
||||
for _, result := range s.results {
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
for _, field := range result.Document.Fields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
}
|
||||
|
||||
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||
|
||||
for fieldID, fieldName := range s.FieldsInv {
|
||||
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
||||
}
|
||||
|
||||
if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
|
||||
s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
|
||||
} else {
|
||||
s.IncludeDocValues = make([]bool, len(s.FieldsInv))
|
||||
}
|
||||
|
||||
s.prepareDicts()
|
||||
|
||||
for _, dict := range s.DictKeys {
|
||||
sort.Strings(dict)
|
||||
}
|
||||
|
||||
s.processDocuments()
|
||||
|
||||
storedIndexOffset, err := s.writeStoredFields()
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
var fdvIndexOffset uint64
|
||||
var dictOffsets []uint64
|
||||
|
||||
if len(s.results) > 0 {
|
||||
fdvIndexOffset, dictOffsets, err = s.writeDicts()
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
} else {
|
||||
dictOffsets = make([]uint64, len(s.FieldsInv))
|
||||
}
|
||||
|
||||
fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
|
||||
}
|
||||
|
||||
func (s *interim) getOrDefineField(fieldName string) int {
|
||||
fieldIDPlus1, exists := s.FieldsMap[fieldName]
|
||||
if !exists {
|
||||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
|
||||
s.FieldsMap[fieldName] = fieldIDPlus1
|
||||
s.FieldsInv = append(s.FieldsInv, fieldName)
|
||||
|
||||
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
||||
|
||||
n := len(s.DictKeys)
|
||||
if n < cap(s.DictKeys) {
|
||||
s.DictKeys = s.DictKeys[:n+1]
|
||||
s.DictKeys[n] = s.DictKeys[n][:0]
|
||||
} else {
|
||||
s.DictKeys = append(s.DictKeys, []string(nil))
|
||||
}
|
||||
}
|
||||
|
||||
return int(fieldIDPlus1 - 1)
|
||||
}
|
||||
|
||||
// fill Dicts and DictKeys from analysis results
|
||||
func (s *interim) prepareDicts() {
|
||||
var pidNext int
|
||||
|
||||
var totTFs int
|
||||
var totLocs int
|
||||
|
||||
visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
|
||||
dict := s.Dicts[fieldID]
|
||||
dictKeys := s.DictKeys[fieldID]
|
||||
|
||||
for term, tf := range tfs {
|
||||
pidPlus1, exists := dict[term]
|
||||
if !exists {
|
||||
pidNext++
|
||||
pidPlus1 = uint64(pidNext)
|
||||
|
||||
dict[term] = pidPlus1
|
||||
dictKeys = append(dictKeys, term)
|
||||
|
||||
s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
|
||||
s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
|
||||
}
|
||||
|
||||
pid := pidPlus1 - 1
|
||||
|
||||
s.numTermsPerPostingsList[pid] += 1
|
||||
s.numLocsPerPostingsList[pid] += len(tf.Locations)
|
||||
|
||||
totLocs += len(tf.Locations)
|
||||
}
|
||||
|
||||
totTFs += len(tfs)
|
||||
|
||||
s.DictKeys[fieldID] = dictKeys
|
||||
}
|
||||
|
||||
for _, result := range s.results {
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
_, tf := field.Analyze()
|
||||
visitField(fieldID, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
tf := result.Analyzed[i]
|
||||
visitField(fieldID, tf)
|
||||
}
|
||||
}
|
||||
|
||||
numPostingsLists := pidNext
|
||||
|
||||
if cap(s.Postings) >= numPostingsLists {
|
||||
s.Postings = s.Postings[:numPostingsLists]
|
||||
} else {
|
||||
postings := make([]*roaring.Bitmap, numPostingsLists)
|
||||
copy(postings, s.Postings[:cap(s.Postings)])
|
||||
for i := 0; i < numPostingsLists; i++ {
|
||||
if postings[i] == nil {
|
||||
postings[i] = roaring.New()
|
||||
}
|
||||
}
|
||||
s.Postings = postings
|
||||
}
|
||||
|
||||
if cap(s.FreqNorms) >= numPostingsLists {
|
||||
s.FreqNorms = s.FreqNorms[:numPostingsLists]
|
||||
} else {
|
||||
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
|
||||
}
|
||||
|
||||
if cap(s.freqNormsBacking) >= totTFs {
|
||||
s.freqNormsBacking = s.freqNormsBacking[:totTFs]
|
||||
} else {
|
||||
s.freqNormsBacking = make([]interimFreqNorm, totTFs)
|
||||
}
|
||||
|
||||
freqNormsBacking := s.freqNormsBacking
|
||||
for pid, numTerms := range s.numTermsPerPostingsList {
|
||||
s.FreqNorms[pid] = freqNormsBacking[0:0]
|
||||
freqNormsBacking = freqNormsBacking[numTerms:]
|
||||
}
|
||||
|
||||
if cap(s.Locs) >= numPostingsLists {
|
||||
s.Locs = s.Locs[:numPostingsLists]
|
||||
} else {
|
||||
s.Locs = make([][]interimLoc, numPostingsLists)
|
||||
}
|
||||
|
||||
if cap(s.locsBacking) >= totLocs {
|
||||
s.locsBacking = s.locsBacking[:totLocs]
|
||||
} else {
|
||||
s.locsBacking = make([]interimLoc, totLocs)
|
||||
}
|
||||
|
||||
locsBacking := s.locsBacking
|
||||
for pid, numLocs := range s.numLocsPerPostingsList {
|
||||
s.Locs[pid] = locsBacking[0:0]
|
||||
locsBacking = locsBacking[numLocs:]
|
||||
}
|
||||
}
|
||||
|
||||
func (s *interim) processDocuments() {
|
||||
numFields := len(s.FieldsInv)
|
||||
reuseFieldLens := make([]int, numFields)
|
||||
reuseFieldTFs := make([]analysis.TokenFrequencies, numFields)
|
||||
|
||||
for docNum, result := range s.results {
|
||||
for i := 0; i < numFields; i++ { // clear these for reuse
|
||||
reuseFieldLens[i] = 0
|
||||
reuseFieldTFs[i] = nil
|
||||
}
|
||||
|
||||
s.processDocument(uint64(docNum), result,
|
||||
reuseFieldLens, reuseFieldTFs)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *interim) processDocument(docNum uint64,
|
||||
result *index.AnalysisResult,
|
||||
fieldLens []int, fieldTFs []analysis.TokenFrequencies) {
|
||||
visitField := func(fieldID uint16, fieldName string,
|
||||
ln int, tf analysis.TokenFrequencies) {
|
||||
fieldLens[fieldID] += ln
|
||||
|
||||
existingFreqs := fieldTFs[fieldID]
|
||||
if existingFreqs != nil {
|
||||
existingFreqs.MergeAll(fieldName, tf)
|
||||
} else {
|
||||
fieldTFs[fieldID] = tf
|
||||
}
|
||||
}
|
||||
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
ln, tf := field.Analyze()
|
||||
visitField(fieldID, field.Name(), ln, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
ln := result.Length[i]
|
||||
tf := result.Analyzed[i]
|
||||
visitField(fieldID, field.Name(), ln, tf)
|
||||
}
|
||||
|
||||
// now that it's been rolled up into fieldTFs, walk that
|
||||
for fieldID, tfs := range fieldTFs {
|
||||
dict := s.Dicts[fieldID]
|
||||
norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))
|
||||
|
||||
for term, tf := range tfs {
|
||||
pid := dict[term] - 1
|
||||
bs := s.Postings[pid]
|
||||
bs.Add(uint32(docNum))
|
||||
|
||||
s.FreqNorms[pid] = append(s.FreqNorms[pid],
|
||||
interimFreqNorm{
|
||||
freq: uint64(tf.Frequency()),
|
||||
norm: norm,
|
||||
numLocs: len(tf.Locations),
|
||||
})
|
||||
|
||||
if len(tf.Locations) > 0 {
|
||||
locs := s.Locs[pid]
|
||||
|
||||
for _, loc := range tf.Locations {
|
||||
var locf = uint16(fieldID)
|
||||
if loc.Field != "" {
|
||||
locf = uint16(s.getOrDefineField(loc.Field))
|
||||
}
|
||||
var arrayposs []uint64
|
||||
if len(loc.ArrayPositions) > 0 {
|
||||
arrayposs = loc.ArrayPositions
|
||||
}
|
||||
locs = append(locs, interimLoc{
|
||||
fieldID: locf,
|
||||
pos: uint64(loc.Position),
|
||||
start: uint64(loc.Start),
|
||||
end: uint64(loc.End),
|
||||
arrayposs: arrayposs,
|
||||
})
|
||||
}
|
||||
|
||||
s.Locs[pid] = locs
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *interim) writeStoredFields() (
|
||||
storedIndexOffset uint64, err error) {
|
||||
varBuf := make([]byte, binary.MaxVarintLen64)
|
||||
metaEncode := func(val uint64) (int, error) {
|
||||
wb := binary.PutUvarint(varBuf, val)
|
||||
return s.metaBuf.Write(varBuf[:wb])
|
||||
}
|
||||
|
||||
data, compressed := s.tmp0[:0], s.tmp1[:0]
|
||||
defer func() { s.tmp0, s.tmp1 = data, compressed }()
|
||||
|
||||
// keyed by docNum
|
||||
docStoredOffsets := make([]uint64, len(s.results))
|
||||
|
||||
// keyed by fieldID, for the current doc in the loop
|
||||
docStoredFields := map[uint16]interimStoredField{}
|
||||
|
||||
for docNum, result := range s.results {
|
||||
for fieldID := range docStoredFields { // reset for next doc
|
||||
delete(docStoredFields, fieldID)
|
||||
}
|
||||
|
||||
for _, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
|
||||
opts := field.Options()
|
||||
|
||||
if opts.IsStored() {
|
||||
isf := docStoredFields[fieldID]
|
||||
isf.vals = append(isf.vals, field.Value())
|
||||
isf.typs = append(isf.typs, encodeFieldType(field))
|
||||
isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
|
||||
docStoredFields[fieldID] = isf
|
||||
}
|
||||
|
||||
if opts.IncludeDocValues() {
|
||||
s.IncludeDocValues[fieldID] = true
|
||||
}
|
||||
}
|
||||
|
||||
var curr int
|
||||
|
||||
s.metaBuf.Reset()
|
||||
data = data[:0]
|
||||
|
||||
// _id field special case optimizes ExternalID() lookups
|
||||
idFieldVal := docStoredFields[uint16(0)].vals[0]
|
||||
_, err = metaEncode(uint64(len(idFieldVal)))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// handle non-"_id" fields
|
||||
for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
|
||||
isf, exists := docStoredFields[uint16(fieldID)]
|
||||
if exists {
|
||||
curr, data, err = persistStoredFieldValues(
|
||||
fieldID, isf.vals, isf.typs, isf.arrayposs,
|
||||
curr, metaEncode, data)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
metaBytes := s.metaBuf.Bytes()
|
||||
|
||||
compressed = snappy.Encode(compressed[:cap(compressed)], data)
|
||||
|
||||
docStoredOffsets[docNum] = uint64(s.w.Count())
|
||||
|
||||
_, err := writeUvarints(s.w,
|
||||
uint64(len(metaBytes)),
|
||||
uint64(len(idFieldVal)+len(compressed)))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = s.w.Write(metaBytes)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = s.w.Write(idFieldVal)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = s.w.Write(compressed)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
storedIndexOffset = uint64(s.w.Count())
|
||||
|
||||
for _, docStoredOffset := range docStoredOffsets {
|
||||
err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return storedIndexOffset, nil
|
||||
}
|
||||
|
||||
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
|
||||
dictOffsets = make([]uint64, len(s.FieldsInv))
|
||||
|
||||
fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
|
||||
fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))
|
||||
|
||||
buf := s.grabBuf(binary.MaxVarintLen64)
|
||||
|
||||
tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
|
||||
locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
|
||||
fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false)
|
||||
|
||||
var docTermMap [][]byte
|
||||
|
||||
if s.builder == nil {
|
||||
s.builder, err = vellum.New(&s.builderBuf, nil)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
for fieldID, terms := range s.DictKeys {
|
||||
if cap(docTermMap) < len(s.results) {
|
||||
docTermMap = make([][]byte, len(s.results))
|
||||
} else {
|
||||
docTermMap = docTermMap[0:len(s.results)]
|
||||
for docNum := range docTermMap { // reset the docTermMap
|
||||
docTermMap[docNum] = docTermMap[docNum][:0]
|
||||
}
|
||||
}
|
||||
|
||||
dict := s.Dicts[fieldID]
|
||||
|
||||
for _, term := range terms { // terms are already sorted
|
||||
pid := dict[term] - 1
|
||||
|
||||
postingsBS := s.Postings[pid]
|
||||
|
||||
freqNorms := s.FreqNorms[pid]
|
||||
freqNormOffset := 0
|
||||
|
||||
locs := s.Locs[pid]
|
||||
locOffset := 0
|
||||
|
||||
postingsItr := postingsBS.Iterator()
|
||||
for postingsItr.HasNext() {
|
||||
docNum := uint64(postingsItr.Next())
|
||||
|
||||
freqNorm := freqNorms[freqNormOffset]
|
||||
|
||||
err = tfEncoder.Add(docNum,
|
||||
encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
|
||||
uint64(math.Float32bits(freqNorm.norm)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
if freqNorm.numLocs > 0 {
|
||||
numBytesLocs := 0
|
||||
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
|
||||
numBytesLocs += totalUvarintBytes(
|
||||
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
|
||||
uint64(len(loc.arrayposs)), loc.arrayposs)
|
||||
}
|
||||
|
||||
err = locEncoder.Add(docNum, uint64(numBytesLocs))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
|
||||
err = locEncoder.Add(docNum,
|
||||
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
|
||||
uint64(len(loc.arrayposs)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
err = locEncoder.Add(docNum, loc.arrayposs...)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
locOffset += freqNorm.numLocs
|
||||
}
|
||||
|
||||
freqNormOffset++
|
||||
|
||||
docTermMap[docNum] = append(
|
||||
append(docTermMap[docNum], term...),
|
||||
termSeparator)
|
||||
}
|
||||
|
||||
tfEncoder.Close()
|
||||
locEncoder.Close()
|
||||
|
||||
postingsOffset, err :=
|
||||
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
if postingsOffset > uint64(0) {
|
||||
err = s.builder.Insert([]byte(term), postingsOffset)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
tfEncoder.Reset()
|
||||
locEncoder.Reset()
|
||||
}
|
||||
|
||||
err = s.builder.Close()
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// record where this dictionary starts
|
||||
dictOffsets[fieldID] = uint64(s.w.Count())
|
||||
|
||||
vellumData := s.builderBuf.Bytes()
|
||||
|
||||
// write out the length of the vellum data
|
||||
n := binary.PutUvarint(buf, uint64(len(vellumData)))
|
||||
_, err = s.w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// write this vellum to disk
|
||||
_, err = s.w.Write(vellumData)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// reset vellum for reuse
|
||||
s.builderBuf.Reset()
|
||||
|
||||
err = s.builder.Reset(&s.builderBuf)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// write the field doc values
|
||||
if s.IncludeDocValues[fieldID] {
|
||||
for docNum, docTerms := range docTermMap {
|
||||
if len(docTerms) > 0 {
|
||||
err = fdvEncoder.Add(uint64(docNum), docTerms)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
err = fdvEncoder.Close()
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
fdvOffsetsStart[fieldID] = uint64(s.w.Count())
|
||||
|
||||
_, err = fdvEncoder.Write()
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
fdvOffsetsEnd[fieldID] = uint64(s.w.Count())
|
||||
|
||||
fdvEncoder.Reset()
|
||||
} else {
|
||||
fdvOffsetsStart[fieldID] = fieldNotUninverted
|
||||
fdvOffsetsEnd[fieldID] = fieldNotUninverted
|
||||
}
|
||||
}
|
||||
|
||||
fdvIndexOffset = uint64(s.w.Count())
|
||||
|
||||
for i := 0; i < len(fdvOffsetsStart); i++ {
|
||||
n := binary.PutUvarint(buf, fdvOffsetsStart[i])
|
||||
_, err := s.w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
|
||||
_, err = s.w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return fdvIndexOffset, dictOffsets, nil
|
||||
}
|
||||
|
||||
func encodeFieldType(f document.Field) byte {
|
||||
fieldType := byte('x')
|
||||
switch f.(type) {
|
||||
case *document.TextField:
|
||||
fieldType = 't'
|
||||
case *document.NumericField:
|
||||
fieldType = 'n'
|
||||
case *document.DateTimeField:
|
||||
fieldType = 'd'
|
||||
case *document.BooleanField:
|
||||
fieldType = 'b'
|
||||
case *document.GeoPointField:
|
||||
fieldType = 'g'
|
||||
case *document.CompositeField:
|
||||
fieldType = 'c'
|
||||
}
|
||||
return fieldType
|
||||
}
|
||||
|
||||
// returns the total # of bytes needed to encode the given uint64's
|
||||
// into binary.PutUVarint() encoding
|
||||
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
|
||||
n = numUvarintBytes(a)
|
||||
n += numUvarintBytes(b)
|
||||
n += numUvarintBytes(c)
|
||||
n += numUvarintBytes(d)
|
||||
n += numUvarintBytes(e)
|
||||
for _, v := range more {
|
||||
n += numUvarintBytes(v)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// returns # of bytes needed to encode x in binary.PutUvarint() encoding
|
||||
func numUvarintBytes(x uint64) (n int) {
|
||||
for x >= 0x80 {
|
||||
x >>= 7
|
||||
n++
|
||||
}
|
||||
return n + 1
|
||||
}
|
File diff suppressed because it is too large
Load Diff
343
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go
generated
vendored
343
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go
generated
vendored
@ -0,0 +1,343 @@
|
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package searcher
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"container/heap"
|
||||
"math"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/search/scorer"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDisjunctionHeapSearcher int
|
||||
var reflectStaticSizeSearcherCurr int
|
||||
|
||||
func init() {
|
||||
var dhs DisjunctionHeapSearcher
|
||||
reflectStaticSizeDisjunctionHeapSearcher = int(reflect.TypeOf(dhs).Size())
|
||||
|
||||
var sc SearcherCurr
|
||||
reflectStaticSizeSearcherCurr = int(reflect.TypeOf(sc).Size())
|
||||
}
|
||||
|
||||
type SearcherCurr struct {
|
||||
searcher search.Searcher
|
||||
curr *search.DocumentMatch
|
||||
}
|
||||
|
||||
type DisjunctionHeapSearcher struct {
|
||||
indexReader index.IndexReader
|
||||
|
||||
numSearchers int
|
||||
scorer *scorer.DisjunctionQueryScorer
|
||||
min int
|
||||
queryNorm float64
|
||||
initialized bool
|
||||
searchers []search.Searcher
|
||||
heap []*SearcherCurr
|
||||
|
||||
matching []*search.DocumentMatch
|
||||
matchingCurrs []*SearcherCurr
|
||||
}
|
||||
|
||||
func newDisjunctionHeapSearcher(indexReader index.IndexReader,
|
||||
searchers []search.Searcher, min float64, options search.SearcherOptions,
|
||||
limit bool) (
|
||||
*DisjunctionHeapSearcher, error) {
|
||||
if limit && tooManyClauses(len(searchers)) {
|
||||
return nil, tooManyClausesErr(len(searchers))
|
||||
}
|
||||
|
||||
// build our searcher
|
||||
rv := DisjunctionHeapSearcher{
|
||||
indexReader: indexReader,
|
||||
searchers: searchers,
|
||||
numSearchers: len(searchers),
|
||||
scorer: scorer.NewDisjunctionQueryScorer(options),
|
||||
min: int(min),
|
||||
matching: make([]*search.DocumentMatch, len(searchers)),
|
||||
matchingCurrs: make([]*SearcherCurr, len(searchers)),
|
||||
heap: make([]*SearcherCurr, 0, len(searchers)),
|
||||
}
|
||||
rv.computeQueryNorm()
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Size() int {
|
||||
sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr +
|
||||
s.scorer.Size()
|
||||
|
||||
for _, entry := range s.searchers {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
for _, entry := range s.matching {
|
||||
if entry != nil {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
|
||||
// for matchingCurrs and heap, just use static size * len
|
||||
// since searchers and document matches already counted above
|
||||
sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr
|
||||
sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) computeQueryNorm() {
|
||||
// first calculate sum of squared weights
|
||||
sumOfSquaredWeights := 0.0
|
||||
for _, searcher := range s.searchers {
|
||||
sumOfSquaredWeights += searcher.Weight()
|
||||
}
|
||||
// now compute query norm from this
|
||||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
|
||||
// finally tell all the downstream searchers the norm
|
||||
for _, searcher := range s.searchers {
|
||||
searcher.SetQueryNorm(s.queryNorm)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error {
|
||||
// alloc a single block of SearcherCurrs
|
||||
block := make([]SearcherCurr, len(s.searchers))
|
||||
|
||||
// get all searchers pointing at their first match
|
||||
for i, searcher := range s.searchers {
|
||||
curr, err := searcher.Next(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if curr != nil {
|
||||
block[i].searcher = searcher
|
||||
block[i].curr = curr
|
||||
heap.Push(s, &block[i])
|
||||
}
|
||||
}
|
||||
|
||||
err := s.updateMatches()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.initialized = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) updateMatches() error {
|
||||
matching := s.matching[:0]
|
||||
matchingCurrs := s.matchingCurrs[:0]
|
||||
|
||||
if len(s.heap) > 0 {
|
||||
|
||||
// top of the heap is our next hit
|
||||
next := heap.Pop(s).(*SearcherCurr)
|
||||
matching = append(matching, next.curr)
|
||||
matchingCurrs = append(matchingCurrs, next)
|
||||
|
||||
// now as long as top of heap matches, keep popping
|
||||
for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 {
|
||||
next = heap.Pop(s).(*SearcherCurr)
|
||||
matching = append(matching, next.curr)
|
||||
matchingCurrs = append(matchingCurrs, next)
|
||||
}
|
||||
}
|
||||
|
||||
s.matching = matching
|
||||
s.matchingCurrs = matchingCurrs
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Weight() float64 {
|
||||
var rv float64
|
||||
for _, searcher := range s.searchers {
|
||||
rv += searcher.Weight()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) {
|
||||
for _, searcher := range s.searchers {
|
||||
searcher.SetQueryNorm(qnorm)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) (
|
||||
*search.DocumentMatch, error) {
|
||||
if !s.initialized {
|
||||
err := s.initSearchers(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
var rv *search.DocumentMatch
|
||||
found := false
|
||||
for !found && len(s.matching) > 0 {
|
||||
if len(s.matching) >= s.min {
|
||||
found = true
|
||||
// score this match
|
||||
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
|
||||
}
|
||||
|
||||
// invoke next on all the matching searchers
|
||||
for _, matchingCurr := range s.matchingCurrs {
|
||||
if matchingCurr.curr != rv {
|
||||
ctx.DocumentMatchPool.Put(matchingCurr.curr)
|
||||
}
|
||||
curr, err := matchingCurr.searcher.Next(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if curr != nil {
|
||||
matchingCurr.curr = curr
|
||||
heap.Push(s, matchingCurr)
|
||||
}
|
||||
}
|
||||
|
||||
err := s.updateMatches()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext,
|
||||
ID index.IndexInternalID) (*search.DocumentMatch, error) {
|
||||
if !s.initialized {
|
||||
err := s.initSearchers(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// if there is anything in matching, toss it back onto the heap
|
||||
for _, matchingCurr := range s.matchingCurrs {
|
||||
heap.Push(s, matchingCurr)
|
||||
}
|
||||
s.matching = s.matching[:0]
|
||||
s.matchingCurrs = s.matchingCurrs[:0]
|
||||
|
||||
// find all searchers that actually need to be advanced
|
||||
// advance them, using s.matchingCurrs as temp storage
|
||||
for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 {
|
||||
searcherCurr := heap.Pop(s).(*SearcherCurr)
|
||||
ctx.DocumentMatchPool.Put(searcherCurr.curr)
|
||||
curr, err := searcherCurr.searcher.Advance(ctx, ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if curr != nil {
|
||||
searcherCurr.curr = curr
|
||||
s.matchingCurrs = append(s.matchingCurrs, searcherCurr)
|
||||
}
|
||||
}
|
||||
// now all of the searchers that we advanced have to be pushed back
|
||||
for _, matchingCurr := range s.matchingCurrs {
|
||||
heap.Push(s, matchingCurr)
|
||||
}
|
||||
// reset our temp space
|
||||
s.matchingCurrs = s.matchingCurrs[:0]
|
||||
|
||||
err := s.updateMatches()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return s.Next(ctx)
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Count() uint64 {
|
||||
// for now return a worst case
|
||||
var sum uint64
|
||||
for _, searcher := range s.searchers {
|
||||
sum += searcher.Count()
|
||||
}
|
||||
return sum
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Close() (rv error) {
|
||||
for _, searcher := range s.searchers {
|
||||
err := searcher.Close()
|
||||
if err != nil && rv == nil {
|
||||
rv = err
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Min() int {
|
||||
return s.min
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int {
|
||||
rv := len(s.searchers)
|
||||
for _, s := range s.searchers {
|
||||
rv += s.DocumentMatchPoolSize()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// a disjunction searcher implements the index.Optimizable interface
|
||||
// but only activates on an edge case where the disjunction is a
|
||||
// wrapper around a single Optimizable child searcher
|
||||
func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) (
|
||||
index.OptimizableContext, error) {
|
||||
if len(s.searchers) == 1 {
|
||||
o, ok := s.searchers[0].(index.Optimizable)
|
||||
if ok {
|
||||
return o.Optimize(kind, octx)
|
||||
}
|
||||
}
|
||||
|
||||
return octx, nil
|
||||
}
|
||||
|
||||
// heap impl
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Len() int { return len(s.heap) }
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Less(i, j int) bool {
|
||||
if s.heap[i].curr == nil {
|
||||
return true
|
||||
} else if s.heap[j].curr == nil {
|
||||
return false
|
||||
}
|
||||
return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Swap(i, j int) {
|
||||
s.heap[i], s.heap[j] = s.heap[j], s.heap[i]
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Push(x interface{}) {
|
||||
s.heap = append(s.heap, x.(*SearcherCurr))
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Pop() interface{} {
|
||||
old := s.heap
|
||||
n := len(old)
|
||||
x := old[n-1]
|
||||
s.heap = old[0 : n-1]
|
||||
return x
|
||||
}
|
298
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go
generated
vendored
298
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go
generated
vendored
@ -0,0 +1,298 @@
|
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package searcher
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"sort"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/search/scorer"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDisjunctionSliceSearcher int
|
||||
|
||||
func init() {
|
||||
var ds DisjunctionSliceSearcher
|
||||
reflectStaticSizeDisjunctionSliceSearcher = int(reflect.TypeOf(ds).Size())
|
||||
}
|
||||
|
||||
type DisjunctionSliceSearcher struct {
|
||||
indexReader index.IndexReader
|
||||
searchers OrderedSearcherList
|
||||
numSearchers int
|
||||
queryNorm float64
|
||||
currs []*search.DocumentMatch
|
||||
scorer *scorer.DisjunctionQueryScorer
|
||||
min int
|
||||
matching []*search.DocumentMatch
|
||||
matchingIdxs []int
|
||||
initialized bool
|
||||
}
|
||||
|
||||
func newDisjunctionSliceSearcher(indexReader index.IndexReader,
|
||||
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
|
||||
limit bool) (
|
||||
*DisjunctionSliceSearcher, error) {
|
||||
if limit && tooManyClauses(len(qsearchers)) {
|
||||
return nil, tooManyClausesErr(len(qsearchers))
|
||||
}
|
||||
// build the downstream searchers
|
||||
searchers := make(OrderedSearcherList, len(qsearchers))
|
||||
for i, searcher := range qsearchers {
|
||||
searchers[i] = searcher
|
||||
}
|
||||
// sort the searchers
|
||||
sort.Sort(sort.Reverse(searchers))
|
||||
// build our searcher
|
||||
rv := DisjunctionSliceSearcher{
|
||||
indexReader: indexReader,
|
||||
searchers: searchers,
|
||||
numSearchers: len(searchers),
|
||||
currs: make([]*search.DocumentMatch, len(searchers)),
|
||||
scorer: scorer.NewDisjunctionQueryScorer(options),
|
||||
min: int(min),
|
||||
matching: make([]*search.DocumentMatch, len(searchers)),
|
||||
matchingIdxs: make([]int, len(searchers)),
|
||||
}
|
||||
rv.computeQueryNorm()
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Size() int {
|
||||
sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr +
|
||||
s.scorer.Size()
|
||||
|
||||
for _, entry := range s.searchers {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
for _, entry := range s.currs {
|
||||
if entry != nil {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
|
||||
for _, entry := range s.matching {
|
||||
if entry != nil {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
|
||||
sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) computeQueryNorm() {
|
||||
// first calculate sum of squared weights
|
||||
sumOfSquaredWeights := 0.0
|
||||
for _, searcher := range s.searchers {
|
||||
sumOfSquaredWeights += searcher.Weight()
|
||||
}
|
||||
// now compute query norm from this
|
||||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
|
||||
// finally tell all the downstream searchers the norm
|
||||
for _, searcher := range s.searchers {
|
||||
searcher.SetQueryNorm(s.queryNorm)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error {
|
||||
var err error
|
||||
// get all searchers pointing at their first match
|
||||
for i, searcher := range s.searchers {
|
||||
if s.currs[i] != nil {
|
||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||
}
|
||||
s.currs[i], err = searcher.Next(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
err = s.updateMatches()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
s.initialized = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) updateMatches() error {
|
||||
matching := s.matching[:0]
|
||||
matchingIdxs := s.matchingIdxs[:0]
|
||||
|
||||
for i := 0; i < len(s.currs); i++ {
|
||||
curr := s.currs[i]
|
||||
if curr == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if len(matching) > 0 {
|
||||
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
|
||||
if cmp > 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
if cmp < 0 {
|
||||
matching = matching[:0]
|
||||
matchingIdxs = matchingIdxs[:0]
|
||||
}
|
||||
}
|
||||
|
||||
matching = append(matching, curr)
|
||||
matchingIdxs = append(matchingIdxs, i)
|
||||
}
|
||||
|
||||
s.matching = matching
|
||||
s.matchingIdxs = matchingIdxs
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Weight() float64 {
|
||||
var rv float64
|
||||
for _, searcher := range s.searchers {
|
||||
rv += searcher.Weight()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) {
|
||||
for _, searcher := range s.searchers {
|
||||
searcher.SetQueryNorm(qnorm)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) (
|
||||
*search.DocumentMatch, error) {
|
||||
if !s.initialized {
|
||||
err := s.initSearchers(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
var err error
|
||||
var rv *search.DocumentMatch
|
||||
|
||||
found := false
|
||||
for !found && len(s.matching) > 0 {
|
||||
if len(s.matching) >= s.min {
|
||||
found = true
|
||||
// score this match
|
||||
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
|
||||
}
|
||||
|
||||
// invoke next on all the matching searchers
|
||||
for _, i := range s.matchingIdxs {
|
||||
searcher := s.searchers[i]
|
||||
if s.currs[i] != rv {
|
||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||
}
|
||||
s.currs[i], err = searcher.Next(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
err = s.updateMatches()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext,
|
||||
ID index.IndexInternalID) (*search.DocumentMatch, error) {
|
||||
if !s.initialized {
|
||||
err := s.initSearchers(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
// get all searchers pointing at their first match
|
||||
var err error
|
||||
for i, searcher := range s.searchers {
|
||||
if s.currs[i] != nil {
|
||||
if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
|
||||
continue
|
||||
}
|
||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||
}
|
||||
s.currs[i], err = searcher.Advance(ctx, ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
err = s.updateMatches()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return s.Next(ctx)
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Count() uint64 {
|
||||
// for now return a worst case
|
||||
var sum uint64
|
||||
for _, searcher := range s.searchers {
|
||||
sum += searcher.Count()
|
||||
}
|
||||
return sum
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Close() (rv error) {
|
||||
for _, searcher := range s.searchers {
|
||||
err := searcher.Close()
|
||||
if err != nil && rv == nil {
|
||||
rv = err
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Min() int {
|
||||
return s.min
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int {
|
||||
rv := len(s.currs)
|
||||
for _, s := range s.searchers {
|
||||
rv += s.DocumentMatchPoolSize()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// a disjunction searcher implements the index.Optimizable interface
|
||||
// but only activates on an edge case where the disjunction is a
|
||||
// wrapper around a single Optimizable child searcher
|
||||
func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) (
|
||||
index.OptimizableContext, error) {
|
||||
if len(s.searchers) == 1 {
|
||||
o, ok := s.searchers[0].(index.Optimizable)
|
||||
if ok {
|
||||
return o.Optimize(kind, octx)
|
||||
}
|
||||
}
|
||||
|
||||
return octx, nil
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue