// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package character

import (
	"unicode/utf8"

	"github.com/blevesearch/bleve/analysis"
)

// IsTokenRune reports whether a rune belongs inside a token.
type IsTokenRune func(r rune) bool

// CharacterTokenizer splits input into tokens made up of consecutive runes
// for which the configured predicate returns true.
type CharacterTokenizer struct {
	isTokenRun IsTokenRune
}

func NewCharacterTokenizer(f IsTokenRune) *CharacterTokenizer {
	return &CharacterTokenizer{
		isTokenRun: f,
	}
}

// Tokenize walks the input rune by rune, accumulating runs of token runes
// and emitting a token each time a non-token rune (or the end of input) is reached.
func (c *CharacterTokenizer) Tokenize(input []byte) analysis.TokenStream {

	rv := make(analysis.TokenStream, 0, 1024)

	offset := 0
	start := 0
	end := 0
	count := 0
	for currRune, size := utf8.DecodeRune(input[offset:]); currRune != utf8.RuneError; currRune, size = utf8.DecodeRune(input[offset:]) {
		isToken := c.isTokenRun(currRune)
		if isToken {
			end = offset + size
		} else {
			if end-start > 0 {
				// build token
				rv = append(rv, &analysis.Token{
					Term:     input[start:end],
					Start:    start,
					End:      end,
					Position: count + 1,
					Type:     analysis.AlphaNumeric,
				})
				count++
			}
			start = offset + size
			end = start
		}
		offset += size
	}
	// if we ended in the middle of a token, finish it
	if end-start > 0 {
		// build token
		rv = append(rv, &analysis.Token{
			Term:     input[start:end],
			Start:    start,
			End:      end,
			Position: count + 1,
			Type:     analysis.AlphaNumeric,
		})
	}
	return rv
}
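
// A minimal usage sketch, kept as a comment so the library file itself is
// unchanged. It assumes the package is imported as "character" and uses
// unicode.IsLetter as the rune predicate; both choices are illustrative and
// not part of the original file. In a caller (for example a _test.go file):
//
//	tokenizer := character.NewCharacterTokenizer(unicode.IsLetter)
//	tokens := tokenizer.Tokenize([]byte("hello, 世界"))
//	// tokens[0].Term == []byte("hello"), Start 0, End 5, Position 1
//	// tokens[1].Term == []byte("世界"),  Start 7, End 13, Position 2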