You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
gitea-fork-majority-judgment/vendor/github.com/blevesearch/bleve/v2/analysis/token/lowercase/lowercase.go

106 lines
2.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package lowercase implements a TokenFilter which converts
// tokens to lower case according to unicode rules.
package lowercase
import (
"bytes"
"unicode"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// Name is the name used to register LowerCaseFilter in the bleve registry
const Name = "to_lower"
type LowerCaseFilter struct {
}
func NewLowerCaseFilter() *LowerCaseFilter {
return &LowerCaseFilter{}
}
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
token.Term = toLowerDeferredCopy(token.Term)
}
return input
}
func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewLowerCaseFilter(), nil
}
func init() {
registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor)
}
// toLowerDeferredCopy will function exactly like
// bytes.ToLower() only it will reuse (overwrite)
// the original byte array when possible
// NOTE: because its possible that the lower-case
// form of a rune has a different utf-8 encoded
// length, in these cases a new byte array is allocated
func toLowerDeferredCopy(s []byte) []byte {
j := 0
for i := 0; i < len(s); {
wid := 1
r := rune(s[i])
if r >= utf8.RuneSelf {
r, wid = utf8.DecodeRune(s[i:])
}
l := unicode.ToLower(r)
// If the rune is already lowercased, just move to the
// next rune.
if l == r {
i += wid
j += wid
continue
}
// Handles the Unicode edge-case where the last
// rune in a word on the greek Σ needs to be converted
// differently.
if l == 'σ' && i+2 == len(s) {
l = 'ς'
}
lwid := utf8.RuneLen(l)
if lwid > wid {
// utf-8 encoded replacement is wider
// for now, punt and defer
// to bytes.ToLower() for the remainder
// only known to happen with chars
// Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
// Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
rest := bytes.ToLower(s[i:])
rv := make([]byte, j+len(rest))
copy(rv[:j], s[:j])
copy(rv[j:], rest)
return rv
} else {
utf8.EncodeRune(s[j:], l)
}
i += wid
j += lwid
}
return s[:j]
}