From 4935e6e1a331d60c5e4ca93e59901fe96e557b53 Mon Sep 17 00:00:00 2001 From: Danko Aleksejevs Date: Thu, 3 Jul 2025 11:29:04 +0200 Subject: [PATCH] fix: skip empty tokens in SearchOptions.Tokens() (#8261) Query string tokenizer could return a list containing empty tokens when the query string was `\` or `"` (probably in other scenarios as well). This seems undesirable and is what triggered #8260, but I'm posting this separately from that fix in case I'm wrong. Feel free to reject if so. The actual change in behavior is that now searching for `\` or `"` behaves the same as if the query were empty (the bleve/elastic code checks that the tokenizer actually returned, anything rather than just query being non-empty). ### Tests - I added test coverage for Go changes... - [x] in their respective `*_test.go` for unit tests. ### Documentation - [ ] I created a pull request [to the documentation](https://codeberg.org/forgejo/docs) to explain to Forgejo users how to use this change. - [x] I did not document these changes and I do not expect someone else to do it. ### Release notes - [x] I do not want this change to show in the release notes. - [ ] I want the title to show in the release notes with a link to this pull request. - [ ] I want the content of the `release-notes/.md` to be be used for the release notes instead of the title. Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/8261 Reviewed-by: Shiny Nematoda Co-authored-by: Danko Aleksejevs Co-committed-by: Danko Aleksejevs --- modules/indexer/issues/bleve/bleve.go | 11 ++- .../issues/elasticsearch/elasticsearch.go | 11 ++- modules/indexer/issues/internal/qstring.go | 15 +-- .../indexer/issues/internal/qstring_test.go | 92 +++++++++++++++++++ .../indexer/issues/internal/tests/tests.go | 36 +++++++- 5 files changed, 146 insertions(+), 19 deletions(-) diff --git a/modules/indexer/issues/bleve/bleve.go b/modules/indexer/issues/bleve/bleve.go index 8549ba8dfc..cb98f722c5 100644 --- a/modules/indexer/issues/bleve/bleve.go +++ b/modules/indexer/issues/bleve/bleve.go @@ -156,11 +156,12 @@ func (b *Indexer) Delete(_ context.Context, ids ...int64) error { func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) { var queries []query.Query - if options.Keyword != "" { - tokens, err := options.Tokens() - if err != nil { - return nil, err - } + tokens, err := options.Tokens() + if err != nil { + return nil, err + } + + if len(tokens) > 0 { q := bleve.NewBooleanQuery() for _, token := range tokens { innerQ := bleve.NewDisjunctionQuery( diff --git a/modules/indexer/issues/elasticsearch/elasticsearch.go b/modules/indexer/issues/elasticsearch/elasticsearch.go index d632a22b2a..311e92730e 100644 --- a/modules/indexer/issues/elasticsearch/elasticsearch.go +++ b/modules/indexer/issues/elasticsearch/elasticsearch.go @@ -149,12 +149,13 @@ func (b *Indexer) Delete(ctx context.Context, ids ...int64) error { func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) { query := elastic.NewBoolQuery() - if options.Keyword != "" { + tokens, err := options.Tokens() + if err != nil { + return nil, err + } + + if len(tokens) > 0 { q := elastic.NewBoolQuery() - tokens, err := options.Tokens() - if err != nil { - return nil, err - } for _, token := range tokens { innerQ := elastic.NewMultiMatchQuery(token.Term, "content", "comments").FieldWithBoost("title", 2.0).TieBreaker(0.5) if token.Fuzzy { diff --git a/modules/indexer/issues/internal/qstring.go b/modules/indexer/issues/internal/qstring.go index 6b60b4c5f6..348f7a564b 100644 --- a/modules/indexer/issues/internal/qstring.go +++ b/modules/indexer/issues/internal/qstring.go @@ -45,12 +45,9 @@ func (t *Tokenizer) next() (tk Token, err error) { // skip all leading white space for { - if r, _, err = t.in.ReadRune(); err == nil && r == ' ' { - //nolint:staticcheck,wastedassign // SA4006 the variable is used after the loop - r, _, err = t.in.ReadRune() - continue + if r, _, err = t.in.ReadRune(); err != nil || r != ' ' { + break } - break } if err != nil { return tk, err @@ -107,11 +104,17 @@ nextEnd: // Tokenize the keyword func (o *SearchOptions) Tokens() (tokens []Token, err error) { + if o.Keyword == "" { + return nil, nil + } + in := strings.NewReader(o.Keyword) it := Tokenizer{in: in} for token, err := it.next(); err == nil; token, err = it.next() { - tokens = append(tokens, token) + if token.Term != "" { + tokens = append(tokens, token) + } } if err != nil && err != io.EOF { return nil, err diff --git a/modules/indexer/issues/internal/qstring_test.go b/modules/indexer/issues/internal/qstring_test.go index 835491707c..eb4bdb306f 100644 --- a/modules/indexer/issues/internal/qstring_test.go +++ b/modules/indexer/issues/internal/qstring_test.go @@ -41,6 +41,36 @@ var testOpts = []testIssueQueryStringOpt{ }, }, }, + { + Keyword: "Hello World", + Results: []Token{ + { + Term: "Hello", + Fuzzy: true, + Kind: BoolOptShould, + }, + { + Term: "World", + Fuzzy: true, + Kind: BoolOptShould, + }, + }, + }, + { + Keyword: " Hello World ", + Results: []Token{ + { + Term: "Hello", + Fuzzy: true, + Kind: BoolOptShould, + }, + { + Term: "World", + Fuzzy: true, + Kind: BoolOptShould, + }, + }, + }, { Keyword: "+Hello +World", Results: []Token{ @@ -156,6 +186,68 @@ var testOpts = []testIssueQueryStringOpt{ }, }, }, + { + Keyword: "\\", + Results: nil, + }, + { + Keyword: "\"", + Results: nil, + }, + { + Keyword: "Hello \\", + Results: []Token{ + { + Term: "Hello", + Fuzzy: true, + Kind: BoolOptShould, + }, + }, + }, + { + Keyword: "\"\"", + Results: nil, + }, + { + Keyword: "\" World \"", + Results: []Token{ + { + Term: " World ", + Fuzzy: false, + Kind: BoolOptShould, + }, + }, + }, + { + Keyword: "\"\" World \"\"", + Results: []Token{ + { + Term: "World", + Fuzzy: true, + Kind: BoolOptShould, + }, + }, + }, + { + Keyword: "Best \"Hello World\" Ever", + Results: []Token{ + { + Term: "Best", + Fuzzy: true, + Kind: BoolOptShould, + }, + { + Term: "Hello World", + Fuzzy: false, + Kind: BoolOptShould, + }, + { + Term: "Ever", + Fuzzy: true, + Kind: BoolOptShould, + }, + }, + }, } func TestIssueQueryString(t *testing.T) { diff --git a/modules/indexer/issues/internal/tests/tests.go b/modules/indexer/issues/internal/tests/tests.go index b63957ff84..46014994a0 100644 --- a/modules/indexer/issues/internal/tests/tests.go +++ b/modules/indexer/issues/internal/tests/tests.go @@ -87,14 +87,44 @@ func TestIndexer(t *testing.T, indexer internal.Indexer) { } } +func allResults(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) { + assert.Len(t, result.Hits, len(data)) + assert.Equal(t, len(data), int(result.Total)) +} + var cases = []*testIndexerCase{ { Name: "default", SearchOptions: &internal.SearchOptions{}, - Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) { - assert.Len(t, result.Hits, len(data)) - assert.Equal(t, len(data), int(result.Total)) + Expected: allResults, + }, + { + Name: "empty keyword", + SearchOptions: &internal.SearchOptions{ + Keyword: "", }, + Expected: allResults, + }, + { + Name: "whitespace keyword", + SearchOptions: &internal.SearchOptions{ + Keyword: " ", + }, + Expected: allResults, + }, + { + Name: "dangling slash in keyword", + SearchOptions: &internal.SearchOptions{ + Keyword: "\\", + }, + Expected: allResults, + }, + { + Name: "dangling quote in keyword", + SearchOptions: &internal.SearchOptions{ + Keyword: "\"", + }, + Expected: allResults, }, { Name: "empty",