From b2ea4df2947cf116673fa4d662cd34ad60ce8038 Mon Sep 17 00:00:00 2001 From: Moriz Wahl Date: Mon, 16 Mar 2020 01:36:16 +0100 Subject: [PATCH] refactored queryString, specs for queryString --- backend/src/schema/resolvers/searches.js | 2 +- .../schema/resolvers/searches/queryString.js | 75 +++++++++++-------- .../resolvers/searches/queryString.spec.js | 38 +++++++++- 3 files changed, 78 insertions(+), 37 deletions(-) diff --git a/backend/src/schema/resolvers/searches.js b/backend/src/schema/resolvers/searches.js index ba67bb2d0..5e062f45a 100644 --- a/backend/src/schema/resolvers/searches.js +++ b/backend/src/schema/resolvers/searches.js @@ -1,5 +1,5 @@ import log from './helpers/databaseLogger' -import queryString from './searches/queryString' +import { queryString } from './searches/queryString' // see http://lucene.apache.org/core/8_3_1/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#package.description diff --git a/backend/src/schema/resolvers/searches/queryString.js b/backend/src/schema/resolvers/searches/queryString.js index 6735b54c0..3e53bfec9 100644 --- a/backend/src/schema/resolvers/searches/queryString.js +++ b/backend/src/schema/resolvers/searches/queryString.js @@ -1,41 +1,50 @@ -export default function queryString(str) { - // match the whole text exactly +export function queryString(str) { const normalizedString = normalizeWhitespace(str) - const escapedString = escapeSpecialCharacters(normalizedString) - let result = quoteString(escapedString) + '^8' - // match each word exactly - if (escapedString.includes(' ')) { - result += ' OR (' - escapedString.split(' ').forEach((s, i) => { - result += i === 0 ? quoteString(s) : ' AND ' + quoteString(s) - }) - result += ')^4' - } - // match at least one word exactly - if (escapedString.includes(' ')) { - escapedString.split(' ').forEach(s => { - result += ' OR ' + quoteString(s) + '^2' - }) - } - // start globbing ... - escapedString.split(' ').forEach(s => { - if (s.length > 3) { - // at least 4 letters. So AND, OR and NOT are never used unquoted - result += ' OR ' + s + '*' + const escapedString = escapeSpecialCharacters(normalizedString) + return ` +${matchWholeText(escapedString)} +${matchEachWordExactly(escapedString)} +${matchSomeWordsExactly(escapedString)} +${matchBeginningOfWords(escapedString)} +` +} + +const matchWholeText = (str, boost = 8) => { + return `"${str}"^${boost}` +} + +const matchEachWordExactly = (str, boost = 4) => { + if (str.includes(' ')) { + let tmp = str.split(' ').map((s, i) => i === 0 ? `"${s}"` : `AND "${s}"`).join(' ') + return `(${tmp})^${boost}` + } else { + return '' } - }) - // now we could become fuzzy using ~ - return result } -const normalizeWhitespace = str => { - return str.replace(/\s+/g, ' ') +const matchSomeWordsExactly = (str, boost = 2) => { + if (str.includes(' ')) { + return str.split(' ').map(s => `"${s}"^${boost}`).join(' ') + } else { + return '' + } } -const escapeSpecialCharacters = str => { +const matchBeginningOfWords = str => { + return normalizeWhitespace(str.split(' ').map(s => { + if (s.length > 3) { + // at least 4 letters. So AND, OR and NOT are never used unquoted + return s + '*' + } else { + return '' + } + }).join(' ')) +} + +export function normalizeWhitespace(str) { + return str.replace(/\s+/g, ' ').trim() +} + +export function escapeSpecialCharacters(str) { return str.replace(/(["[\]&|\\{}+!()^~*?:/-])/g, '\\$1') } - -const quoteString = str => { - return '"' + str + '"' -} diff --git a/backend/src/schema/resolvers/searches/queryString.spec.js b/backend/src/schema/resolvers/searches/queryString.spec.js index c5133b631..e431df90f 100644 --- a/backend/src/schema/resolvers/searches/queryString.spec.js +++ b/backend/src/schema/resolvers/searches/queryString.spec.js @@ -1,10 +1,42 @@ -import queryString from './queryString' +import { queryString, escapeSpecialCharacters, normalizeWhitespace } from './queryString' describe('queryString', () => { + describe('special characters', () => { + it('does escaping correctly', () => { + expect(escapeSpecialCharacters('+ - && || ! ( ) { } [ ] ^ " ~ * ? : \\ / ')) + .toEqual('\\+ \\- \\&\\& \\|\\| \\! \\( \\) \\{ \\} \\[ \\] \\^ \\" \\~ \\* \\? \\: \\\\ \\/ ') + }) + }) + + describe('whitespace', () => { + it('is normalized correctly', () => { + expect(normalizeWhitespace(' a \t \n b \n ')) + .toEqual('a b') + }) + }) + describe('exact match', () => { - it.skip('boosts score by factor 8', () => { + it('boosts score by factor 8', () => { expect(queryString('a couple of words')).toContain('"a couple of words"^8') }) - it.todo('implement more cases here') }) + + describe('match all words exactly', () => { + it('boosts score by factor 4', () => { + expect(queryString('a couple of words')).toContain('("a" AND "couple" AND "of" AND "words")^4') + }) + }) + + describe('match at least one word exactly', () => { + it('boosts score by factor 2', () => { + expect(queryString('a couple of words')).toContain('"a"^2 "couple"^2 "of"^2 "words"^2') + }) + }) + + describe('globbing for longer words', () => { + it('globs words with more than three characters', () => { + expect(queryString('a couple of words')).toContain('couple* words*') + }) + }) + })