refactored queryString, specs for queryString

This commit is contained in:
Moriz Wahl 2020-03-16 01:36:16 +01:00 committed by mattwr18
parent 46fca229ec
commit b2ea4df294
3 changed files with 78 additions and 37 deletions

View File

@ -1,5 +1,5 @@
import log from './helpers/databaseLogger'
import queryString from './searches/queryString'
import { queryString } from './searches/queryString'
// see http://lucene.apache.org/core/8_3_1/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#package.description

View File

@ -1,41 +1,50 @@
// Builds the search query for `str`: whitespace is normalized and special
// characters are escaped first, then several clauses are combined -- the whole
// text as a quoted phrase, every word, individual words, and word prefixes
// followed by `*`.
// NOTE(review): two versions of the body are visible in this span -- one that
// accumulates `result` with `+=` and ' OR ', and one that returns a template
// literal built from the match* helpers. This looks like both sides of a diff
// shown together; confirm which version is live before editing.
export default function queryString(str) {
// match the whole text exactly
export function queryString(str) {
const normalizedString = normalizeWhitespace(str)
const escapedString = escapeSpecialCharacters(normalizedString)
let result = quoteString(escapedString) + '^8'
// match each word exactly
if (escapedString.includes(' ')) {
result += ' OR ('
escapedString.split(' ').forEach((s, i) => {
result += i === 0 ? quoteString(s) : ' AND ' + quoteString(s)
})
result += ')^4'
}
// match at least one word exactly
if (escapedString.includes(' ')) {
escapedString.split(' ').forEach(s => {
result += ' OR ' + quoteString(s) + '^2'
})
}
// start globbing ...
escapedString.split(' ').forEach(s => {
if (s.length > 3) {
// at least 4 letters. So AND, OR and NOT are never used unquoted
result += ' OR ' + s + '*'
const escapedString = escapeSpecialCharacters(normalizedString)
return `
${matchWholeText(escapedString)}
${matchEachWordExactly(escapedString)}
${matchSomeWordsExactly(escapedString)}
${matchBeginningOfWords(escapedString)}
`
}
/**
 * Builds the clause that matches the complete search text as one quoted phrase.
 *
 * @param {string} str - search text to put between double quotes
 * @param {number} [boost=8] - factor appended after `^`
 * @returns {string} `"<str>"^<boost>`
 */
const matchWholeText = (str, boost = 8) => `"${str}"^${boost}`
// Returns a clause that lists every space-separated word of `str` as a quoted
// term, joined with AND, wrapped in parentheses and followed by `^boost`
// (default 4), e.g. ("a" AND "couple")^4. Returns '' when `str` contains no
// space, i.e. there is only one word.
// NOTE(review): the `})`, the remark about fuzzy search and `return result`
// near the end are not part of this arrow function's if/else -- they look like
// leftover lines from another version of queryString; confirm.
const matchEachWordExactly = (str, boost = 4) => {
if (str.includes(' ')) {
let tmp = str.split(' ').map((s, i) => i === 0 ? `"${s}"` : `AND "${s}"`).join(' ')
return `(${tmp})^${boost}`
} else {
return ''
}
})
// now we could become fuzzy using ~
return result
}
const normalizeWhitespace = str => {
return str.replace(/\s+/g, ' ')
/**
 * Builds one boosted, quoted term per word so that any single word can match.
 *
 * @param {string} str - search text with words separated by single spaces
 * @param {number} [boost=2] - factor appended after `^` on every term
 * @returns {string} the terms joined by spaces, or '' when `str` has no space
 */
const matchSomeWordsExactly = (str, boost = 2) => {
  // A single word needs no per-word terms.
  if (!str.includes(' ')) return ''

  const boostedTerms = str.split(' ').map(word => `"${word}"^${boost}`)
  return boostedTerms.join(' ')
}
const escapeSpecialCharacters = str => {
/**
 * Builds prefix (glob) terms for the longer words of the search text.
 *
 * @param {string} str - search text with words separated by single spaces
 * @returns {string} every word longer than three characters followed by `*`,
 *   separated by single spaces
 */
const matchBeginningOfWords = str => {
  const globs = []
  for (const word of str.split(' ')) {
    // Only words of at least 4 letters are globbed, so AND, OR and NOT are
    // never used unquoted. Shorter words leave an empty slot.
    globs.push(word.length > 3 ? `${word}*` : '')
  }
  // Empty slots turn into runs of spaces when joined; normalizeWhitespace
  // collapses and trims them.
  return normalizeWhitespace(globs.join(' '))
}
/**
 * Collapses every run of whitespace into a single space and strips leading
 * and trailing whitespace.
 *
 * @param {string} str - raw text
 * @returns {string} text whose words are separated by exactly one space
 */
export function normalizeWhitespace(str) {
  const collapsed = str.replace(/\s+/g, ' ')
  return collapsed.trim()
}
/**
 * Prefixes every query-syntax special character with a backslash.
 *
 * Escaped characters: " [ ] & | \ { } + ! ( ) ^ ~ * ? : / -
 *
 * @param {string} str - text to escape
 * @returns {string} the text with each special character preceded by `\`
 */
export function escapeSpecialCharacters(str) {
  const specialCharacter = /(["[\]&|\\{}+!()^~*?:/-])/g
  // '$1' re-inserts the matched character after the added backslash.
  return str.replace(specialCharacter, '\\$1')
}
/**
 * Wraps a value in double quotes.
 *
 * @param {string} str - text to quote
 * @returns {string} `str` with a `"` before and after it
 */
const quoteString = str => '"' + str + '"'

View File

@ -1,10 +1,42 @@
import queryString from './queryString'
import { queryString, escapeSpecialCharacters, normalizeWhitespace } from './queryString'
// Specs for the query-string helpers: escaping of special characters,
// whitespace normalization, and the clauses queryString() emits (whole text
// boosted by 8, all words boosted by 4, single words boosted by 2, and
// globbing of words longer than three characters).
// NOTE(review): the case 'boosts score by factor 8' is opened twice below,
// once with it.skip and once with it -- this looks like both sides of a diff
// shown together; confirm which one is intended.
describe('queryString', () => {
describe('special characters', () => {
it('does escaping correctly', () => {
expect(escapeSpecialCharacters('+ - && || ! ( ) { } [ ] ^ " ~ * ? : \\ / '))
.toEqual('\\+ \\- \\&\\& \\|\\| \\! \\( \\) \\{ \\} \\[ \\] \\^ \\" \\~ \\* \\? \\: \\\\ \\/ ')
})
})
describe('whitespace', () => {
it('is normalized correctly', () => {
expect(normalizeWhitespace(' a \t \n b \n '))
.toEqual('a b')
})
})
describe('exact match', () => {
it.skip('boosts score by factor 8', () => {
it('boosts score by factor 8', () => {
expect(queryString('a couple of words')).toContain('"a couple of words"^8')
})
it.todo('implement more cases here')
})
describe('match all words exactly', () => {
it('boosts score by factor 4', () => {
expect(queryString('a couple of words')).toContain('("a" AND "couple" AND "of" AND "words")^4')
})
})
describe('match at least one word exactly', () => {
it('boosts score by factor 2', () => {
expect(queryString('a couple of words')).toContain('"a"^2 "couple"^2 "of"^2 "words"^2')
})
})
describe('globbing for longer words', () => {
it('globs words with more than three characters', () => {
expect(queryString('a couple of words')).toContain('couple* words*')
})
})
})