From b2ea4df2947cf116673fa4d662cd34ad60ce8038 Mon Sep 17 00:00:00 2001
From: Moriz Wahl <moriz.wahl@gmx.de>
Date: Mon, 16 Mar 2020 01:36:16 +0100
Subject: [PATCH] refactored queryString, specs for queryString

---
 backend/src/schema/resolvers/searches.js      |  2 +-
 .../schema/resolvers/searches/queryString.js  | 75 +++++++++++--------
 .../resolvers/searches/queryString.spec.js    | 38 +++++++++-
 3 files changed, 78 insertions(+), 37 deletions(-)

diff --git a/backend/src/schema/resolvers/searches.js b/backend/src/schema/resolvers/searches.js
index ba67bb2d0..5e062f45a 100644
--- a/backend/src/schema/resolvers/searches.js
+++ b/backend/src/schema/resolvers/searches.js
@@ -1,5 +1,5 @@
 import log from './helpers/databaseLogger'
-import queryString from './searches/queryString'
+import { queryString } from './searches/queryString'
 
 // see http://lucene.apache.org/core/8_3_1/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#package.description
 
diff --git a/backend/src/schema/resolvers/searches/queryString.js b/backend/src/schema/resolvers/searches/queryString.js
index 6735b54c0..3e53bfec9 100644
--- a/backend/src/schema/resolvers/searches/queryString.js
+++ b/backend/src/schema/resolvers/searches/queryString.js
@@ -1,41 +1,50 @@
-export default function queryString(str) {
-  // match the whole text exactly
+export function queryString(str) { // builds the search query: one clause per line
   const normalizedString = normalizeWhitespace(str)
-  const escapedString = escapeSpecialCharacters(normalizedString)
-  let result = quoteString(escapedString) + '^8'
-  // match each word exactly
-  if (escapedString.includes(' ')) {
-    result += ' OR ('
-    escapedString.split(' ').forEach((s, i) => {
-      result += i === 0 ? quoteString(s) : ' AND ' + quoteString(s)
-    })
-    result += ')^4'
-  }
-  // match at least one word exactly
-  if (escapedString.includes(' ')) {
-    escapedString.split(' ').forEach(s => {
-      result += ' OR ' + quoteString(s) + '^2'
-    })
-  }
-  // start globbing ...
-  escapedString.split(' ').forEach(s => {
-    if (s.length > 3) {
-      // at least 4 letters. So AND, OR and NOT are never used unquoted
-      result += ' OR ' + s + '*'
+  const escapedString = escapeSpecialCharacters(normalizedString)
+  return `
+${matchWholeText(escapedString)}
+${matchEachWordExactly(escapedString)}
+${matchSomeWordsExactly(escapedString)}
+${matchBeginningOfWords(escapedString)}
+`
+}
+
+// Phrase clause for the complete search text.
+// `boost` is appended after `^` as the clause's boost factor.
+const matchWholeText = (str, boost = 8) => `"${str}"^${boost}`
+
+const matchEachWordExactly = (str, boost = 4) => {
+    if (str.includes(' ')) {
+        const quotedWords = str.split(' ').map(s => `"${s}"`) // every word as its own phrase
+        return `(${quotedWords.join(' AND ')})^${boost}` // all of them are required
+    } else {
+        return '' // single-word input: no extra clause
     }
-  })
-  // now we could become fuzzy using ~
-  return result
 }
 
-const normalizeWhitespace = str => {
-  return str.replace(/\s+/g, ' ')
+// One boosted phrase per word, e.g. `"a"^2 "b"^2`; empty string for single-word input.
+const matchSomeWordsExactly = (str, boost = 2) => {
+  if (!str.includes(' ')) {
+    return ''
+  }
+  return str.split(' ').map(s => `"${s}"^${boost}`).join(' ')
 }
 
-const escapeSpecialCharacters = str => {
+// Prefix ("globbing") clause: every word longer than three characters gets a trailing `*`.
+const matchBeginningOfWords = str => {
+  const globbedWords = str
+    .split(' ')
+    // at least 4 letters. So AND, OR and NOT are never used unquoted
+    .filter(s => s.length > 3)
+    .map(s => `${s}*`)
+  // keep the result trimmed and single-spaced, whatever whitespace the input carried
+  return normalizeWhitespace(globbedWords.join(' '))
+}
+
+export function normalizeWhitespace(str) {
+  return str.trim().replace(/\s+/g, ' ') // strip both ends, then squeeze inner whitespace runs
+}
+
+export function escapeSpecialCharacters(str) { // puts a backslash before every character in the class below
   return str.replace(/(["[\]&|\\{}+!()^~*?:/-])/g, '\\$1')
 }
-
-const quoteString = str => {
-  return '"' + str + '"'
-}
diff --git a/backend/src/schema/resolvers/searches/queryString.spec.js b/backend/src/schema/resolvers/searches/queryString.spec.js
index c5133b631..e431df90f 100644
--- a/backend/src/schema/resolvers/searches/queryString.spec.js
+++ b/backend/src/schema/resolvers/searches/queryString.spec.js
@@ -1,10 +1,42 @@
-import queryString from './queryString'
+import { queryString, escapeSpecialCharacters, normalizeWhitespace } from './queryString'
 
 describe('queryString', () => {
+  describe('special characters', () => {
+    it('does escaping correctly', () => {
+      expect(escapeSpecialCharacters('+ - && || ! ( ) { } [ ] ^ " ~ * ? : \\ / '))
+        .toEqual('\\+ \\- \\&\\& \\|\\| \\! \\( \\) \\{ \\} \\[ \\] \\^ \\" \\~ \\* \\? \\: \\\\ \\/ ')
+    })
+  })
+
+  describe('whitespace', () => {
+    it('is normalized correctly', () => {
+      expect(normalizeWhitespace(' a \t \n b \n   '))
+        .toEqual('a b')
+    })
+  })
+
   describe('exact match', () => {
-    it.skip('boosts score by factor 8', () => {
+    it('boosts score by factor 8', () => {
       expect(queryString('a couple of words')).toContain('"a couple of words"^8')
     })
-    it.todo('implement more cases here')
   })
+
+  describe('match all words exactly', () => {
+    it('boosts score by factor 4', () => {
+      expect(queryString('a couple of words')).toContain('("a" AND "couple" AND "of" AND "words")^4')
+    })
+  })
+
+  describe('match at least one word exactly', () => {
+    it('boosts score by factor 2', () => {
+      expect(queryString('a couple of words')).toContain('"a"^2 "couple"^2 "of"^2 "words"^2')
+    })
+  })
+
+  describe('globbing for longer words', () => {
+    it('globs words with more than three characters', () => {
+      // 'a' and 'of' have fewer than four characters, so they get no trailing '*'
+      expect(queryString('a couple of words')).toContain('couple* words*')
+    })
+  })
 })