Patch for backend/src/middleware/hashtags/extractHashtags.js

Purpose: let hashtag ids contain Unicode letters. The previous native regex
literal used \p{L} without the `u` flag (so it never matched a letter class)
and carried the stateful `g` flag. It is replaced by a pattern compiled and
executed through XRegExp, which understands \pL. Each link URL is now tested
once (`if`) instead of in a `while` loop, which is sufficient because the
pattern is anchored with ^ and $. Also fixes comment typos.

NOTE(review): this patch was reconstructed from a copy whose line breaks and
indentation had been lost. Line counts agree with the hunk headers, but the
whitespace of context/removed lines (2-space indent, comment alignment, the
position of the blank line) is assumed. Verify against the target file, or
apply with a whitespace-tolerant option, before relying on it.

diff --git a/backend/src/middleware/hashtags/extractHashtags.js b/backend/src/middleware/hashtags/extractHashtags.js
index d13511e53..04c0322c6 100644
--- a/backend/src/middleware/hashtags/extractHashtags.js
+++ b/backend/src/middleware/hashtags/extractHashtags.js
@@ -1,17 +1,21 @@
 import cheerio from 'cheerio'
+import {
+  exec,
+  build
+} from 'xregexp/xregexp-all.js'
 // formats of a Hashtag:
 //   https://en.wikipedia.org/w/index.php?title=Hashtag&oldid=905141980#Style
 // here:
 //    0. Search for whole string.
-//    1. Hashtag has only 'a-z', 'A-Z', and '0-9'.
-//    2. If it starts with a digit '0-9' than 'a-z', or 'A-Z' has to follow.
-const ID_REGEX = /^\/search\/hashtag\/((\p{L}+[\p{L}0-9]*)|([0-9]+\p{L}+[\p{L}0-9]*))$/g
+//    1. Hashtag consists only of Unicode letters and digits '0-9'.
+//    2. If it starts with a digit '0-9', then a Unicode letter has to follow.
+const regX = build('^\/search\/hashtag\/((\\pL+[\\pL0-9]*)|([0-9]+\\pL+[\\pL0-9]*))$')
 
 export default function (content) {
   if (!content) return []
   const $ = cheerio.load(content)
   // We can not search for class '.hashtag', because the classes are removed at the 'xss' middleware.
-  // But we have to know, which Hashtags are removed from the content es well, so we search for the 'a' html-tag.
+  // But we have to know, which Hashtags are removed from the content as well, so we search for the 'a' html-tag.
   const urls = $('a')
     .map((_, el) => {
       return $(el).attr('href')
@@ -20,7 +24,7 @@ export default function (content) {
   const hashtags = []
   urls.forEach(url => {
     let match
-    while ((match = ID_REGEX.exec(url)) != null) {
+    if ((match = exec(url, regX)) != null) {
       hashtags.push(match[1])
     }
   })