Refactor the extraction of hashtags from a post's content

Co-Authored-By: mattwr18 <mattwr18@gmail.com>
This commit is contained in:
Wolfgang Huß 2019-08-29 19:02:44 +02:00
parent 0bc4c558ae
commit 327105da7a

View File

@ -1,17 +1,21 @@
import cheerio from 'cheerio' import cheerio from 'cheerio'
import {
exec,
build
} from 'xregexp/xregexp-all.js'
// formats of a Hashtag: // formats of a Hashtag:
// https://en.wikipedia.org/w/index.php?title=Hashtag&oldid=905141980#Style // https://en.wikipedia.org/w/index.php?title=Hashtag&oldid=905141980#Style
// here: // here:
// 0. Search for whole string. // 0. Search for whole string.
// 1. Hashtag has only 'a-z', 'A-Z', and '0-9'. // 1. Hashtag consists only of Unicode letters and digits '0-9'.
// 2. If it starts with a digit '0-9' than 'a-z', or 'A-Z' has to follow. // 2. If it starts with a digit '0-9', then a Unicode letter has to follow.
const ID_REGEX = /^\/search\/hashtag\/((\p{L}+[\p{L}0-9]*)|([0-9]+\p{L}+[\p{L}0-9]*))$/g const regX = build('^\/search\/hashtag\/((\\pL+[\\pL0-9]*)|([0-9]+\\pL+[\\pL0-9]*))$')
export default function (content) { export default function (content) {
if (!content) return [] if (!content) return []
const $ = cheerio.load(content) const $ = cheerio.load(content)
// We can not search for class '.hashtag', because the classes are removed at the 'xss' middleware. // We can not search for class '.hashtag', because the classes are removed at the 'xss' middleware.
// But we have to know, which Hashtags are removed from the content es well, so we search for the 'a' html-tag. // But we have to know, which Hashtags are removed from the content as well, so we search for the 'a' html-tag.
const urls = $('a') const urls = $('a')
.map((_, el) => { .map((_, el) => {
return $(el).attr('href') return $(el).attr('href')
@ -20,7 +24,7 @@ export default function (content) {
const hashtags = [] const hashtags = []
urls.forEach(url => { urls.forEach(url => {
let match let match
while ((match = ID_REGEX.exec(url)) != null) { if ((match = exec(url, regX)) != null) {
hashtags.push(match[1]) hashtags.push(match[1])
} }
}) })