mirror of
https://github.com/IT4Change/Ocelot-Social.git
synced 2025-12-13 07:45:56 +00:00
Refactor the extraction of hashtag out of a posts content
Co-Authored-By: mattwr18 <mattwr18@gmail.com>
This commit is contained in:
parent
0bc4c558ae
commit
327105da7a
@ -1,17 +1,21 @@
|
||||
import cheerio from 'cheerio'
|
||||
import {
|
||||
exec,
|
||||
build
|
||||
} from 'xregexp/xregexp-all.js'
|
||||
// formats of a Hashtag:
|
||||
// https://en.wikipedia.org/w/index.php?title=Hashtag&oldid=905141980#Style
|
||||
// here:
|
||||
// 0. Search for whole string.
|
||||
// 1. Hashtag has only 'a-z', 'A-Z', and '0-9'.
|
||||
// 1. Hashtag consists only of unicode letters and the digits '0-9'.
|
||||
// 2. If it starts with a digit '0-9', then a unicode letter has to follow.
|
||||
const ID_REGEX = /^\/search\/hashtag\/((\p{L}+[\p{L}0-9]*)|([0-9]+\p{L}+[\p{L}0-9]*))$/g
|
||||
const regX = build('^\/search\/hashtag\/((\\pL+[\\pL0-9]*)|([0-9]+\\pL+[\\pL0-9]*))$')
|
||||
|
||||
export default function (content) {
|
||||
if (!content) return []
|
||||
const $ = cheerio.load(content)
|
||||
// We cannot search for the class '.hashtag', because the classes are removed by the 'xss' middleware.
|
||||
// But we have to know, which Hashtags are removed from the content es well, so we search for the 'a' html-tag.
|
||||
// But we also have to know which hashtags were removed from the content, so we search for the 'a' HTML tag.
|
||||
const urls = $('a')
|
||||
.map((_, el) => {
|
||||
return $(el).attr('href')
|
||||
@ -20,7 +24,7 @@ export default function (content) {
|
||||
const hashtags = []
|
||||
urls.forEach(url => {
|
||||
let match
|
||||
while ((match = ID_REGEX.exec(url)) != null) {
|
||||
if ((match = exec(url, regX)) != null) {
|
||||
hashtags.push(match[1])
|
||||
}
|
||||
})
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user