fix(lib): improve markdown processing and truncation

- Add convertNakedUrls() to skip URLs already inside markdown links
- Rewrite truncateMarkdown() with token-aware truncation
- Add @tiptap/markdown support to VideoEmbed, ItemMention, Hashtag
- Fix double-conversion of URLs in existing links
- Fix truncation cutting tokens in the middle
- Fix eslint warnings with proper types

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Anton Tranelis 2026-01-15 08:50:32 +01:00
parent 3a8477f863
commit 3d7307b759
4 changed files with 286 additions and 67 deletions

View File

@ -9,11 +9,7 @@ import { useGetItemColor } from '#components/Map/hooks/useItemColor'
import { useItems } from '#components/Map/hooks/useItems'
import { useTags } from '#components/Map/hooks/useTags'
import { Hashtag, ItemMention, VideoEmbed } from '#components/TipTap/extensions'
import {
preprocessMarkdown,
removeMarkdownSyntax,
truncateMarkdown,
} from '#components/TipTap/utils/preprocessMarkdown'
import { removeMarkdownSyntax, truncateMarkdown } from '#components/TipTap/utils/preprocessMarkdown'
import type { Item } from '#types/Item'
@ -80,8 +76,9 @@ export const TextView = ({
}),
VideoEmbed,
],
// Preprocess markdown to convert hashtags and item mentions to HTML
content: preprocessMarkdown(innerText),
// Load content as markdown - the extensions' markdownTokenizer handles parsing
content: innerText,
contentType: 'markdown',
editable: false,
editorProps: {
attributes: {
@ -94,7 +91,7 @@ export const TextView = ({
// Update content when text changes
useEffect(() => {
editor.commands.setContent(preprocessMarkdown(innerText))
editor.commands.setContent(innerText, { contentType: 'markdown' })
}, [editor, innerText])
// Handle link clicks for internal navigation

View File

@ -5,11 +5,7 @@ import { useAddFilterTag } from '#components/Map/hooks/useFilter'
import { useGetItemColor } from '#components/Map/hooks/useItemColor'
import { useItems } from '#components/Map/hooks/useItems'
import { useTags } from '#components/Map/hooks/useTags'
import {
preprocessMarkdown,
removeMarkdownSyntax,
truncateMarkdown,
} from '#components/TipTap/utils/preprocessMarkdown'
import { preprocessMarkdown, truncateMarkdown } from '#components/TipTap/utils/preprocessMarkdown'
import { simpleMarkdownToHtml } from '#components/TipTap/utils/simpleMarkdownToHtml'
import type { Item } from '#types/Item'
@ -60,17 +56,22 @@ export const TextViewStatic = ({
innerText = text
}
// Apply truncation if needed
if (innerText && truncate) {
innerText = truncateMarkdown(removeMarkdownSyntax(innerText), 100)
}
// Pre-process and convert to HTML
// Pre-process markdown first (converts naked URLs to links, etc.)
// Then truncate the processed markdown
// Finally convert to HTML
const html = useMemo(() => {
if (!innerText) return ''
const processed = preprocessMarkdown(innerText)
// First preprocess to normalize all URLs/mentions/hashtags
let processed = preprocessMarkdown(innerText)
// Then truncate if needed (works on normalized markdown)
if (truncate) {
processed = truncateMarkdown(processed, 100)
}
return simpleMarkdownToHtml(processed, tags, { items, getItemColor })
}, [innerText, tags, items, getItemColor])
}, [innerText, truncate, tags, items, getItemColor])
// Handle clicks for internal navigation and hashtags
useEffect(() => {

View File

@ -55,28 +55,81 @@ export const VideoEmbed = Node.create<VideoEmbedOptions>({
}
},
addStorage() {
// Markdown tokenizer for @tiptap/markdown - recognizes video autolinks such as
// <https://www.youtube.com/watch?v=ID>, <https://youtu.be/ID> and <https://rumble.com/embed/ID>
markdownTokenizer: {
  name: 'videoEmbed',
  level: 'inline',
  // Fast hint for the lexer: index of the first position where a video embed
  // might start, or -1 when there is none. The hint accepts the same prefixes
  // as `tokenize` (http or https, optional "www." for YouTube), so every form
  // that `tokenize` can parse is also announced to the lexer.
  start: (src: string) =>
    src.search(/<https?:\/\/(?:(?:www\.)?youtube\.com\/watch|youtu\.be\/|rumble\.com\/embed\/)/),
  tokenize: (src: string) => {
    // Every pattern is anchored at the start of `src` and captures the video id.
    // Whatever follows the id up to the closing ">" (e.g. extra query
    // parameters) becomes part of `raw` but is otherwise ignored.
    const providers = [
      {
        // YouTube autolinks: <https://www.youtube.com/watch?v=VIDEO_ID>
        provider: 'youtube',
        pattern: /^<https?:\/\/(?:www\.)?youtube\.com\/watch\?v=([a-zA-Z0-9_-]{11})[^>]*>/,
      },
      {
        // YouTube short autolinks: <https://youtu.be/VIDEO_ID>
        provider: 'youtube',
        pattern: /^<https?:\/\/youtu\.be\/([a-zA-Z0-9_-]{11})[^>]*>/,
      },
      {
        // Rumble autolinks: <https://rumble.com/embed/VIDEO_ID>
        provider: 'rumble',
        pattern: /^<https?:\/\/rumble\.com\/embed\/([a-zA-Z0-9]+)[^>]*>/,
      },
    ]
    for (const { provider, pattern } of providers) {
      const match = pattern.exec(src)
      if (match) {
        return {
          type: 'videoEmbed',
          raw: match[0],
          provider,
          videoId: match[1],
        }
      }
    }
    // Not a video autolink at this position
    return undefined
  },
},
// Parse Markdown token to Tiptap JSON
parseMarkdown(token: { provider: string; videoId: string }) {
return {
markdown: {
serialize(
state: { write: (text: string) => void },
node: { attrs: { provider: string; videoId: string } },
) {
const { provider, videoId } = node.attrs
const url =
provider === 'youtube'
? `https://www.youtube.com/watch?v=${videoId}`
: `https://rumble.com/embed/${videoId}`
// Write as markdown autolink
state.write(`<${url}>`)
},
parse: {
// Parsing is handled by preprocessVideoLinks
},
type: 'videoEmbed',
attrs: {
provider: token.provider,
videoId: token.videoId,
},
}
},
// Turn a videoEmbed node back into its Markdown form: an autolink wrapping
// the provider URL. Any provider other than 'youtube' is written as a Rumble
// embed URL.
renderMarkdown(node: { attrs: { provider: string; videoId: string } }) {
  const { provider, videoId } = node.attrs
  let url: string
  if (provider === 'youtube') {
    url = `https://www.youtube.com/watch?v=${videoId}`
  } else {
    url = `https://rumble.com/embed/${videoId}`
  }
  return `<${url}>`
},
addAttributes() {
return {
provider: {

View File

@ -4,6 +4,68 @@ import { fixUrls, mailRegex } from '#utils/ReplaceURLs'
import type { JSONContent, Extensions } from '@tiptap/core'
/**
 * Converts naked http(s) URLs to markdown links, leaving alone URLs that are
 * already part of a markdown link `[text](url)` or an autolink `<url>`.
 *
 * The link label is the URL without its scheme and a leading "www.".
 * Punctuation directly after a URL ("see https://example.com.") is not treated
 * as part of the URL; it stays in the surrounding text.
 *
 * @param text - Markdown source to scan.
 * @returns The text with every naked URL rewritten as `[label](url)`.
 */
function convertNakedUrls(text: string): string {
  // Collect the ranges of existing links; URLs inside them must not be touched.
  const skipRanges: { start: number; end: number }[] = []
  const existingLinkPatterns = [
    /\[[^\]]*\]\([^)]+\)/g, // markdown links: [text](url)
    /<https?:\/\/[^>]+>/g, // autolinks: <url>
  ]
  for (const pattern of existingLinkPatterns) {
    let linkMatch: RegExpExecArray | null
    while ((linkMatch = pattern.exec(text)) !== null) {
      skipRanges.push({ start: linkMatch.index, end: linkMatch.index + linkMatch[0].length })
    }
  }

  const urlRegex = /https?:\/\/[^\s)<>\]]+/g
  // Sentence/emphasis punctuation that the greedy URL pattern picks up at the
  // end of a match but that belongs to the surrounding text, not the URL.
  const trailingPunctuation = /[.,;:!?*_~'"]+$/

  let result = ''
  let lastIndex = 0
  let urlMatch: RegExpExecArray | null

  while ((urlMatch = urlRegex.exec(text)) !== null) {
    const urlStart = urlMatch.index
    const matchEnd = urlStart + urlMatch[0].length

    // Already part of a link - keep as-is
    const isInsideSkipRange = skipRanges.some(
      (range) => urlStart >= range.start && matchEnd <= range.end,
    )
    if (isInsideSkipRange) {
      continue
    }

    const url = urlMatch[0].replace(trailingPunctuation, '')
    const displayText = url.replace(/^https?:\/\/(www\.)?/, '')
    if (displayText === '') {
      // Only the scheme is left after trimming (e.g. "https://...") - not a URL
      continue
    }

    // Copy the text before the URL, then the URL as a markdown link. Trimmed
    // punctuation is picked up by the next slice because lastIndex stops
    // right after the trimmed URL.
    result += text.slice(lastIndex, urlStart)
    result += `[${displayText}](${url})`
    lastIndex = urlStart + url.length
  }

  // Add remaining text
  result += text.slice(lastIndex)

  return result
}
/**
* Converts pre-processed markdown/HTML to TipTap JSON format.
* Creates a temporary editor instance to parse the content.
@ -38,11 +100,9 @@ export function preprocessMarkdown(text: string): string {
result = fixUrls(result)
// 2. Convert naked URLs to markdown links
// Match URLs that are NOT already inside markdown link syntax
result = result.replace(
/(?<!\]?\()(?<!<)https?:\/\/[^\s)]+(?!\))(?!>)/g,
(url) => `[${url.replace(/https?:\/\/w{3}\./gi, '')}](${url})`,
)
// Skip URLs that are already inside markdown link syntax [text](url) or autolinks <url>
// Process the text in segments to avoid matching URLs inside existing links
result = convertNakedUrls(result)
// 3. Convert email addresses to mailto links
result = result.replace(mailRegex, (email) => `[${email}](mailto:${email})`)
@ -148,20 +208,30 @@ export function preprocessItemMentions(text: string): string {
/**
* Removes markdown syntax for plain text display (used for truncation calculation).
* Preserves @mentions ([@Label](/item/id)) and #hashtags for rendering.
*/
export function removeMarkdownSyntax(text: string): string {
return text
.replace(/!\[.*?\]\(.*?\)/g, '') // Remove images
.replace(/(`{1,3})(.*?)\1/g, '$2') // Remove inline code
.replace(/(\*{1,2}|_{1,2})(.*?)\1/g, '$2') // Remove bold and italic
.replace(/(#+)\s+(.*)/g, '$2') // Remove headers
.replace(/>\s+(.*)/g, '$1') // Remove blockquotes
.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // Remove links, keep text
.replace(/<[^>]+>/g, '') // Remove HTML tags
return (
text
.replace(/!\[.*?\]\(.*?\)/g, '') // Remove images
.replace(/(`{1,3})(.*?)\1/g, '$2') // Remove inline code
.replace(/(\*{1,2}|_{1,2})(.*?)\1/g, '$2') // Remove bold and italic
.replace(/(#+)\s+(.*)/g, '$2') // Remove headers
.replace(/>\s+(.*)/g, '$1') // Remove blockquotes
// Remove regular links but preserve @mentions ([@Label](/item/...))
.replace(/\[([^\]]+)\]\((?!\/item\/)[^)]+\)/g, '$1')
.replace(/<[^>]+>/g, '')
) // Remove HTML tags
}
/**
* Truncates text to a character limit, respecting paragraph boundaries.
* Truncates text to a character limit based on visible/plain text length.
* Preserves complete tokens - won't cut in the middle of:
* - @mentions: [@Label](/item/id)
* - #hashtags: #tagname
* - Links: [text](url)
*
* The limit applies to the rendered/visible text, not the raw markdown.
*/
export function truncateMarkdown(text: string, limit: number): string {
const plainText = removeMarkdownSyntax(text)
@ -170,26 +240,124 @@ export function truncateMarkdown(text: string, limit: number): string {
return text
}
let truncated = ''
let length = 0
// Tokenize the text into segments: either special tokens or plain text
// This allows us to count visible characters correctly
// Order matters: more specific patterns first
const tokenPatterns = [
{ pattern: /\[@([^\]]+?)\]\(\/?item\/[^)]+\)/g, type: 'mention' }, // @mentions - visible: @label
{ pattern: /<https?:\/\/[^>]+>/g, type: 'autolink' }, // <url> autolinks - visible: the whole thing (for videos etc)
{ pattern: /\[([^\]]*)\]\([^)]+\)/g, type: 'link' }, // [text](url) - visible: text
{ pattern: /(?<!\(|<)https?:\/\/[^\s)<>]+/g, type: 'nakedurl' }, // naked URLs - visible: URL without protocol
{ pattern: /(?<!\[)#([a-zA-Z0-9À-ÖØ-öø-ʸ_-]+)/g, type: 'hashtag' }, // #tag - visible: #tag (not inside links)
]
const paragraphs = text.split('\n')
// Find all tokens with their positions
interface Token {
start: number
end: number
raw: string
visible: string
type: string
}
for (const paragraph of paragraphs) {
const plainParagraph = removeMarkdownSyntax(paragraph)
const tokens: Token[] = []
if (length + plainParagraph.length > limit) {
// Calculate how many chars we can take from this paragraph
const remaining = limit - length
if (remaining > 0) {
truncated += paragraph.slice(0, remaining) + '...'
for (const { pattern, type } of tokenPatterns) {
pattern.lastIndex = 0
let match: RegExpExecArray | null
while ((match = pattern.exec(text)) !== null) {
const matchIndex = match.index
const matchFull = match[0]
const matchGroup = match[1] || ''
let visible: string
if (type === 'mention') {
visible = '@' + matchGroup
} else if (type === 'link') {
visible = matchGroup
} else if (type === 'autolink') {
// Autolinks like <https://youtube.com/...> - for truncation, count as short placeholder
// since they'll be rendered as embeds or converted
visible = '[video]'
} else if (type === 'nakedurl') {
// Naked URLs will be converted to links by preprocessMarkdown
// The visible text will be the URL without https://www.
visible = matchFull.replace(/^https?:\/\/(www\.)?/, '')
} else {
visible = matchFull // hashtag includes the #
}
// Check if this position overlaps with existing tokens (avoid duplicates)
const overlaps = tokens.some(
(t) =>
(matchIndex >= t.start && matchIndex < t.end) ||
(matchIndex + matchFull.length > t.start && matchIndex + matchFull.length <= t.end),
)
if (!overlaps) {
tokens.push({
start: matchIndex,
end: matchIndex + matchFull.length,
raw: matchFull,
visible,
type,
})
}
break
} else {
truncated += paragraph + '\n'
length += plainParagraph.length
}
}
return truncated.trim()
// Sort tokens by position
tokens.sort((a, b) => a.start - b.start)
// Build truncated output by walking through text
let result = ''
let visibleLength = 0
let pos = 0
while (pos < text.length && visibleLength < limit) {
// Check if we're at a token
const token = tokens.find((t) => t.start === pos)
if (token) {
// Would this token exceed the limit?
if (visibleLength + token.visible.length > limit) {
// Don't include partial token - stop here
break
}
result += token.raw
visibleLength += token.visible.length
pos = token.end
} else {
// Check if next position is inside a token (shouldn't happen, but safety check)
const insideToken = tokens.find((t) => pos > t.start && pos < t.end)
if (insideToken) {
pos = insideToken.end
continue
}
// Regular character - check for newline
// eslint-disable-next-line security/detect-object-injection
const char = text[pos]
if (char === '\n') {
result += char
pos++
// Don't count newlines toward visible limit
} else {
// Would this char exceed limit?
if (visibleLength + 1 > limit) {
break
}
result += char
visibleLength++
pos++
}
}
}
// Add ellipsis if we truncated
if (pos < text.length) {
result = result.trimEnd() + '...'
}
return result.trim()
}