mirror of
https://github.com/utopia-os/utopia-ui.git
synced 2026-02-06 09:55:47 +00:00
fix(lib): improve markdown processing and truncation
- Add convertNakedUrls() to skip URLs already inside markdown links
- Rewrite truncateMarkdown() with token-aware truncation
- Add @tiptap/markdown support to VideoEmbed, ItemMention, Hashtag
- Fix double-conversion of URLs in existing links
- Fix truncation cutting tokens in the middle
- Fix eslint warnings with proper types

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
3a8477f863
commit
3d7307b759
@ -9,11 +9,7 @@ import { useGetItemColor } from '#components/Map/hooks/useItemColor'
|
||||
import { useItems } from '#components/Map/hooks/useItems'
|
||||
import { useTags } from '#components/Map/hooks/useTags'
|
||||
import { Hashtag, ItemMention, VideoEmbed } from '#components/TipTap/extensions'
|
||||
import {
|
||||
preprocessMarkdown,
|
||||
removeMarkdownSyntax,
|
||||
truncateMarkdown,
|
||||
} from '#components/TipTap/utils/preprocessMarkdown'
|
||||
import { removeMarkdownSyntax, truncateMarkdown } from '#components/TipTap/utils/preprocessMarkdown'
|
||||
|
||||
import type { Item } from '#types/Item'
|
||||
|
||||
@ -80,8 +76,9 @@ export const TextView = ({
|
||||
}),
|
||||
VideoEmbed,
|
||||
],
|
||||
// Preprocess markdown to convert hashtags and item mentions to HTML
|
||||
content: preprocessMarkdown(innerText),
|
||||
// Load content as markdown - the extensions' markdownTokenizer handles parsing
|
||||
content: innerText,
|
||||
contentType: 'markdown',
|
||||
editable: false,
|
||||
editorProps: {
|
||||
attributes: {
|
||||
@ -94,7 +91,7 @@ export const TextView = ({
|
||||
|
||||
// Update content when text changes
|
||||
useEffect(() => {
|
||||
editor.commands.setContent(preprocessMarkdown(innerText))
|
||||
editor.commands.setContent(innerText, { contentType: 'markdown' })
|
||||
}, [editor, innerText])
|
||||
|
||||
// Handle link clicks for internal navigation
|
||||
|
||||
@ -5,11 +5,7 @@ import { useAddFilterTag } from '#components/Map/hooks/useFilter'
|
||||
import { useGetItemColor } from '#components/Map/hooks/useItemColor'
|
||||
import { useItems } from '#components/Map/hooks/useItems'
|
||||
import { useTags } from '#components/Map/hooks/useTags'
|
||||
import {
|
||||
preprocessMarkdown,
|
||||
removeMarkdownSyntax,
|
||||
truncateMarkdown,
|
||||
} from '#components/TipTap/utils/preprocessMarkdown'
|
||||
import { preprocessMarkdown, truncateMarkdown } from '#components/TipTap/utils/preprocessMarkdown'
|
||||
import { simpleMarkdownToHtml } from '#components/TipTap/utils/simpleMarkdownToHtml'
|
||||
|
||||
import type { Item } from '#types/Item'
|
||||
@ -60,17 +56,22 @@ export const TextViewStatic = ({
|
||||
innerText = text
|
||||
}
|
||||
|
||||
// Apply truncation if needed
|
||||
if (innerText && truncate) {
|
||||
innerText = truncateMarkdown(removeMarkdownSyntax(innerText), 100)
|
||||
}
|
||||
|
||||
// Pre-process and convert to HTML
|
||||
// Pre-process markdown first (converts naked URLs to links, etc.)
|
||||
// Then truncate the processed markdown
|
||||
// Finally convert to HTML
|
||||
const html = useMemo(() => {
|
||||
if (!innerText) return ''
|
||||
const processed = preprocessMarkdown(innerText)
|
||||
|
||||
// First preprocess to normalize all URLs/mentions/hashtags
|
||||
let processed = preprocessMarkdown(innerText)
|
||||
|
||||
// Then truncate if needed (works on normalized markdown)
|
||||
if (truncate) {
|
||||
processed = truncateMarkdown(processed, 100)
|
||||
}
|
||||
|
||||
return simpleMarkdownToHtml(processed, tags, { items, getItemColor })
|
||||
}, [innerText, tags, items, getItemColor])
|
||||
}, [innerText, truncate, tags, items, getItemColor])
|
||||
|
||||
// Handle clicks for internal navigation and hashtags
|
||||
useEffect(() => {
|
||||
|
||||
@ -55,28 +55,81 @@ export const VideoEmbed = Node.create<VideoEmbedOptions>({
|
||||
}
|
||||
},
|
||||
|
||||
addStorage() {
|
||||
// Markdown tokenizer for @tiptap/markdown - recognizes <https://youtube.com/...> and <https://rumble.com/...> syntax
|
||||
markdownTokenizer: {
|
||||
name: 'videoEmbed',
|
||||
level: 'inline',
|
||||
// Fast hint for the lexer - where might a video embed start?
|
||||
start: (src: string) => {
|
||||
// Look for autolinks with video URLs
|
||||
const youtubeIndex = src.indexOf('<https://www.youtube.com/watch')
|
||||
const youtubeShortIndex = src.indexOf('<https://youtu.be/')
|
||||
const rumbleIndex = src.indexOf('<https://rumble.com/embed/')
|
||||
|
||||
const indices = [youtubeIndex, youtubeShortIndex, rumbleIndex].filter((i) => i >= 0)
|
||||
return indices.length > 0 ? Math.min(...indices) : -1
|
||||
},
|
||||
tokenize: (src: string) => {
|
||||
// Match YouTube autolinks: <https://www.youtube.com/watch?v=VIDEO_ID>
|
||||
let match = /^<https?:\/\/(?:www\.)?youtube\.com\/watch\?v=([a-zA-Z0-9_-]{11})[^>]*>/.exec(
|
||||
src,
|
||||
)
|
||||
if (match) {
|
||||
return {
|
||||
type: 'videoEmbed',
|
||||
raw: match[0],
|
||||
provider: 'youtube',
|
||||
videoId: match[1],
|
||||
}
|
||||
}
|
||||
|
||||
// Match YouTube short autolinks: <https://youtu.be/VIDEO_ID>
|
||||
match = /^<https?:\/\/youtu\.be\/([a-zA-Z0-9_-]{11})[^>]*>/.exec(src)
|
||||
if (match) {
|
||||
return {
|
||||
type: 'videoEmbed',
|
||||
raw: match[0],
|
||||
provider: 'youtube',
|
||||
videoId: match[1],
|
||||
}
|
||||
}
|
||||
|
||||
// Match Rumble autolinks: <https://rumble.com/embed/VIDEO_ID>
|
||||
match = /^<https?:\/\/rumble\.com\/embed\/([a-zA-Z0-9]+)[^>]*>/.exec(src)
|
||||
if (match) {
|
||||
return {
|
||||
type: 'videoEmbed',
|
||||
raw: match[0],
|
||||
provider: 'rumble',
|
||||
videoId: match[1],
|
||||
}
|
||||
}
|
||||
|
||||
return undefined
|
||||
},
|
||||
},
|
||||
|
||||
// Parse Markdown token to Tiptap JSON
|
||||
parseMarkdown(token: { provider: string; videoId: string }) {
|
||||
return {
|
||||
markdown: {
|
||||
serialize(
|
||||
state: { write: (text: string) => void },
|
||||
node: { attrs: { provider: string; videoId: string } },
|
||||
) {
|
||||
const { provider, videoId } = node.attrs
|
||||
const url =
|
||||
provider === 'youtube'
|
||||
? `https://www.youtube.com/watch?v=${videoId}`
|
||||
: `https://rumble.com/embed/${videoId}`
|
||||
// Write as markdown autolink
|
||||
state.write(`<${url}>`)
|
||||
},
|
||||
parse: {
|
||||
// Parsing is handled by preprocessVideoLinks
|
||||
},
|
||||
type: 'videoEmbed',
|
||||
attrs: {
|
||||
provider: token.provider,
|
||||
videoId: token.videoId,
|
||||
},
|
||||
}
|
||||
},
|
||||
|
||||
// Serialize Tiptap node to Markdown
|
||||
renderMarkdown(node: { attrs: { provider: string; videoId: string } }) {
|
||||
const { provider, videoId } = node.attrs
|
||||
const url =
|
||||
provider === 'youtube'
|
||||
? `https://www.youtube.com/watch?v=${videoId}`
|
||||
: `https://rumble.com/embed/${videoId}`
|
||||
return `<${url}>`
|
||||
},
|
||||
|
||||
addAttributes() {
|
||||
return {
|
||||
provider: {
|
||||
|
||||
@ -4,6 +4,68 @@ import { fixUrls, mailRegex } from '#utils/ReplaceURLs'
|
||||
|
||||
import type { JSONContent, Extensions } from '@tiptap/core'
|
||||
|
||||
/**
 * Converts naked URLs to markdown links, but skips URLs that are already
 * inside markdown link syntax [text](url) or autolinks <url>.
 *
 * The link label is the URL without its scheme and without a leading `www.`.
 * Trailing sentence punctuation and emphasis markers (e.g. the final `.` in
 * "see https://example.com." or the `**` closing a bold span) are not treated
 * as part of the URL; they are emitted unchanged after the generated link.
 *
 * @param text - Markdown source that may contain naked http(s) URLs.
 * @returns The text with every naked URL rewritten as `[label](url)`.
 */
function convertNakedUrls(text: string): string {
  // Character ranges occupied by existing links; URLs inside them are left alone
  const skipRanges: { start: number; end: number }[] = []

  const collectRanges = (regex: RegExp): void => {
    let match: RegExpExecArray | null
    while ((match = regex.exec(text)) !== null) {
      skipRanges.push({ start: match.index, end: match.index + match[0].length })
    }
  }

  // Markdown links: [text](url)
  collectRanges(/\[[^\]]*\]\([^)]+\)/g)
  // Autolinks: <url>
  collectRanges(/<https?:\/\/[^>]+>/g)

  const urlRegex = /https?:\/\/[^\s)<>\]]+/g
  // Characters that end a sentence or close emphasis rather than belong to the URL
  const trailingPunctuation = /[?!.,:;*_~'"]+$/

  return text.replace(urlRegex, (match: string, offset: number) => {
    const matchEnd = offset + match.length

    // Keep the URL as-is when it is already part of a link
    const isInsideSkipRange = skipRanges.some(
      (range) => offset >= range.start && matchEnd <= range.end,
    )
    if (isInsideSkipRange) {
      return match
    }

    const url = match.replace(trailingPunctuation, '')
    // Nothing but punctuation followed the scheme - not a usable URL, leave untouched
    if (!/^https?:\/\/./.test(url)) {
      return match
    }

    const trailing = match.slice(url.length)
    const displayText = url.replace(/^https?:\/\/(www\.)?/, '')
    return `[${displayText}](${url})${trailing}`
  })
}
|
||||
|
||||
/**
|
||||
* Converts pre-processed markdown/HTML to TipTap JSON format.
|
||||
* Creates a temporary editor instance to parse the content.
|
||||
@ -38,11 +100,9 @@ export function preprocessMarkdown(text: string): string {
|
||||
result = fixUrls(result)
|
||||
|
||||
// 2. Convert naked URLs to markdown links
|
||||
// Match URLs that are NOT already inside markdown link syntax
|
||||
result = result.replace(
|
||||
/(?<!\]?\()(?<!<)https?:\/\/[^\s)]+(?!\))(?!>)/g,
|
||||
(url) => `[${url.replace(/https?:\/\/w{3}\./gi, '')}](${url})`,
|
||||
)
|
||||
// Skip URLs that are already inside markdown link syntax [text](url) or autolinks <url>
|
||||
// Process the text in segments to avoid matching URLs inside existing links
|
||||
result = convertNakedUrls(result)
|
||||
|
||||
// 3. Convert email addresses to mailto links
|
||||
result = result.replace(mailRegex, (email) => `[${email}](mailto:${email})`)
|
||||
@ -148,20 +208,30 @@ export function preprocessItemMentions(text: string): string {
|
||||
|
||||
/**
 * Removes markdown syntax for plain text display (used for truncation calculation).
 * Preserves @mentions ([@Label](/item/id)) and #hashtags for rendering.
 *
 * Header (`#`) and blockquote (`>`) markers are only stripped at the start of a
 * line, so inline text such as "C# is great" or "a > b" is left intact.
 *
 * @param text - Markdown source.
 * @returns The text with images, inline-code fences, emphasis markers, header and
 *   blockquote markers, regular links (label kept) and HTML tags removed.
 */
export function removeMarkdownSyntax(text: string): string {
  return (
    text
      // Remove images entirely (alt text included)
      .replace(/!\[.*?\]\(.*?\)/g, '')
      // Remove inline code fences, keep the code
      .replace(/(`{1,3})(.*?)\1/g, '$2')
      // Remove bold and italic markers
      .replace(/(\*{1,2}|_{1,2})(.*?)\1/g, '$2')
      // Remove header markers (line start only, leading indent kept)
      .replace(/^([ \t]{0,3})#{1,6}[ \t]+/gm, '$1')
      // Remove blockquote markers (line start only, leading indent kept)
      .replace(/^([ \t]{0,3})>[ \t]+/gm, '$1')
      // Remove regular links but preserve @mentions ([@Label](/item/...))
      .replace(/\[([^\]]+)\]\((?!\/item\/)[^)]+\)/g, '$1')
      // Remove HTML tags
      .replace(/<[^>]+>/g, '')
  )
}
|
||||
|
||||
/**
|
||||
* Truncates text to a character limit, respecting paragraph boundaries.
|
||||
* Truncates text to a character limit based on visible/plain text length.
|
||||
* Preserves complete tokens - won't cut in the middle of:
|
||||
* - @mentions: [@Label](/item/id)
|
||||
* - #hashtags: #tagname
|
||||
* - Links: [text](url)
|
||||
*
|
||||
* The limit applies to the rendered/visible text, not the raw markdown.
|
||||
*/
|
||||
export function truncateMarkdown(text: string, limit: number): string {
|
||||
const plainText = removeMarkdownSyntax(text)
|
||||
@ -170,26 +240,124 @@ export function truncateMarkdown(text: string, limit: number): string {
|
||||
return text
|
||||
}
|
||||
|
||||
let truncated = ''
|
||||
let length = 0
|
||||
// Tokenize the text into segments: either special tokens or plain text
|
||||
// This allows us to count visible characters correctly
|
||||
// Order matters: more specific patterns first
|
||||
const tokenPatterns = [
|
||||
{ pattern: /\[@([^\]]+?)\]\(\/?item\/[^)]+\)/g, type: 'mention' }, // @mentions - visible: @label
|
||||
{ pattern: /<https?:\/\/[^>]+>/g, type: 'autolink' }, // <url> autolinks - visible: the whole thing (for videos etc)
|
||||
{ pattern: /\[([^\]]*)\]\([^)]+\)/g, type: 'link' }, // [text](url) - visible: text
|
||||
{ pattern: /(?<!\(|<)https?:\/\/[^\s)<>]+/g, type: 'nakedurl' }, // naked URLs - visible: URL without protocol
|
||||
{ pattern: /(?<!\[)#([a-zA-Z0-9À-ÖØ-öø-ʸ_-]+)/g, type: 'hashtag' }, // #tag - visible: #tag (not inside links)
|
||||
]
|
||||
|
||||
const paragraphs = text.split('\n')
|
||||
// Find all tokens with their positions
|
||||
interface Token {
|
||||
start: number
|
||||
end: number
|
||||
raw: string
|
||||
visible: string
|
||||
type: string
|
||||
}
|
||||
|
||||
for (const paragraph of paragraphs) {
|
||||
const plainParagraph = removeMarkdownSyntax(paragraph)
|
||||
const tokens: Token[] = []
|
||||
|
||||
if (length + plainParagraph.length > limit) {
|
||||
// Calculate how many chars we can take from this paragraph
|
||||
const remaining = limit - length
|
||||
if (remaining > 0) {
|
||||
truncated += paragraph.slice(0, remaining) + '...'
|
||||
for (const { pattern, type } of tokenPatterns) {
|
||||
pattern.lastIndex = 0
|
||||
let match: RegExpExecArray | null
|
||||
while ((match = pattern.exec(text)) !== null) {
|
||||
const matchIndex = match.index
|
||||
const matchFull = match[0]
|
||||
const matchGroup = match[1] || ''
|
||||
|
||||
let visible: string
|
||||
if (type === 'mention') {
|
||||
visible = '@' + matchGroup
|
||||
} else if (type === 'link') {
|
||||
visible = matchGroup
|
||||
} else if (type === 'autolink') {
|
||||
// Autolinks like <https://youtube.com/...> - for truncation, count as short placeholder
|
||||
// since they'll be rendered as embeds or converted
|
||||
visible = '[video]'
|
||||
} else if (type === 'nakedurl') {
|
||||
// Naked URLs will be converted to links by preprocessMarkdown
|
||||
// The visible text will be the URL without https://www.
|
||||
visible = matchFull.replace(/^https?:\/\/(www\.)?/, '')
|
||||
} else {
|
||||
visible = matchFull // hashtag includes the #
|
||||
}
|
||||
|
||||
// Check if this position overlaps with existing tokens (avoid duplicates)
|
||||
const overlaps = tokens.some(
|
||||
(t) =>
|
||||
(matchIndex >= t.start && matchIndex < t.end) ||
|
||||
(matchIndex + matchFull.length > t.start && matchIndex + matchFull.length <= t.end),
|
||||
)
|
||||
|
||||
if (!overlaps) {
|
||||
tokens.push({
|
||||
start: matchIndex,
|
||||
end: matchIndex + matchFull.length,
|
||||
raw: matchFull,
|
||||
visible,
|
||||
type,
|
||||
})
|
||||
}
|
||||
break
|
||||
} else {
|
||||
truncated += paragraph + '\n'
|
||||
length += plainParagraph.length
|
||||
}
|
||||
}
|
||||
|
||||
return truncated.trim()
|
||||
// Sort tokens by position
|
||||
tokens.sort((a, b) => a.start - b.start)
|
||||
|
||||
// Build truncated output by walking through text
|
||||
let result = ''
|
||||
let visibleLength = 0
|
||||
let pos = 0
|
||||
|
||||
while (pos < text.length && visibleLength < limit) {
|
||||
// Check if we're at a token
|
||||
const token = tokens.find((t) => t.start === pos)
|
||||
|
||||
if (token) {
|
||||
// Would this token exceed the limit?
|
||||
if (visibleLength + token.visible.length > limit) {
|
||||
// Don't include partial token - stop here
|
||||
break
|
||||
}
|
||||
result += token.raw
|
||||
visibleLength += token.visible.length
|
||||
pos = token.end
|
||||
} else {
|
||||
// Check if next position is inside a token (shouldn't happen, but safety check)
|
||||
const insideToken = tokens.find((t) => pos > t.start && pos < t.end)
|
||||
if (insideToken) {
|
||||
pos = insideToken.end
|
||||
continue
|
||||
}
|
||||
|
||||
// Regular character - check for newline
|
||||
// eslint-disable-next-line security/detect-object-injection
|
||||
const char = text[pos]
|
||||
if (char === '\n') {
|
||||
result += char
|
||||
pos++
|
||||
// Don't count newlines toward visible limit
|
||||
} else {
|
||||
// Would this char exceed limit?
|
||||
if (visibleLength + 1 > limit) {
|
||||
break
|
||||
}
|
||||
result += char
|
||||
visibleLength++
|
||||
pos++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add ellipsis if we truncated
|
||||
if (pos < text.length) {
|
||||
result = result.trimEnd() + '...'
|
||||
}
|
||||
|
||||
return result.trim()
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user