fix(lib): improve markdown processing and truncation

- Add convertNakedUrls() to skip URLs already inside markdown links
- Rewrite truncateMarkdown() with token-aware truncation
- Add @tiptap/markdown support to VideoEmbed, ItemMention, Hashtag
- Fix double-conversion of URLs in existing links
- Fix truncation cutting tokens in the middle
- Fix eslint warnings with proper types

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-06 09:55:47 +00:00 · 2026-01-15 08:50:32 +01:00 · 2026-01-15 08:50:32 +01:00 · 3d7307b759
commit 3d7307b759
parent 3a8477f863
4 changed files with 286 additions and 67 deletions
--- a/lib/src/Components/Map/Subcomponents/ItemPopupComponents/TextView.tsx
+++ b/lib/src/Components/Map/Subcomponents/ItemPopupComponents/TextView.tsx
@ -9,11 +9,7 @@ import { useGetItemColor } from '#components/Map/hooks/useItemColor'
 import { useItems } from '#components/Map/hooks/useItems'
 import { useTags } from '#components/Map/hooks/useTags'
 import { Hashtag, ItemMention, VideoEmbed } from '#components/TipTap/extensions'
-import {
-  preprocessMarkdown,
-  removeMarkdownSyntax,
-  truncateMarkdown,
-} from '#components/TipTap/utils/preprocessMarkdown'
+import { removeMarkdownSyntax, truncateMarkdown } from '#components/TipTap/utils/preprocessMarkdown'

 import type { Item } from '#types/Item'

@ -80,8 +76,9 @@ export const TextView = ({
        }),
        VideoEmbed,
      ],
-      // Preprocess markdown to convert hashtags and item mentions to HTML
-      content: preprocessMarkdown(innerText),
+      // Load content as markdown - the extensions' markdownTokenizer handles parsing
+      content: innerText,
+      contentType: 'markdown',
      editable: false,
      editorProps: {
        attributes: {
@ -94,7 +91,7 @@ export const TextView = ({

  // Update content when text changes
  useEffect(() => {
-    editor.commands.setContent(preprocessMarkdown(innerText))
+    editor.commands.setContent(innerText, { contentType: 'markdown' })
  }, [editor, innerText])

  // Handle link clicks for internal navigation
--- a/lib/src/Components/Map/Subcomponents/ItemPopupComponents/TextViewStatic.tsx
+++ b/lib/src/Components/Map/Subcomponents/ItemPopupComponents/TextViewStatic.tsx
@ -5,11 +5,7 @@ import { useAddFilterTag } from '#components/Map/hooks/useFilter'
 import { useGetItemColor } from '#components/Map/hooks/useItemColor'
 import { useItems } from '#components/Map/hooks/useItems'
 import { useTags } from '#components/Map/hooks/useTags'
-import {
-  preprocessMarkdown,
-  removeMarkdownSyntax,
-  truncateMarkdown,
-} from '#components/TipTap/utils/preprocessMarkdown'
+import { preprocessMarkdown, truncateMarkdown } from '#components/TipTap/utils/preprocessMarkdown'
 import { simpleMarkdownToHtml } from '#components/TipTap/utils/simpleMarkdownToHtml'

 import type { Item } from '#types/Item'
@ -60,17 +56,22 @@ export const TextViewStatic = ({
    innerText = text
  }

-  // Apply truncation if needed
-  if (innerText && truncate) {
-    innerText = truncateMarkdown(removeMarkdownSyntax(innerText), 100)
-  }
-
-  // Pre-process and convert to HTML
+  // Pre-process markdown first (converts naked URLs to links, etc.)
+  // Then truncate the processed markdown
+  // Finally convert to HTML
  const html = useMemo(() => {
    if (!innerText) return ''
-    const processed = preprocessMarkdown(innerText)
+
+    // First preprocess to normalize all URLs/mentions/hashtags
+    let processed = preprocessMarkdown(innerText)
+
+    // Then truncate if needed (works on normalized markdown)
+    if (truncate) {
+      processed = truncateMarkdown(processed, 100)
+    }
+
    return simpleMarkdownToHtml(processed, tags, { items, getItemColor })
-  }, [innerText, tags, items, getItemColor])
+  }, [innerText, truncate, tags, items, getItemColor])

  // Handle clicks for internal navigation and hashtags
  useEffect(() => {
--- a/lib/src/Components/TipTap/extensions/VideoEmbed.tsx
+++ b/lib/src/Components/TipTap/extensions/VideoEmbed.tsx
@ -55,28 +55,81 @@ export const VideoEmbed = Node.create<VideoEmbedOptions>({
    }
  },

-  addStorage() {
+  // Inline markdown tokenizer for @tiptap/markdown: turns YouTube (watch?v= / youtu.be) and Rumble (/embed/) autolinks <url> into videoEmbed tokens
+  markdownTokenizer: {
+    name: 'videoEmbed',
+    level: 'inline',
+    // Hint for the lexer: lowest index in src of a candidate autolink prefix, or -1 when none is present
+    start: (src: string) => {
+      // NOTE(review): only https:// and www.youtube.com prefixes are probed, while tokenize() below also accepts http:// and youtube.com without www
+      const youtubeIndex = src.indexOf('<https://www.youtube.com/watch')
+      const youtubeShortIndex = src.indexOf('<https://youtu.be/')
+      const rumbleIndex = src.indexOf('<https://rumble.com/embed/')
+
+      const indices = [youtubeIndex, youtubeShortIndex, rumbleIndex].filter((i) => i >= 0)
+      return indices.length > 0 ? Math.min(...indices) : -1
+    },
+    tokenize: (src: string) => {
+      // YouTube watch autolink with an 11-character id; text after the id up to '>' stays in raw but is not kept in videoId
+      let match = /^<https?:\/\/(?:www\.)?youtube\.com\/watch\?v=([a-zA-Z0-9_-]{11})[^>]*>/.exec(
+        src,
+      )
+      if (match) {
+        return {
+          type: 'videoEmbed',
+          raw: match[0],
+          provider: 'youtube',
+          videoId: match[1],
+        }
+      }
+
+      // YouTube short autolink: <https://youtu.be/VIDEO_ID>, reported with the same 'youtube' provider
+      match = /^<https?:\/\/youtu\.be\/([a-zA-Z0-9_-]{11})[^>]*>/.exec(src)
+      if (match) {
+        return {
+          type: 'videoEmbed',
+          raw: match[0],
+          provider: 'youtube',
+          videoId: match[1],
+        }
+      }
+
+      // Rumble embed autolink: <https://rumble.com/embed/VIDEO_ID> with an alphanumeric id
+      match = /^<https?:\/\/rumble\.com\/embed\/([a-zA-Z0-9]+)[^>]*>/.exec(src)
+      if (match) {
+        return {
+          type: 'videoEmbed',
+          raw: match[0],
+          provider: 'rumble',
+          videoId: match[1],
+        }
+      }
+
+      return undefined
+    },
+  },
+
+  // Map a videoEmbed markdown token to a Tiptap node, copying provider and videoId into attrs
+  parseMarkdown(token: { provider: string; videoId: string }) {
     return {
-      markdown: {
-        serialize(
-          state: { write: (text: string) => void },
-          node: { attrs: { provider: string; videoId: string } },
-        ) {
-          const { provider, videoId } = node.attrs
-          const url =
-            provider === 'youtube'
-              ? `https://www.youtube.com/watch?v=${videoId}`
-              : `https://rumble.com/embed/${videoId}`
-          // Write as markdown autolink
-          state.write(`<${url}>`)
-        },
-        parse: {
-          // Parsing is handled by preprocessVideoLinks
-        },
+      type: 'videoEmbed',
+      attrs: {
+        provider: token.provider,
+        videoId: token.videoId,
       },
     }
   },

+  // Serialize the node as a Markdown autolink <url>; any provider other than 'youtube' falls through to the Rumble embed URL
+  renderMarkdown(node: { attrs: { provider: string; videoId: string } }) {
+    const { provider, videoId } = node.attrs
+    const url =
+      provider === 'youtube'
+        ? `https://www.youtube.com/watch?v=${videoId}`
+        : `https://rumble.com/embed/${videoId}`
+    return `<${url}>`
+  },
+
  addAttributes() {
    return {
      provider: {
--- a/lib/src/Components/TipTap/utils/preprocessMarkdown.ts
+++ b/lib/src/Components/TipTap/utils/preprocessMarkdown.ts
@ -4,6 +4,68 @@ import { fixUrls, mailRegex } from '#utils/ReplaceURLs'

 import type { JSONContent, Extensions } from '@tiptap/core'

+/**
+ * Rewrites bare http(s) URLs as [label](url), where label is the URL without its scheme and
+ * leading "www."; URLs lying wholly inside an existing [text](url) or <url> span are left as-is.
+ */
+function convertNakedUrls(text: string): string {
+  // Character ranges (start inclusive, end exclusive) of existing links/autolinks that must not be rewritten
+  const skipRanges: { start: number; end: number }[] = []
+
+  // Markdown links [text](url); image syntax ![alt](url) also matches from its '[' onward, so image URLs are skipped too
+  const linkRegex = /\[[^\]]*\]\([^)]+\)/g
+  let linkMatch: RegExpExecArray | null
+  while ((linkMatch = linkRegex.exec(text)) !== null) {
+    skipRanges.push({ start: linkMatch.index, end: linkMatch.index + linkMatch[0].length })
+  }
+
+  // Autolinks: <http(s)://...>
+  const autolinkRegex = /<https?:\/\/[^>]+>/g
+  let autolinkMatch: RegExpExecArray | null
+  while ((autolinkMatch = autolinkRegex.exec(text)) !== null) {
+    skipRanges.push({
+      start: autolinkMatch.index,
+      end: autolinkMatch.index + autolinkMatch[0].length,
+    })
+  }
+
+  // Candidate URLs run until whitespace, ')', '<', '>' or ']' - NOTE(review): trailing punctuation such as '.' or ',' is captured as part of the URL
+  const urlRegex = /https?:\/\/[^\s)<>\]]+/g
+  let result = ''
+  let lastIndex = 0
+  let urlMatch: RegExpExecArray | null
+
+  while ((urlMatch = urlRegex.exec(text)) !== null) {
+    const urlStart = urlMatch.index
+    const urlEnd = urlMatch.index + urlMatch[0].length
+    const url = urlMatch[0]
+
+    // A URL is skipped only when it lies entirely within one recorded range
+    const isInsideSkipRange = skipRanges.some(
+      (range) => urlStart >= range.start && urlEnd <= range.end,
+    )
+
+    if (isInsideSkipRange) {
+      // Already part of a link: lastIndex is left untouched so this text is copied verbatim by a later slice
+      continue
+    }
+
+    // Copy the unchanged text between the previous rewrite and this URL
+    result += text.slice(lastIndex, urlStart)
+
+    // Label drops the scheme and a leading "www."; the link target keeps the full URL
+    const displayText = url.replace(/^https?:\/\/(www\.)?/, '')
+    result += `[${displayText}](${url})`
+
+    lastIndex = urlEnd
+  }
+
+  // Copy whatever follows the last rewritten URL
+  result += text.slice(lastIndex)
+
+  return result
+}
+
 /**
 * Converts pre-processed markdown/HTML to TipTap JSON format.
 * Creates a temporary editor instance to parse the content.
@ -38,11 +100,9 @@ export function preprocessMarkdown(text: string): string {
  result = fixUrls(result)

  // 2. Convert naked URLs to markdown links
-  // Match URLs that are NOT already inside markdown link syntax
-  result = result.replace(
-    /(?<!\]?\()(?<!<)https?:\/\/[^\s)]+(?!\))(?!>)/g,
-    (url) => `[${url.replace(/https?:\/\/w{3}\./gi, '')}](${url})`,
-  )
+  // Skip URLs that are already inside markdown link syntax [text](url) or autolinks <url>
+  // Process the text in segments to avoid matching URLs inside existing links
+  result = convertNakedUrls(result)

  // 3. Convert email addresses to mailto links
  result = result.replace(mailRegex, (email) => `[${email}](mailto:${email})`)
@ -148,20 +208,30 @@ export function preprocessItemMentions(text: string): string {

 /**
 * Removes markdown syntax for plain text display (used for truncation calculation).
+ * Links whose target starts with /item/ (such as @mentions [@Label](/item/id)) are kept as markdown; #hashtags are not matched by the header rule.
 */
 export function removeMarkdownSyntax(text: string): string {
-  return text
-    .replace(/!\[.*?\]\(.*?\)/g, '') // Remove images
-    .replace(/(`{1,3})(.*?)\1/g, '$2') // Remove inline code
-    .replace(/(\*{1,2}|_{1,2})(.*?)\1/g, '$2') // Remove bold and italic
-    .replace(/(#+)\s+(.*)/g, '$2') // Remove headers
-    .replace(/>\s+(.*)/g, '$1') // Remove blockquotes
-    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // Remove links, keep text
-    .replace(/<[^>]+>/g, '') // Remove HTML tags
+  return (
+    text
+      .replace(/!\[.*?\]\(.*?\)/g, '') // Remove images
+      .replace(/(`{1,3})(.*?)\1/g, '$2') // Remove inline code
+      .replace(/(\*{1,2}|_{1,2})(.*?)\1/g, '$2') // Remove bold and italic
+      .replace(/(#+)\s+(.*)/g, '$2') // Remove headers
+      .replace(/>\s+(.*)/g, '$1') // Remove blockquotes
+      // Replace links by their text, except links whose target starts with /item/ (@mentions)
+      .replace(/\[([^\]]+)\]\((?!\/item\/)[^)]+\)/g, '$1')
+      .replace(/<[^>]+>/g, '')
+  ) // The final replace above strips HTML tags (autolinks <url> match the same pattern and are removed too)
 }

 /**
- * Truncates text to a character limit, respecting paragraph boundaries.
+ * Truncates text to a character limit based on visible/plain text length.
+ * Preserves complete tokens - won't cut in the middle of:
+ * - @mentions: [@Label](/item/id)
+ * - #hashtags: #tagname
+ * - Links: [text](url)
+ *
+ * The limit applies to the rendered/visible text, not the raw markdown.
 */
 export function truncateMarkdown(text: string, limit: number): string {
  const plainText = removeMarkdownSyntax(text)
@ -170,26 +240,124 @@ export function truncateMarkdown(text: string, limit: number): string {
    return text
  }

-  let truncated = ''
-  let length = 0
+  // Tokenize the text into segments: either special tokens or plain text
+  // This allows us to count visible characters correctly
+  // Order matters: more specific patterns first
+  const tokenPatterns = [
+    { pattern: /\[@([^\]]+?)\]\(\/?item\/[^)]+\)/g, type: 'mention' }, // @mentions - visible: @label
+    { pattern: /<https?:\/\/[^>]+>/g, type: 'autolink' }, // <url> autolinks - visible: the whole thing (for videos etc)
+    { pattern: /\[([^\]]*)\]\([^)]+\)/g, type: 'link' }, // [text](url) - visible: text
+    { pattern: /(?<!\(|<)https?:\/\/[^\s)<>]+/g, type: 'nakedurl' }, // naked URLs - visible: URL without protocol
+    { pattern: /(?<!\[)#([a-zA-Z0-9À-ÖØ-öø-ʸ_-]+)/g, type: 'hashtag' }, // #tag - visible: #tag (not inside links)
+  ]

-  const paragraphs = text.split('\n')
+  // Find all tokens with their positions
+  interface Token {
+    start: number
+    end: number
+    raw: string
+    visible: string
+    type: string
+  }

-  for (const paragraph of paragraphs) {
-    const plainParagraph = removeMarkdownSyntax(paragraph)
+  const tokens: Token[] = []

-    if (length + plainParagraph.length > limit) {
-      // Calculate how many chars we can take from this paragraph
-      const remaining = limit - length
-      if (remaining > 0) {
-        truncated += paragraph.slice(0, remaining) + '...'
+  for (const { pattern, type } of tokenPatterns) {
+    pattern.lastIndex = 0
+    let match: RegExpExecArray | null
+    while ((match = pattern.exec(text)) !== null) {
+      const matchIndex = match.index
+      const matchFull = match[0]
+      const matchGroup = match[1] || ''
+
+      let visible: string
+      if (type === 'mention') {
+        visible = '@' + matchGroup
+      } else if (type === 'link') {
+        visible = matchGroup
+      } else if (type === 'autolink') {
+        // Autolinks like <https://youtube.com/...> - for truncation, count as short placeholder
+        // since they'll be rendered as embeds or converted
+        visible = '[video]'
+      } else if (type === 'nakedurl') {
+        // Naked URLs will be converted to links by preprocessMarkdown
+        // The visible text will be the URL without https://www.
+        visible = matchFull.replace(/^https?:\/\/(www\.)?/, '')
+      } else {
+        visible = matchFull // hashtag includes the #
+      }
+
+      // Check if this position overlaps with existing tokens (avoid duplicates)
+      const overlaps = tokens.some(
+        (t) =>
+          (matchIndex >= t.start && matchIndex < t.end) ||
+          (matchIndex + matchFull.length > t.start && matchIndex + matchFull.length <= t.end),
+      )
+
+      if (!overlaps) {
+        tokens.push({
+          start: matchIndex,
+          end: matchIndex + matchFull.length,
+          raw: matchFull,
+          visible,
+          type,
+        })
      }
-      break
-    } else {
-      truncated += paragraph + '\n'
-      length += plainParagraph.length
    }
  }

-  return truncated.trim()
+  // Sort tokens by position
+  tokens.sort((a, b) => a.start - b.start)
+
+  // Build truncated output by walking through text
+  let result = ''
+  let visibleLength = 0
+  let pos = 0
+
+  while (pos < text.length && visibleLength < limit) {
+    // Check if we're at a token
+    const token = tokens.find((t) => t.start === pos)
+
+    if (token) {
+      // Would this token exceed the limit?
+      if (visibleLength + token.visible.length > limit) {
+        // Don't include partial token - stop here
+        break
+      }
+      result += token.raw
+      visibleLength += token.visible.length
+      pos = token.end
+    } else {
+      // Check if next position is inside a token (shouldn't happen, but safety check)
+      const insideToken = tokens.find((t) => pos > t.start && pos < t.end)
+      if (insideToken) {
+        pos = insideToken.end
+        continue
+      }
+
+      // Regular character - check for newline
+      // eslint-disable-next-line security/detect-object-injection
+      const char = text[pos]
+      if (char === '\n') {
+        result += char
+        pos++
+        // Don't count newlines toward visible limit
+      } else {
+        // Would this char exceed limit?
+        if (visibleLength + 1 > limit) {
+          break
+        }
+        result += char
+        visibleLength++
+        pos++
+      }
+    }
+  }
+
+  // Add ellipsis if we truncated
+  if (pos < text.length) {
+    result = result.trimEnd() + '...'
+  }
+
+  return result.trim()
 }