feat: read-aloud highlights and scrolls to current sentence/word

- Add ReadingMark TipTap extension (transient, never saved to DB) that renders the active TTS passage as <span class='reading-word'>
- Build a char→ProseMirror position map on read-start so boundary events can pinpoint exact document positions
- Use onstart (fires on every utterance/voice) for reliable sentence-level highlight; onboundary overrides with word-level when the voice supports it
- Auto-scroll the highlighted span into view (smooth, centred) on each update
- Strip readingWord marks from JSON alongside lintError before saving
- Guard all mark dispatches with applyingLints flag to suppress spurious saves and lint re-checks

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-24 22:48:27 -04:00 · 2026-05-24 22:48:27 -04:00 · 4db65151c8
commit 4db65151c8
parent 6682810c00
3 changed files with 196 additions and 19 deletions
--- a/frontend/src/components/Editor.jsx
+++ b/frontend/src/components/Editor.jsx
@ -27,6 +27,16 @@ const CustomImage = Image.extend({
  },
 }).configure({ allowBase64: false, inline: false })

+// ── ReadingMark — transient mark that follows the TTS playhead ────────────
+// Registered in the schema as `readingWord` and rendered as
+// <span class="reading-word">.  It exists only as live UI state: parseHTML
+// matches nothing, so the mark is never recreated from stored/pasted HTML.
+const ReadingMark = Mark.create({
+  name: 'readingWord',
+  // NOTE(review): per ProseMirror's MarkSpec, `inclusive: false` should keep
+  // text typed at the edge of the highlight from inheriting the mark — confirm.
+  inclusive: false,
+  parseHTML() { return [] },   // never restore from HTML — transient UI only
+  renderHTML({ HTMLAttributes }) {
+    // Trailing 0 is the content hole: the marked text is rendered inside the span.
+    return ['span', mergeAttributes({ class: 'reading-word' }, HTMLAttributes), 0]
+  },
+})
+
 // ── LintMark — transient mark for spelling / grammar underlines ────────────
 const LintMark = Mark.create({
  name: 'lintError',
@ -89,12 +99,12 @@ function clearLintMarks(editor) {
  editor.view.dispatch(tr)
 }

-// Strip lint marks from the JSON before saving (they're UI-only)
+// Strip transient UI marks (lint, reading playhead) from JSON before saving
 function stripLintMarks(node) {
  if (!node) return node
  const n = { ...node }
  if (n.marks) {
-    n.marks = n.marks.filter(m => m.type !== 'lintError')
+    n.marks = n.marks.filter(m => m.type !== 'lintError' && m.type !== 'readingWord')
    if (!n.marks.length) delete n.marks
  }
  if (n.content) n.content = n.content.map(stripLintMarks)
@ -114,7 +124,7 @@ const Editor = forwardRef(function Editor(
  const lintStatusRef  = useRef('idle')
  const lintDebounce   = useRef(null)
  const runLintRef     = useRef(null)   // always holds latest runLint, safe to call from onUpdate closure
-  const applyingLints  = useRef(false)  // true while we're dispatching mark transactions — skip onUpdate
+  const applyingLints  = useRef(false)  // true while dispatching mark transactions — suppresses onUpdate

  const editor = useEditor({
    extensions: [
@ -122,6 +132,7 @@ const Editor = forwardRef(function Editor(
      Underline,
      CustomImage,
      LintMark,
+      ReadingMark,
      Placeholder.configure({ placeholder: 'Begin your story here…' }),
      CharacterCount,
      TextAlign.configure({ types: ['heading', 'paragraph'] }),
@ -221,6 +232,42 @@ const Editor = forwardRef(function Editor(
  // Clean up the debounce timer on unmount
  useEffect(() => () => clearTimeout(lintDebounce.current), [])

+  // ── Reading-word mark (TTS playhead highlight) ────────────────────────────
+  // Moves the transient `readingWord` mark to the document range [from, to).
+  // Toolbar calls this when an utterance starts (sentence-level highlight) and
+  // again on each word-boundary event (word-level highlight).
+  //
+  // The dispatch runs with applyingLints set so the onUpdate it triggers is
+  // silently ignored — the same guard that protects lint mark dispatches.
+  const applyReadingMark = useCallback((from, to) => {
+    if (!editor) return
+    const mt = editor.state.schema.marks.readingWord
+    if (!mt) return
+    const { tr } = editor.state
+    // Clear the previous highlight and set the new one in a single transaction.
+    // Passing a MarkType to removeMark strips every mark of that type in range.
+    tr.removeMark(0, editor.state.doc.content.size, mt)
+    tr.addMark(from, to, mt.create())
+    // The playhead is not a user edit: keep it off the undo stack so Ctrl+Z
+    // after a read-aloud session undoes text, not highlight movements.
+    // (No effect if the history plugin is not installed.)
+    tr.setMeta('addToHistory', false)
+    applyingLints.current = true
+    try {
+      editor.view.dispatch(tr)
+    } finally {
+      // Always release the guard: if dispatch threw and the flag stayed set,
+      // every later onUpdate (saves, lint re-checks) would be suppressed.
+      applyingLints.current = false
+    }
+  }, [editor])
+
+  // Removes the `readingWord` highlight from the whole document.  Called when
+  // reading stops, finishes, or fails.  Guarded by applyingLints so the
+  // resulting onUpdate is ignored, like the other mark dispatches.
+  const clearReadingMark = useCallback(() => {
+    if (!editor) return
+    const mt = editor.state.schema.marks.readingWord
+    if (!mt) return
+    const { tr } = editor.state
+    // Passing a MarkType to removeMark strips every mark of that type in range.
+    tr.removeMark(0, editor.state.doc.content.size, mt)
+    // Nothing was highlighted — skip the no-op dispatch entirely.
+    if (!tr.docChanged) return
+    // Highlight removal is not a user edit: keep it off the undo stack.
+    // (No effect if the history plugin is not installed.)
+    tr.setMeta('addToHistory', false)
+    applyingLints.current = true
+    try {
+      editor.view.dispatch(tr)
+    } finally {
+      // Always release the guard: if dispatch threw and the flag stayed set,
+      // every later onUpdate (saves, lint re-checks) would be suppressed.
+      applyingLints.current = false
+    }
+  }, [editor])
+
  // ── Popover on clicking a lint mark ──────────────────────────────────────
  function handleEditorClick(e) {
    if (!editor) return
@ -281,6 +328,8 @@ const Editor = forwardRef(function Editor(
        onLint={runLint}
        lintStatus={lintStatus}
        lintCount={lintCount}
+        applyReadingMark={applyReadingMark}
+        clearReadingMark={clearReadingMark}
      />
      <div className="editor-wrap" onClick={handleEditorClick}>
        <EditorContent editor={editor} className="editor-body" spellCheck={false} />
--- a/frontend/src/components/Toolbar.jsx
+++ b/frontend/src/components/Toolbar.jsx
@ -1,6 +1,73 @@
 import { useRef, useState, useEffect, useCallback } from 'react'

-export default function Toolbar({ editor, onImageUpload, fontSize, onFontSizeChange, onLint, lintStatus, lintCount }) {
+// ── TTS position utilities ─────────────────────────────────────────────────
+
+// Build the text to be spoken plus a parallel ProseMirror-position map for the
+// document range [from, to).
+//
+// Returns { text, posMap } where posMap[i] is the absolute (doc-level)
+// position of text[i], or null for a synthetic separator:
+//   '\n'  between block nodes
+//   ' '   in place of an inline non-text leaf (e.g. a hard break), so the
+//         words on either side are not fused into one spoken/highlighted word
+function buildReadingMap(doc, from, to) {
+  const chars = []
+  const posMap = []
+  const pushSeparator = (ch) => {
+    chars.push(ch)
+    posMap.push(null)
+  }
+  doc.nodesBetween(from, to, (node, pos) => {
+    if (node.isText) {
+      // Only the slice of this text node that lies inside [from, to) is read.
+      const first = Math.max(0, from - pos)
+      const last = Math.min(node.text.length, to - pos)
+      for (let i = first; i < last; i++) {
+        chars.push(node.text[i])
+        posMap.push(pos + i)
+      }
+      return false  // text nodes have no children
+    }
+    if (node.isBlock) {
+      // Newline separator between block nodes — never leading, never doubled.
+      if (chars.length > 0 && chars[chars.length - 1] !== '\n') pushSeparator('\n')
+    } else if (node.isLeaf) {
+      // Inline leaf with no text of its own: stand in with a single space.
+      if (chars.length > 0 && !/\s/.test(chars[chars.length - 1])) pushSeparator(' ')
+    }
+  })
+  return { text: chars.join(''), posMap }
+}
+
+// Given charIdx (an index into text/posMap), return the PM [from, to) span
+// of the word that contains that character.
+//
+// Returns { pmFrom, pmTo }; both are null when charIdx is out of range, does
+// not fall inside a word, or the word has no mapped document positions.
+// Word characters are Unicode letters, combining marks, digits and '_', so
+// accented and non-Latin words are matched whole.  An apostrophe counts as
+// part of the word only when flanked by word characters ("don't", "l’été").
+function wordAtIndex(text, posMap, charIdx) {
+  const wordChar = /[\p{L}\p{M}\p{N}_]/u
+  const isWordChar = (i) => i >= 0 && i < text.length && wordChar.test(text[i])
+  const isInWord = (i) =>
+    isWordChar(i) ||
+    ((text[i] === "'" || text[i] === '\u2019') && isWordChar(i - 1) && isWordChar(i + 1))
+
+  if (!isInWord(charIdx)) return { pmFrom: null, pmTo: null }
+  // Expand backward to word start
+  let start = charIdx
+  while (isInWord(start - 1)) start--
+  // Expand forward to word end (exclusive)
+  let end = charIdx + 1
+  while (isInWord(end)) end++
+  // Map character span → PM positions, skipping synthetic (null) entries
+  let pmFrom = null, pmTo = null
+  for (let i = start; i < end; i++) {
+    const p = posMap[i]
+    if (p !== null && p !== undefined) {
+      if (pmFrom === null) pmFrom = p
+      pmTo = p + 1
+    }
+  }
+  return { pmFrom, pmTo }
+}
+
+// Return the PM span for an entire chunk (used for sentence-level fallback highlight).
+// Trims trailing punctuation/whitespace so the highlight ends at the last real word.
+//
+// `chunk` is { text, startOffset }, where startOffset indexes into `text`.
+// Returns { pmFrom, pmTo }; both are null when the chunk contains no word
+// character with a mapped document position.  "Word character" is
+// Unicode-aware (letters, combining marks, digits, '_'), so a sentence ending
+// in an accented or non-Latin letter keeps that letter in the highlight.
+function chunkPmRange(text, posMap, chunk) {
+  const wordChar = /[\p{L}\p{M}\p{N}_]/u
+  const start = chunk.startOffset
+  // Never look past the end of the text or its position map.
+  let end = Math.min(start + chunk.text.length, text.length, posMap.length)
+  while (end > start && !wordChar.test(text[end - 1])) end--
+  let pmFrom = null, pmTo = null
+  for (let i = start; i < end; i++) {
+    const p = posMap[i]
+    if (p !== null && p !== undefined) {
+      if (pmFrom === null) pmFrom = p
+      pmTo = p + 1
+    }
+  }
+  return { pmFrom, pmTo }
+}
+
+// ──────────────────────────────────────────────────────────────────────────
+
+export default function Toolbar({ editor, onImageUpload, fontSize, onFontSizeChange, onLint, lintStatus, lintCount, applyReadingMark, clearReadingMark }) {
  const fileRef      = useRef()
  const [isReading, setIsReading] = useState(false)
  const isReadingRef = useRef(false)   // ref so closure in next() always sees current value
@ -98,20 +165,27 @@ export default function Toolbar({ editor, onImageUpload, fontSize, onFontSizeCha
  function startReading() {
    if (!('speechSynthesis' in window)) return

-    // Grab text from cursor (or selection start) to end of document
    const { from } = editor.state.selection
    const end = editor.state.doc.content.size
-    const text = editor.state.doc.textBetween(from, end, '\n', ' ').trim()
-    if (!text) return

-    // Split into sentences — Chrome stops an utterance after ~15 s if it's too long
-    const chunks = (
-      text.match(/[^.!?…]+[.!?…]*['"'"]?\s*/g)
-        ?.map(s => s.trim())
-        .filter(Boolean)
-    ) || [text]
+    // Build char array + PM-position map so boundary events can pinpoint words
+    const { text, posMap } = buildReadingMap(editor.state.doc, from, end)

-    speechSynthesis.cancel()   // clear any leftover utterance
+    // Split into sentence-sized chunks (Chrome drops utterances > ~15 s).
+    // Track each chunk's start offset in `text` so onboundary charIndex can be
+    // translated back to an absolute position in the posMap.
+    const chunks = []
+    let m
+    const re = /[^.!?…]+[.!?…]*['"'"]?\s*/g
+    while ((m = re.exec(text)) !== null) {
+      if (m[0].trim()) chunks.push({ text: m[0], startOffset: m.index })
+    }
+    if (!chunks.length) {
+      if (text.trim()) chunks.push({ text, startOffset: 0 })
+      else return
+    }
+
+    speechSynthesis.cancel()
    isReadingRef.current = true
    setIsReading(true)

@ -122,13 +196,56 @@ export default function Toolbar({ editor, onImageUpload, fontSize, onFontSizeCha
      if (!isReadingRef.current || idx >= chunks.length) {
        isReadingRef.current = false
        setIsReading(false)
+        clearReadingMark?.()
        return
      }
-      const u = new SpeechSynthesisUtterance(chunks[idx++])
+      const chunk = chunks[idx++]
+      const u = new SpeechSynthesisUtterance(chunk.text)
      if (chosenVoice) u.voice = chosenVoice
      u.rate = ttsRateRef.current
+
+      // ── Sentence-level highlight (reliable fallback) ──
+      // onstart fires on every utterance in every browser/voice combination.
+      // Highlight the whole sentence immediately so there's always visible
+      // tracking, even when word-boundary events aren't available.
+      u.onstart = () => {
+        const { pmFrom, pmTo } = chunkPmRange(text, posMap, chunk)
+        if (pmFrom !== null) {
+          applyReadingMark?.(pmFrom, pmTo)
+          requestAnimationFrame(() => {
+            editor.view.dom
+              .querySelector('.reading-word')
+              ?.scrollIntoView({ behavior: 'smooth', block: 'center' })
+          })
+        }
+      }
+
+      // ── Word-level highlight (best-effort via boundary events) ──
+      // Many Linux voices / Firefox don't fire onboundary; when they do,
+      // this overrides the sentence highlight with a tighter word highlight.
+      u.onboundary = (e) => {
+        if (e.name === 'sentence') return
+        const textIdx = chunk.startOffset + (e.charIndex ?? 0)
+        const { pmFrom, pmTo } = wordAtIndex(text, posMap, textIdx)
+        if (pmFrom !== null) {
+          applyReadingMark?.(pmFrom, pmTo)
+          requestAnimationFrame(() => {
+            editor.view.dom
+              .querySelector('.reading-word')
+              ?.scrollIntoView({ behavior: 'smooth', block: 'center' })
+          })
+        }
+      }
+
      u.onend = next
-      u.onerror = () => { isReadingRef.current = false; setIsReading(false) }
+      u.onerror = (e) => {
+        // 'interrupted' just means cancel() was called — not a real error
+        if (e.error !== 'interrupted') {
+          isReadingRef.current = false
+          setIsReading(false)
+          clearReadingMark?.()
+        }
+      }
      speechSynthesis.speak(u)
    }
    next()
@ -138,6 +255,7 @@ export default function Toolbar({ editor, onImageUpload, fontSize, onFontSizeCha
    isReadingRef.current = false
    setIsReading(false)
    speechSynthesis.cancel()
+    clearReadingMark?.()
  }

  if (!editor) return null
--- a/frontend/src/styles/index.css
+++ b/frontend/src/styles/index.css
@ -1479,6 +1479,16 @@ button { cursor: pointer; font-family: inherit; }
  50%       { opacity: 0.45; }
 }

+/* ── Read-aloud word highlight ────────────────────────── */
+
+/* Rendered by ReadingMark — highlights the passage currently being read
+   aloud: the whole sentence when an utterance starts, narrowed to a single
+   word when the voice emits boundary events */
+.reading-word {
+  background: rgba(251, 191, 36, 0.35);   /* warm amber */
+  border-radius: 2px;
+  outline: 1px solid rgba(251, 191, 36, 0.55);
+  outline-offset: 1px;
+}
+
 /* ── Spell / Grammar Check ────────────────────────────── */

 /* Wavy underlines on lint-marked text */