first commit

2026-03-21 18:57:10 +08:00
commit c49aa1a5e9
570 changed files with 107167 additions and 0 deletions
--- a/apps/miniprogram/pages/terminal/terminalSpeakableText.js
+++ b/apps/miniprogram/pages/terminal/terminalSpeakableText.js
@@ -0,0 +1,284 @@
+/* global module, require */
+
+const {
+  DEFAULT_TTS_SPEAKABLE_MAX_CHARS,
+  TTS_SEGMENT_MAX_CHARS,
+  TTS_SEGMENT_MAX_UTF8_BYTES,
+  normalizeTtsSpeakableMaxChars,
+  normalizeTtsSegmentMaxChars,
+  resolveTtsSpeakableUtf8ByteLimit,
+  resolveTtsSegmentUtf8ByteLimit
+} = require("../../utils/ttsSettings");
+
+// ESC followed either by one character in the range "@" through "_" or by a
+// CSI sequence ("[", parameter characters, intermediate characters, final
+// character). Global so a single replace() removes every occurrence.
+const ANSI_ESCAPE_PATTERN = /\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])/g;
+// Any one of the listed box-drawing characters anywhere in the line.
+const BOX_DRAWING_PATTERN = /[┌┐└┘├┤┬┴┼│─╭╮╯╰═║╔╗╚╝]/;
+// Line that starts (after optional whitespace) with a prompt marker ($, #, >,
+// or ">>> ") or with one of the listed command names as a whole word.
+// Case-insensitive.
+const COMMAND_PREFIX_PATTERN =
+  /^\s*(?:[$#>]|>>>\s|(?:cd|ls|pwd|git|npm|pnpm|yarn|bun|node|npx|cat|grep|sed|awk|ssh|scp|rm|mv|cp|mkdir|touch|python|pip|cargo|go|java|docker|kubectl)\b)/i;
+// Tokens typical of source code (arrows, scope/strict-equality/logical
+// operators, braces, brackets, tag openers/closers, semicolons). Global so that
+// match() returns every occurrence for counting.
+const CODE_TOKEN_PATTERN = /(?:=>|::|===|!==|&&|\|\||\{|\}|\[|\]|<\/?|\/>|;)/g;
+// Whole line is a single path-like token: "/..." or "~/...", "./..." or
+// "../...", a Windows drive path, a slash-separated name with at least three
+// components, or a "user@host:path" style remote location.
+const PATH_LINE_PATTERN =
+  /^\s*(?:~?\/\S+|\.{1,2}\/\S+|[A-Za-z]:\\\S+|(?:[A-Za-z0-9._-]+\/){2,}[A-Za-z0-9._-]+|[A-Za-z0-9._-]+@[A-Za-z0-9.-]+:[^\s]+)\s*$/;
+// Whole line is a single http:// or https:// URL.
+const URL_LINE_PATTERN = /^\s*https?:\/\/\S+\s*$/i;
+// Progress indicators anywhere in the line: a percentage, a bracketed bar of at
+// least three "=", ">", ".", "-" or space characters, the word ETA, an "n/m"
+// counter, or the words "spinner" / "loading".
+const PROGRESS_LINE_PATTERN = /(?:\b\d{1,3}%\b|\[[=>.\- ]{3,}\]|\bETA\b|\b\d+\/\d+\b|spinner|loading)/i;
+// Line that begins with one of the prompt glyphs followed by whitespace
+// (presumably the echoed user input row of a Codex-style CLI; confirm).
+const CODEX_INPUT_LINE_PATTERN = /^\s*[›»❯➜]\s+/;
+// A model name (gpt-N, claude, gemini, deepseek, oN, sonnet, haiku, opus)
+// followed later on the same line by "NN% left" / "NN% context left" or by a
+// "~/" path.
+const CODEX_FOOTER_LINE_PATTERN =
+  /\b(?:gpt-\d(?:\.\d+)?|claude(?:-[a-z0-9.-]+)?|gemini(?:-[a-z0-9.-]+)?|deepseek(?:-[a-z0-9.-]+)?|o\d(?:-[a-z0-9.-]+)?|sonnet|haiku|opus)\b.*(?:\b\d{1,3}%\s+(?:left|context left)\b|~\/\S*)/i;
+// Line containing both "NN% left" / "NN% context left" and a "~/" path, in
+// either order, without requiring a model name.
+const CODEX_FOOTER_FRAGMENT_PATTERN =
+  /(?:\b\d{1,3}%\s+(?:left|context left)\b.*~\/\S*|~\/\S*.*\b\d{1,3}%\s+(?:left|context left)\b)/i;
+// Line that starts (after an optional marker glyph) with one of the listed
+// status/tip/error phrases.
+const CODEX_STATUS_LINE_PATTERN =
+  /^\s*(?:[!！⚠■●•]\s*)?(?:Working(?:\s|\(|$)|Tip:|Tips?:|Heads up\b|Conversation interrupted\b|Something went wrong\b|Hit\s+`?\/feedback`?\b|Booting MCP server:|MCP server:)/i;
+// Whole line is a Chinese "in progress" status: it starts with "正在" plus one of
+// the listed verbs, or with a listed verb plus "中", is followed by at most 80
+// characters that are not sentence-ending punctuation, and may end with an
+// ellipsis.
+const CHINESE_STATUS_LINE_PATTERN =
+  /^\s*(?:正在(?:分析|处理|读取|扫描|生成|检查|加载|连接|收集|整理|搜索)|(?:分析|处理|读取|加载|连接|生成)(?:中|中\.\.\.|中…+))[^。！？!?]{0,80}(?:\.\.\.|…+)?\s*$/;
+// At least one character in U+3400..U+9FFF or a run of three or more ASCII
+// letters.
+const NATURAL_TEXT_PATTERN = /[\u3400-\u9fff]|[A-Za-z]{3,}/;
+// Characters counted as "symbols" when measuring symbol density. Global so that
+// match() returns every occurrence for counting.
+const SYMBOL_CHAR_PATTERN = /[\\\/[\]{}()<>_=+*`|#@$%^~]/g;
+// Re-exported alias of the default speakable character limit.
+const MAX_SPEAKABLE_CHARS = DEFAULT_TTS_SPEAKABLE_MAX_CHARS;
+// UTF-8 byte limit resolved for the default character limit.
+// NOTE(review): not referenced or exported in the visible part of this file.
+const MAX_SPEAKABLE_UTF8_BYTES = resolveTtsSpeakableUtf8ByteLimit(DEFAULT_TTS_SPEAKABLE_MAX_CHARS);
+
+function stripTerminalAnsi(text) {
+  return String(text || "")
+    .replace(/\r/g, "")
+    .replace(ANSI_ESCAPE_PATTERN, "");
+}
+
+function normalizeSpeakableLine(line) {
+  return stripTerminalAnsi(line)
+    .replace(/[ \t\f\v]+/g, " ")
+    .replace(/\u00a0/g, " ")
+    .trim();
+}
+
+function cleanSpeakableLine(line) {
+  return String(line || "")
+    .replace(/^\s*(?:(?:[-*+]\s+|[•●○◦▪■·]\s*|\d+[.)、]\s+))/, "")
+    .replace(/`([^`]+)`/g, "$1")
+    .replace(/\s{2,}/g, " ")
+    .trim();
+}
+
+function isCommandLikeLine(line) {
+  return COMMAND_PREFIX_PATTERN.test(line);
+}
+
+function isCodeLikeLine(line) {
+  if (!line) return false;
+  if (/^\s*```/.test(line)) return true;
+  if (/^\s*(?:const|let|var|function|class|import|export|return|if|for|while)\b/.test(line)) return true;
+  const codeTokenCount = (line.match(CODE_TOKEN_PATTERN) || []).length;
+  return codeTokenCount >= 3;
+}
+
+function hasHighSymbolDensity(line) {
+  const visible = String(line || "").replace(/\s/g, "");
+  if (!visible) return false;
+  const symbols = (visible.match(SYMBOL_CHAR_PATTERN) || []).length;
+  return symbols / visible.length >= 0.22;
+}
+
+function isSpeakableLine(line) {
+  if (!line) return false;
+  if (!NATURAL_TEXT_PATTERN.test(line)) return false;
+  if (BOX_DRAWING_PATTERN.test(line)) return false;
+  if (/^[-=_*]{4,}$/.test(line)) return false;
+  if (PROGRESS_LINE_PATTERN.test(line)) return false;
+  if (CODEX_INPUT_LINE_PATTERN.test(line)) return false;
+  if (CODEX_FOOTER_LINE_PATTERN.test(line)) return false;
+  if (CODEX_FOOTER_FRAGMENT_PATTERN.test(line)) return false;
+  if (CODEX_STATUS_LINE_PATTERN.test(line)) return false;
+  if (CHINESE_STATUS_LINE_PATTERN.test(line)) return false;
+  if (PATH_LINE_PATTERN.test(line) || URL_LINE_PATTERN.test(line)) return false;
+  if (isCommandLikeLine(line) || isCodeLikeLine(line)) return false;
+  if (hasHighSymbolDensity(line)) return false;
+  return true;
+}
+
+function collapseSpeakableText(text) {
+  return String(text || "")
+    .replace(/\s*\n\s*/g, " ")
+    .replace(/\s{2,}/g, " ")
+    .replace(/([，。！？；：,.!?;:])\1{1,}/g, "$1")
+    .replace(/([，。！？；：,.!?;:])\s+([A-Za-z\u3400-\u9fff])/g, "$1$2")
+    .replace(/([\u3400-\u9fff])\s+([\u3400-\u9fff])/g, "$1$2")
+    .trim();
+}
+
+function utf8ByteLength(text) {
+  let total = 0;
+  const source = String(text || "");
+  for (const char of source) {
+    const codePoint = char.codePointAt(0) || 0;
+    if (codePoint <= 0x7f) {
+      total += 1;
+    } else if (codePoint <= 0x7ff) {
+      total += 2;
+    } else if (codePoint <= 0xffff) {
+      total += 3;
+    } else {
+      total += 4;
+    }
+  }
+  return total;
+}
+
+function trimSpeakableText(text, maxChars, maxUtf8Bytes) {
+  const source = String(text || "");
+  const charLimit = normalizeTtsSpeakableMaxChars(maxChars);
+  const utf8Limit = Math.max(1, Math.round(Number(maxUtf8Bytes) || resolveTtsSpeakableUtf8ByteLimit(charLimit)));
+  if (source.length <= charLimit && utf8ByteLength(source) <= utf8Limit) {
+    return source;
+  }
+  let result = "";
+  let usedBytes = 0;
+  for (const char of source) {
+    if (result.length >= charLimit) {
+      break;
+    }
+    const nextBytes = utf8ByteLength(char);
+    if (usedBytes + nextBytes > utf8Limit) {
+      break;
+    }
+    result += char;
+    usedBytes += nextBytes;
+  }
+  return result
+    .replace(/[，、；：,.!?;:\s]+$/g, "")
+    .trim();
+}
+
+function splitSpeakableTextForTts(text, options) {
+  const config = options && typeof options === "object" ? options : {};
+  const source = collapseSpeakableText(text);
+  if (!source) {
+    return [];
+  }
+  const maxChars = normalizeTtsSegmentMaxChars(config.maxChars || TTS_SEGMENT_MAX_CHARS);
+  const maxUtf8Bytes = Math.max(
+    1,
+    Math.round(Number(config.maxUtf8Bytes) || resolveTtsSegmentUtf8ByteLimit(maxChars))
+  );
+  const chars = Array.from(source);
+  const segments = [];
+  let cursor = 0;
+
+  /**
+   * 分段策略优先找句号/问号/分号等强断点；
+   * 如果当前窗口里没有完整句子，再退回逗号或空白，避免整段都卡到硬切。
+   */
+  while (cursor < chars.length) {
+    while (cursor < chars.length && /[\s，、；：,.!?;:]/.test(chars[cursor])) {
+      cursor += 1;
+    }
+    if (cursor >= chars.length) {
+      break;
+    }
+    let usedBytes = 0;
+    let end = cursor;
+    let lastStrongBreak = -1;
+    let lastSoftBreak = -1;
+    while (end < chars.length) {
+      const char = chars[end];
+      const nextBytes = utf8ByteLength(char);
+      if (end - cursor >= maxChars || usedBytes + nextBytes > maxUtf8Bytes) {
+        break;
+      }
+      usedBytes += nextBytes;
+      end += 1;
+      if (/[。！？!?；;：:]/.test(char)) {
+        lastStrongBreak = end;
+      } else if (/[，、,.]/.test(char) || /\s/.test(char)) {
+        lastSoftBreak = end;
+      }
+    }
+
+    let nextEnd = end;
+    const consumedChars = end - cursor;
+    const strongBreakFloor = Math.max(12, Math.floor(maxChars * 0.55));
+    const softBreakFloor = Math.max(12, Math.floor(maxChars * 0.45));
+
+    if (end < chars.length) {
+      if (lastStrongBreak >= cursor + strongBreakFloor) {
+        nextEnd = lastStrongBreak;
+      } else if (lastSoftBreak >= cursor + softBreakFloor) {
+        nextEnd = lastSoftBreak;
+      }
+    }
+
+    if (nextEnd <= cursor) {
+      nextEnd = Math.max(cursor + 1, end);
+    }
+
+    const segment = chars.slice(cursor, nextEnd).join("").trim();
+    if (!segment && consumedChars > 0) {
+      segments.push(chars.slice(cursor, end).join("").trim());
+      cursor = end;
+      continue;
+    }
+    if (segment) {
+      segments.push(segment);
+    }
+    cursor = nextEnd;
+  }
+
+  return segments.filter((segment) => !!segment);
+}
+
+/**
+ * 从一轮终端可见输出中抽取“最近一批适合朗读的自然语言”：
+ * 1. 仍然优先保留轮次尾部最近内容，但不再要求必须是单个连续段；
+ * 2. 中间若夹杂代码、路径、状态行，直接跳过并继续向上回溯；
+ * 3. 收口逻辑保持在短文本范围内，避免把整轮历史都送进 TTS。
+ */
+function buildSpeakableTerminalText(source, options) {
+  const config = options && typeof options === "object" ? options : {};
+  const maxChars = normalizeTtsSpeakableMaxChars(config.maxChars);
+  const maxUtf8Bytes = Math.max(
+    1,
+    Math.round(Number(config.maxUtf8Bytes) || resolveTtsSpeakableUtf8ByteLimit(maxChars))
+  );
+  const text = Array.isArray(source) ? source.join("\n") : String(source || "");
+  const normalized = stripTerminalAnsi(text);
+  if (!normalized.trim()) {
+    return "";
+  }
+  const lines = normalized.split(/\n+/).map(normalizeSpeakableLine);
+  const collected = [];
+  let collectedChars = 0;
+  let collectedBytes = 0;
+  for (let index = lines.length - 1; index >= 0; index -= 1) {
+    const line = lines[index];
+    if (!line) {
+      continue;
+    }
+    if (!isSpeakableLine(line)) {
+      continue;
+    }
+    const cleaned = cleanSpeakableLine(line);
+    if (!cleaned) {
+      continue;
+    }
+    const separatorChars = collected.length > 0 ? 1 : 0;
+    const nextChars = cleaned.length + separatorChars;
+    const nextBytes = utf8ByteLength(cleaned) + separatorChars;
+    if (collected.length > 0 && (collectedChars + nextChars > maxChars || collectedBytes + nextBytes > maxUtf8Bytes)) {
+      break;
+    }
+    if (collected.length === 0 && (cleaned.length > maxChars || utf8ByteLength(cleaned) > maxUtf8Bytes)) {
+      collected.unshift(trimSpeakableText(cleaned, maxChars, maxUtf8Bytes));
+      break;
+    }
+    collected.unshift(cleaned);
+    collectedChars += nextChars;
+    collectedBytes += nextBytes;
+  }
+  return trimSpeakableText(collapseSpeakableText(collected.join("\n")), maxChars, maxUtf8Bytes);
+}
+
+function isSpeakableTextLikelyComplete(text) {
+  return /(?:[。！？!?：:]|\.{1}|。{1})\s*$/.test(String(text || "").trim());
+}
+
+// Public surface of this module: the default character limit, the extractor
+// for speakable terminal text, the sentence-completeness check, the TTS
+// segment splitter and the ANSI stripper. The line-classification helpers and
+// MAX_SPEAKABLE_UTF8_BYTES are not exported.
+module.exports = {
+  MAX_SPEAKABLE_CHARS,
+  buildSpeakableTerminalText,
+  isSpeakableTextLikelyComplete,
+  splitSpeakableTextForTts,
+  stripTerminalAnsi
+};