// Suanming-Web/knowledge-base.js

const fs = require('fs/promises');
const path = require('path');

const STOPWORDS = new Set([
  '我们',
  '你们',
  '他们',
  '可以',
  '如果',
  '因为',
  '所以',
  '然后',
  '这个',
  '那个',
  '一个',
  '一些',
  '一下',
  '请问',
  '希望',
  '想要',
  '了解',
  '知道',
  '关于',
  '以及',
  '还有',
  '是否',
  '怎么',
  '如何',
  '什么',
  '就是',
  '目前',
  '最近',
  '近期',
  '今天',
  '今年',
  '明年',
  '时候',
  '方面',
  '内容',
  '问题',
]);

function toPositiveInteger(value, fallback) {
  const numeric = Number(value);
  if (!Number.isFinite(numeric) || numeric <= 0) {
    return fallback;
  }
  return Math.floor(numeric);
}

function normalizeText(input = '') {
  return String(input).toLowerCase();
}

function tokenizeQuery(input = '') {
  const normalized = normalizeText(input);
  const tokens = [];

  const latinTokens = normalized.match(/[a-z0-9_]{2,}/g) || [];
  tokens.push(...latinTokens);

  const chineseSegments = normalized
    .replace(/[^\u4e00-\u9fff]+/g, ' ')
    .split(/\s+/)
    .filter(Boolean);

  for (const segment of chineseSegments) {
    if (segment.length <= 4) {
      tokens.push(segment);
      continue;
    }

    const limit = Math.min(segment.length - 1, 40);
    for (let i = 0; i < limit; i += 1) {
      tokens.push(segment.slice(i, i + 2));
    }
    tokens.push(segment.slice(0, 4));
  }

  return [...new Set(tokens)]
    .filter((token) => token.length >= 2 && !STOPWORDS.has(token))
    .slice(0, 180);
}

function splitLongText(text, maxLen) {
  const chunks = [];
  for (let index = 0; index < text.length; index += maxLen) {
    chunks.push(text.slice(index, index + maxLen));
  }
  return chunks;
}

function extractYearsFromText(text = '') {
  const matches = String(text).match(/20\d{2}/g) || [];
  const years = matches
    .map((value) => Number(value))
    .filter((value) => Number.isInteger(value) && value >= 2000 && value <= 2100);

  return [...new Set(years)];
}

function splitDocumentIntoChunks(fileName, content, maxChunkChars) {
  const blocks = String(content)
    .split(/\n{2,}/)
    .map((block) => block.trim())
    .filter(Boolean);

  const chunks = [];
  const defaultHeading = fileName.replace(/\.md$/i, '');

  let heading = defaultHeading;
  let buffer = '';
  let sequence = 0;

  const pushChunk = () => {
    const text = buffer.trim();
    if (!text) {
      buffer = '';
      return;
    }

    sequence += 1;
    chunks.push({
      id: `${fileName}:${sequence}`,
      fileName,
      heading,
      text,
      normalized: normalizeText(text),
      charLength: text.length,
    });

    buffer = '';
  };

  for (const block of blocks) {
    const headingMatch = block.match(/^#{1,6}\s+(.+)$/);
    if (headingMatch) {
      pushChunk();
      heading = headingMatch[1].trim() || heading;
      continue;
    }

    if (!buffer) {
      if (block.length <= maxChunkChars) {
        buffer = block;
        continue;
      }

      const oversizedParts = splitLongText(block, maxChunkChars);
      for (const part of oversizedParts) {
        sequence += 1;
        chunks.push({
          id: `${fileName}:${sequence}`,
          fileName,
          heading,
          text: part,
          normalized: normalizeText(part),
          charLength: part.length,
        });
      }
      continue;
    }

    if (buffer.length + block.length + 2 <= maxChunkChars) {
      buffer += `\n\n${block}`;
      continue;
    }

    pushChunk();

    if (block.length <= maxChunkChars) {
      buffer = block;
      continue;
    }

    const oversizedParts = splitLongText(block, maxChunkChars);
    for (const part of oversizedParts) {
      sequence += 1;
      chunks.push({
        id: `${fileName}:${sequence}`,
        fileName,
        heading,
        text: part,
        normalized: normalizeText(part),
        charLength: part.length,
      });
    }
  }

  pushChunk();
  return chunks;
}

function scoreChunk(chunk, queryTokens, options = {}) {
  let score = 0;

  for (const token of queryTokens) {
    if (!token) continue;

    if (chunk.normalized.includes(token)) {
      if (token.length >= 4) {
        score += 4;
      } else if (token.length === 3) {
        score += 2;
      } else {
        score += 1;
      }
    }

    if (chunk.heading.includes(token)) {
      score += 2;
    }
  }

  const preferredYear = Number(options.preferredYear);
  if (Number.isInteger(preferredYear)) {
    const years = extractYearsFromText(`${chunk.heading}\n${chunk.text}`);
    if (years.length) {
      if (years.includes(preferredYear)) {
        score += 4;
      } else {
        score -= 2;
      }
    }
  }

  return score;
}

class KnowledgeBase {
  constructor(options = {}) {
    this.enabled = options.enabled !== false;
    this.documentsDir = options.documentsDir;
    this.chunkSize = toPositiveInteger(options.chunkSize, 1200);
    this.topK = toPositiveInteger(options.topK, 5);
    this.maxContextChars = toPositiveInteger(options.maxContextChars, 7000);
    this.minScore = toPositiveInteger(options.minScore, 2);
    this.defaultKeywords =
      options.defaultKeywords && options.defaultKeywords.length
        ? options.defaultKeywords
        : ['八字', '命理', '运势', '五行', '开运', '风水'];

    this.docs = [];
    this.chunks = [];
    this.lastLoadedAt = null;
    this.lastError = null;
    this.loadingPromise = null;
  }

  async ensureLoaded(force = false) {
    if (!this.enabled) {
      return;
    }

    if (!force && this.lastLoadedAt && this.chunks.length) {
      return;
    }

    if (this.loadingPromise) {
      await this.loadingPromise;
      return;
    }

    this.loadingPromise = this.loadFromDisk();

    try {
      await this.loadingPromise;
    } finally {
      this.loadingPromise = null;
    }
  }

  async loadFromDisk() {
    try {
      const entries = await fs.readdir(this.documentsDir, { withFileTypes: true });
      const markdownFiles = entries
        .filter((entry) => entry.isFile() && entry.name.toLowerCase().endsWith('.md'))
        .map((entry) => entry.name)
        .sort((a, b) => a.localeCompare(b, 'zh-CN'));

      const docs = [];
      const chunks = [];

      for (const fileName of markdownFiles) {
        const filePath = path.join(this.documentsDir, fileName);
        const content = await fs.readFile(filePath, 'utf8');
        docs.push({ fileName, charLength: content.length });
        chunks.push(...splitDocumentIntoChunks(fileName, content, this.chunkSize));
      }

      this.docs = docs;
      this.chunks = chunks;
      this.lastLoadedAt = new Date();
      this.lastError = null;

      console.log(
        `[knowledge] loaded ${docs.length} docs, ${chunks.length} chunks from ${this.documentsDir}`
      );
    } catch (error) {
      this.lastError = error.message;
      console.error(`[knowledge] load failed: ${error.message}`);
      throw error;
    }
  }

  async getStatus() {
    if (!this.enabled) {
      return {
        enabled: false,
        documents_dir: this.documentsDir,
        docs_count: 0,
        chunks_count: 0,
        last_loaded_at: null,
        last_error: null,
      };
    }

    await this.ensureLoaded();

    return {
      enabled: true,
      documents_dir: this.documentsDir,
      docs_count: this.docs.length,
      chunks_count: this.chunks.length,
      last_loaded_at: this.lastLoadedAt ? this.lastLoadedAt.toISOString() : null,
      last_error: this.lastError,
    };
  }

  async buildContext(queryText, options = {}) {
    if (!this.enabled) {
      return {
        enabled: false,
        text: '',
        references: [],
      };
    }

    await this.ensureLoaded();

    if (!this.chunks.length) {
      return {
        enabled: true,
        text: '',
        references: [],
      };
    }

    const includeFiles = Array.isArray(options.includeFiles)
      ? options.includeFiles.filter(Boolean)
      : null;
    const excludeFiles = Array.isArray(options.excludeFiles)
      ? options.excludeFiles.filter(Boolean)
      : null;
    const topK = toPositiveInteger(options.topK, this.topK);
    const maxContextChars = toPositiveInteger(
      options.maxContextChars,
      this.maxContextChars
    );
    const preferredYear = Number.isInteger(Number(options.preferredYear))
      ? Number(options.preferredYear)
      : null;
    const dropMismatchedYears = options.dropMismatchedYears === true;

    const candidateChunks = this.chunks.filter((chunk) => {
      if (includeFiles && includeFiles.length) {
        const matched = includeFiles.some((fileName) => chunk.fileName.includes(fileName));
        if (!matched) return false;
      }

      if (excludeFiles && excludeFiles.length) {
        const blocked = excludeFiles.some((fileName) => chunk.fileName.includes(fileName));
        if (blocked) return false;
      }

      if (dropMismatchedYears && preferredYear) {
        const years = extractYearsFromText(`${chunk.heading}\n${chunk.text}`);
        if (years.length && !years.includes(preferredYear)) {
          return false;
        }
      }

      return true;
    });

    if (!candidateChunks.length) {
      return {
        enabled: true,
        text: '',
        references: [],
      };
    }

    const queryWithFallback = `${queryText || ''} ${this.defaultKeywords.join(' ')}`;
    const queryTokens = tokenizeQuery(queryWithFallback);

    const ranked = candidateChunks
      .map((chunk) => ({
        chunk,
        score: scoreChunk(chunk, queryTokens, { preferredYear }),
      }))
      .filter((item) => item.score >= this.minScore)
      .sort((a, b) => b.score - a.score);

    const selected = [];
    const selectedIds = new Set();
    let totalChars = 0;

    for (const item of ranked) {
      if (selected.length >= topK) break;
      if (selectedIds.has(item.chunk.id)) continue;

      if (totalChars + item.chunk.charLength > maxContextChars && selected.length > 0) {
        continue;
      }

      selected.push(item);
      selectedIds.add(item.chunk.id);
      totalChars += item.chunk.charLength;
    }

    if (!selected.length) {
      selected.push(
        ...candidateChunks
          .slice(0, Math.min(topK, candidateChunks.length))
          .map((chunk) => ({
            chunk,
            score: 0,
          }))
      );
    }

    const references = selected.map((item) => ({
      file: item.chunk.fileName,
      heading: item.chunk.heading,
      score: item.score,
    }));

    const text = selected
      .map(
        (item, index) =>
          `[资料${index + 1}] 来源: ${item.chunk.fileName} / ${item.chunk.heading}\n${item.chunk.text}`
      )
      .join('\n\n');

    return {
      enabled: true,
      text,
      references,
    };
  }
}

// Only the KnowledgeBase class is exported; the helper functions above stay
// private to this module.
module.exports = {
  KnowledgeBase,
};