Files
Suanming-Web/knowledge-base.js
Kevin Wong 1db55865c0 更新
2026-03-11 14:08:24 +08:00

457 lines
11 KiB
JavaScript

const fs = require('fs/promises');
const path = require('path');
// High-frequency Chinese function words. tokenizeQuery discards any token that
// is exactly equal to one of these entries before scoring.
const STOPWORDS = new Set([
  '我们', '你们', '他们',
  '可以', '如果', '因为', '所以', '然后',
  '这个', '那个', '一个', '一些', '一下',
  '请问', '希望', '想要', '了解', '知道',
  '关于', '以及', '还有', '是否',
  '怎么', '如何', '什么', '就是',
  '目前', '最近', '近期', '今天', '今年', '明年', '时候',
  '方面', '内容', '问题',
]);
/**
 * Coerces a loosely-typed option into a positive integer.
 *
 * The value is converted with `Number()` and rounded down. The positivity check
 * is applied AFTER rounding so that fractional inputs in (0, 1) fall back
 * instead of collapsing to 0, which is not a usable size or count.
 *
 * @param {*} value - Candidate value (number, numeric string, etc.).
 * @param {*} fallback - Returned unchanged when `value` is not usable.
 * @returns {*} The floored integer (always >= 1), or `fallback`.
 */
function toPositiveInteger(value, fallback) {
  const floored = Math.floor(Number(value));
  // NaN and +/-Infinity survive Math.floor unchanged and are rejected here.
  if (!Number.isFinite(floored) || floored < 1) {
    return fallback;
  }
  return floored;
}
/**
 * Produces the canonical form used for matching: the input converted to a
 * string and lower-cased.
 *
 * @param {*} [input=''] - Value to normalize; `undefined` becomes the empty string.
 * @returns {string} Lower-cased string form of `input`.
 */
function normalizeText(input = '') {
  const asString = String(input);
  return asString.toLowerCase();
}
/**
 * Turns a free-text query into a de-duplicated list of search tokens.
 *
 * - Latin/digit/underscore runs of two or more characters become one token each.
 * - Each run of CJK ideographs (U+4E00..U+9FFF) of up to 4 characters is kept
 *   whole; longer runs contribute their first 40 overlapping bigrams followed
 *   by their leading 4 characters.
 * - Tokens shorter than 2 characters and tokens listed in STOPWORDS are dropped.
 * - At most 180 tokens are returned, in first-seen order.
 *
 * @param {*} [input=''] - Query text.
 * @returns {string[]} Unique tokens.
 */
function tokenizeQuery(input = '') {
  const lowered = normalizeText(input);
  const asciiWords = lowered.match(/[a-z0-9_]{2,}/g) || [];

  const hanRuns = lowered
    .replace(/[^\u4e00-\u9fff]+/g, ' ')
    .split(/\s+/)
    .filter(Boolean);

  const hanTokens = hanRuns.flatMap((run) => {
    if (run.length <= 4) {
      return [run];
    }
    const pairCount = Math.min(run.length - 1, 40);
    const pairs = Array.from({ length: pairCount }, (_, start) =>
      run.slice(start, start + 2)
    );
    return [...pairs, run.slice(0, 4)];
  });

  // A Set iterates in insertion order, so first-seen order is preserved.
  const unique = new Set([...asciiWords, ...hanTokens]);
  const kept = [];
  for (const token of unique) {
    if (token.length < 2 || STOPWORDS.has(token)) {
      continue;
    }
    kept.push(token);
    if (kept.length === 180) {
      break;
    }
  }
  return kept;
}
/**
 * Cuts `text` into consecutive pieces of at most `maxLen` UTF-16 code units.
 *
 * A cut is never placed between the two halves of a surrogate pair: the piece
 * is shortened by one unit instead (or, when `maxLen` is 1, lengthened by one
 * so the pair stays together and the loop still advances).
 *
 * @param {string} text - Text to split.
 * @param {number} maxLen - Maximum piece length; fractional values are floored.
 * @returns {string[]} Pieces in order; empty array for empty text.
 * @throws {RangeError} If `maxLen` is NaN or floors to less than 1. Such a
 *   step could never advance through the text.
 */
function splitLongText(text, maxLen) {
  const step = Math.floor(Number(maxLen));
  if (Number.isNaN(step) || step < 1) {
    throw new RangeError(`splitLongText: maxLen must be at least 1, got ${String(maxLen)}`);
  }

  const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code) => code >= 0xdc00 && code <= 0xdfff;

  const chunks = [];
  let index = 0;
  while (index < text.length) {
    let end = Math.min(index + step, text.length);
    const cutsThroughPair =
      end < text.length &&
      isHighSurrogate(text.charCodeAt(end - 1)) &&
      isLowSurrogate(text.charCodeAt(end));
    if (cutsThroughPair) {
      // Prefer a shorter piece; only grow when shrinking would make no progress.
      end = end - 1 > index ? end - 1 : end + 1;
    }
    chunks.push(text.slice(index, end));
    index = end;
  }
  return chunks;
}
/**
 * Collects the distinct four-digit years 2000-2099 mentioned in `text`.
 *
 * A match must not be preceded by another digit, so a "20xx" sequence buried
 * inside a longer number (an order id, a phone number) is not reported as a
 * year. Trailing digits are still allowed so compact dates such as
 * "20250101" keep yielding 2025.
 *
 * @param {*} [text=''] - Text to scan; converted with `String()`.
 * @returns {number[]} Unique years in order of first appearance.
 */
function extractYearsFromText(text = '') {
  const matches = String(text).match(/(?<!\d)20\d{2}/g) || [];
  // The pattern itself bounds every match to 2000..2099, so no range filter is needed.
  return [...new Set(matches.map((value) => Number(value)))];
}
/**
 * Splits one Markdown document into retrieval chunks.
 *
 * The document is divided into blocks at blank lines. Heading lines
 * (`#` .. `######` followed by whitespace and text) at the start of a block
 * close the chunk in progress and become the `heading` of the chunks that
 * follow; any text after the heading line(s) in the same block is treated as
 * body. Body blocks are packed together, joined by a blank line, until adding
 * another would exceed `maxChunkChars`; a single block longer than the limit
 * is cut with `splitLongText`.
 *
 * Line endings are normalized first so CRLF / CR documents split the same way
 * as LF ones. A heading line that appears after body text inside the same
 * block (no blank line before it) is NOT detected and stays in the body.
 *
 * @param {string} fileName - Source file name; used in chunk ids and, minus a
 *   trailing `.md`, as the heading until the first heading line is seen.
 * @param {*} content - Document text; converted with `String()`.
 * @param {number} maxChunkChars - Maximum chunk length in UTF-16 code units.
 * @returns {Array<{id: string, fileName: string, heading: string, text: string,
 *   normalized: string, charLength: number}>} Chunks in document order, with
 *   ids of the form `<fileName>:<1-based sequence>`.
 */
function splitDocumentIntoChunks(fileName, content, maxChunkChars) {
  const headingLinePattern = /^#{1,6}\s+(.+)$/;

  const blocks = String(content)
    .replace(/\r\n?/g, '\n')
    .split(/\n\s*\n/)
    .map((block) => block.trim())
    .filter(Boolean);

  const chunks = [];
  let heading = fileName.replace(/\.md$/i, '');
  let buffer = '';

  // Appends one chunk under the current heading; ids follow emission order.
  const emitChunk = (text) => {
    chunks.push({
      id: `${fileName}:${chunks.length + 1}`,
      fileName,
      heading,
      text,
      normalized: normalizeText(text),
      charLength: text.length,
    });
  };

  // Emits the pending buffer (if it holds any text) and resets it.
  const flushBuffer = () => {
    const text = buffer.trim();
    buffer = '';
    if (text) {
      emitChunk(text);
    }
  };

  for (const block of blocks) {
    let body = block;

    // Peel heading lines off the front of the block. Each one ends the chunk
    // being built so text is never attributed to the wrong heading.
    while (body) {
      const lineEnd = body.indexOf('\n');
      const firstLine = lineEnd === -1 ? body : body.slice(0, lineEnd).trimEnd();
      const headingMatch = firstLine.match(headingLinePattern);
      if (!headingMatch) {
        break;
      }
      flushBuffer();
      heading = headingMatch[1].trim() || heading;
      body = lineEnd === -1 ? '' : body.slice(lineEnd + 1).trim();
    }
    if (!body) {
      continue;
    }

    const fitsAlone = body.length <= maxChunkChars;
    if (!fitsAlone) {
      flushBuffer();
      for (const part of splitLongText(body, maxChunkChars)) {
        emitChunk(part);
      }
      continue;
    }

    // +2 accounts for the blank-line separator inserted between packed blocks.
    const fitsInBuffer = buffer.length + body.length + 2 <= maxChunkChars;
    if (buffer && !fitsInBuffer) {
      flushBuffer();
    }
    buffer = buffer ? `${buffer}\n\n${body}` : body;
  }

  flushBuffer();
  return chunks;
}
/**
 * Scores how well a chunk matches a tokenized query.
 *
 * For every token found in the chunk's normalized text the score grows by
 * 4 (token length >= 4), 2 (length 3) or 1 (shorter). A token found in the
 * heading adds a further 2; the heading is normalized first so the comparison
 * is case-insensitive like the body comparison.
 *
 * When a preferred year is supplied and the chunk mentions at least one year,
 * the score gains 4 if the preferred year is among them and loses 2 otherwise.
 * `null`, `undefined`, `''` and non-positive or non-integer values mean
 * "no preference" and leave the score untouched.
 *
 * @param {{heading: string, text: string, normalized: string}} chunk - Chunk to score.
 * @param {string[]} queryTokens - Lower-cased query tokens.
 * @param {{preferredYear?: (number|string|null)}} [options] - Scoring options.
 * @returns {number} Relevance score; may be negative.
 */
function scoreChunk(chunk, queryTokens, options = {}) {
  const normalizedHeading = normalizeText(chunk.heading);
  let score = 0;

  for (const token of queryTokens) {
    if (!token) continue;
    if (chunk.normalized.includes(token)) {
      if (token.length >= 4) {
        score += 4;
      } else if (token.length === 3) {
        score += 2;
      } else {
        score += 1;
      }
    }
    if (normalizedHeading.includes(token)) {
      score += 2;
    }
  }

  // Number(null) and Number('') are 0, which passes Number.isInteger; rule
  // those out explicitly so an absent preference never penalizes a chunk.
  const rawYear = options.preferredYear;
  const hasYear = rawYear !== null && rawYear !== undefined && rawYear !== '';
  const preferredYear = hasYear ? Number(rawYear) : NaN;
  if (Number.isInteger(preferredYear) && preferredYear > 0) {
    const years = extractYearsFromText(`${chunk.heading}\n${chunk.text}`);
    if (years.length) {
      score += years.includes(preferredYear) ? 4 : -2;
    }
  }
  return score;
}
/**
 * Keyword-scored knowledge base over the Markdown (`.md`) files directly inside
 * one directory. Files are read and chunked on first use and kept in memory.
 */
class KnowledgeBase {
  /**
   * @param {object} [options]
   * @param {boolean} [options.enabled] - Anything other than `false` enables the knowledge base.
   * @param {string} [options.documentsDir] - Directory scanned for `.md` files.
   * @param {number} [options.chunkSize=1200] - Maximum chunk length passed to the chunker.
   * @param {number} [options.topK=5] - Default maximum number of chunks per context.
   * @param {number} [options.maxContextChars=7000] - Default character budget per context.
   * @param {number} [options.minScore=2] - Minimum score for a chunk to be ranked.
   * @param {string[]} [options.defaultKeywords] - Keywords appended to every
   *   query; a built-in list is used when missing or empty.
   */
  constructor(options = {}) {
    this.enabled = options.enabled !== false;
    this.documentsDir = options.documentsDir;
    this.chunkSize = toPositiveInteger(options.chunkSize, 1200);
    this.topK = toPositiveInteger(options.topK, 5);
    this.maxContextChars = toPositiveInteger(options.maxContextChars, 7000);
    this.minScore = toPositiveInteger(options.minScore, 2);
    this.defaultKeywords =
      options.defaultKeywords && options.defaultKeywords.length
        ? options.defaultKeywords
        : ['八字', '命理', '运势', '五行', '开运', '风水'];
    this.docs = [];
    this.chunks = [];
    this.lastLoadedAt = null;
    this.lastError = null;
    this.loadingPromise = null;
  }

  /**
   * Loads the documents unless a previous load already produced chunks.
   * Concurrent callers share the load that is already in flight.
   *
   * @param {boolean} [force=false] - Reload even when chunks are present.
   * @returns {Promise<void>} Rejects with the load error when reading fails.
   */
  async ensureLoaded(force = false) {
    if (!this.enabled) {
      return;
    }
    if (!force && this.lastLoadedAt && this.chunks.length) {
      return;
    }
    if (this.loadingPromise) {
      await this.loadingPromise;
      return;
    }
    this.loadingPromise = this.loadFromDisk();
    try {
      await this.loadingPromise;
    } finally {
      this.loadingPromise = null;
    }
  }

  /**
   * Reads every `.md` file in `documentsDir` (sorted with the zh-CN collation),
   * chunks it, and replaces `docs` / `chunks` only after all files were read.
   *
   * @returns {Promise<void>} Rejects after logging and recording `lastError`.
   */
  async loadFromDisk() {
    try {
      const entries = await fs.readdir(this.documentsDir, { withFileTypes: true });
      const markdownFiles = entries
        .filter((entry) => entry.isFile() && entry.name.toLowerCase().endsWith('.md'))
        .map((entry) => entry.name)
        .sort((a, b) => a.localeCompare(b, 'zh-CN'));

      const docs = [];
      const chunks = [];
      for (const fileName of markdownFiles) {
        const filePath = path.join(this.documentsDir, fileName);
        const content = await fs.readFile(filePath, 'utf8');
        docs.push({ fileName, charLength: content.length });
        // Pushed one at a time: spreading a very large array into push() can
        // exceed the engine's argument limit.
        for (const chunk of splitDocumentIntoChunks(fileName, content, this.chunkSize)) {
          chunks.push(chunk);
        }
      }

      this.docs = docs;
      this.chunks = chunks;
      this.lastLoadedAt = new Date();
      this.lastError = null;
      console.log(
        `[knowledge] loaded ${docs.length} docs, ${chunks.length} chunks from ${this.documentsDir}`
      );
    } catch (error) {
      this.lastError = error.message;
      console.error(`[knowledge] load failed: ${error.message}`);
      throw error;
    }
  }

  /**
   * Reports the current state of the knowledge base, triggering a load if
   * needed. A load failure does not reject; it is reported via `last_error`.
   *
   * @returns {Promise<{enabled: boolean, documents_dir: *, docs_count: number,
   *   chunks_count: number, last_loaded_at: (string|null), last_error: (string|null)}>}
   */
  async getStatus() {
    if (!this.enabled) {
      return {
        enabled: false,
        documents_dir: this.documentsDir,
        docs_count: 0,
        chunks_count: 0,
        last_loaded_at: null,
        last_error: null,
      };
    }
    try {
      await this.ensureLoaded();
    } catch (error) {
      // loadFromDisk has already logged the failure and stored it in
      // lastError; keep going so the status can actually surface it.
      if (!this.lastError) {
        this.lastError = error && error.message ? error.message : String(error);
      }
    }
    return {
      enabled: true,
      documents_dir: this.documentsDir,
      docs_count: this.docs.length,
      chunks_count: this.chunks.length,
      last_loaded_at: this.lastLoadedAt ? this.lastLoadedAt.toISOString() : null,
      last_error: this.lastError,
    };
  }

  /**
   * Builds a context string from the best-matching chunks for a query.
   *
   * Chunks are filtered by file name and (optionally) by year, scored against
   * the query plus the default keywords, and picked best-first until `topK`
   * chunks or the character budget is reached. The first pick is always
   * accepted even if it alone exceeds the budget. When no chunk reaches
   * `minScore`, the leading candidate chunks are used instead, with score 0 and
   * under the same limits.
   *
   * @param {string} queryText - User query.
   * @param {object} [options]
   * @param {string[]} [options.includeFiles] - Keep only chunks whose file name
   *   contains one of these substrings.
   * @param {string[]} [options.excludeFiles] - Drop chunks whose file name
   *   contains one of these substrings.
   * @param {number} [options.topK] - Overrides the instance default.
   * @param {number} [options.maxContextChars] - Overrides the instance default.
   * @param {number|string} [options.preferredYear] - Positive integer year to
   *   favor; `null`, `undefined`, `''` and other values mean no preference.
   * @param {boolean} [options.dropMismatchedYears] - When exactly `true` and a
   *   preferred year is set, drop chunks that mention years but not that one.
   * @returns {Promise<{enabled: boolean, text: string,
   *   references: Array<{file: string, heading: string, score: number}>}>}
   */
  async buildContext(queryText, options = {}) {
    if (!this.enabled) {
      return {
        enabled: false,
        text: '',
        references: [],
      };
    }
    await this.ensureLoaded();
    if (!this.chunks.length) {
      return {
        enabled: true,
        text: '',
        references: [],
      };
    }

    const includeFiles = Array.isArray(options.includeFiles)
      ? options.includeFiles.filter(Boolean)
      : null;
    const excludeFiles = Array.isArray(options.excludeFiles)
      ? options.excludeFiles.filter(Boolean)
      : null;
    const topK = toPositiveInteger(options.topK, this.topK);
    const maxContextChars = toPositiveInteger(
      options.maxContextChars,
      this.maxContextChars
    );

    // Number(null) and Number('') are 0 and would pass Number.isInteger, so an
    // absent year has to be excluded before the numeric conversion.
    const rawYear = options.preferredYear;
    const parsedYear =
      rawYear === null || rawYear === undefined || rawYear === '' ? NaN : Number(rawYear);
    const preferredYear = Number.isInteger(parsedYear) && parsedYear > 0 ? parsedYear : null;
    const dropMismatchedYears = options.dropMismatchedYears === true;

    const candidateChunks = this.chunks.filter((chunk) => {
      if (includeFiles && includeFiles.length) {
        const matched = includeFiles.some((fileName) => chunk.fileName.includes(fileName));
        if (!matched) return false;
      }
      if (excludeFiles && excludeFiles.length) {
        const blocked = excludeFiles.some((fileName) => chunk.fileName.includes(fileName));
        if (blocked) return false;
      }
      if (dropMismatchedYears && preferredYear) {
        const years = extractYearsFromText(`${chunk.heading}\n${chunk.text}`);
        if (years.length && !years.includes(preferredYear)) {
          return false;
        }
      }
      return true;
    });
    if (!candidateChunks.length) {
      return {
        enabled: true,
        text: '',
        references: [],
      };
    }

    const queryWithFallback = `${queryText || ''} ${this.defaultKeywords.join(' ')}`;
    const queryTokens = tokenizeQuery(queryWithFallback);
    // Only forward a year when one was really given, so "no preference" can
    // never be read as year 0 by the scorer.
    const scoreOptions = preferredYear === null ? {} : { preferredYear };
    const ranked = candidateChunks
      .map((chunk) => ({
        chunk,
        score: scoreChunk(chunk, queryTokens, scoreOptions),
      }))
      .filter((item) => item.score >= this.minScore)
      .sort((a, b) => b.score - a.score);

    // Takes items in order up to topK, skipping (not stopping at) any item that
    // would overflow the character budget once something has been picked.
    const pickWithinBudget = (items) => {
      const picked = [];
      const pickedIds = new Set();
      let usedChars = 0;
      for (const item of items) {
        if (picked.length >= topK) break;
        if (pickedIds.has(item.chunk.id)) continue;
        const overBudget = usedChars + item.chunk.charLength > maxContextChars;
        if (overBudget && picked.length > 0) continue;
        picked.push(item);
        pickedIds.add(item.chunk.id);
        usedChars += item.chunk.charLength;
      }
      return picked;
    };

    let selected = pickWithinBudget(ranked);
    if (!selected.length) {
      // Nothing scored high enough: fall back to the leading candidates, still
      // honoring topK and the character budget.
      selected = pickWithinBudget(candidateChunks.map((chunk) => ({ chunk, score: 0 })));
    }

    const references = selected.map((item) => ({
      file: item.chunk.fileName,
      heading: item.chunk.heading,
      score: item.score,
    }));
    const text = selected
      .map(
        (item, index) =>
          `[资料${index + 1}] 来源: ${item.chunk.fileName} / ${item.chunk.heading}\n${item.chunk.text}`
      )
      .join('\n\n');
    return {
      enabled: true,
      text,
      references,
    };
  }
}
// Public CommonJS surface of this module: only the KnowledgeBase class is exported.
module.exports = {
KnowledgeBase,
};