457 lines
11 KiB
JavaScript
457 lines
11 KiB
JavaScript
const fs = require('fs/promises');
|
|
const path = require('path');
|
|
|
|
/**
 * Common Chinese filler words that carry no retrieval signal.
 * tokenizeQuery() drops any token found in this set.
 */
const STOPWORDS = new Set([
  '我们', '你们', '他们', '可以', '如果', '因为',
  '所以', '然后', '这个', '那个', '一个', '一些',
  '一下', '请问', '希望', '想要', '了解', '知道',
  '关于', '以及', '还有', '是否', '怎么', '如何',
  '什么', '就是', '目前', '最近', '近期', '今天',
  '今年', '明年', '时候', '方面', '内容', '问题',
]);
|
|
|
|
/**
 * Coerces a loosely-typed setting to a positive integer.
 *
 * The value is converted with Number() and rounded down. If the result is not
 * a finite integer >= 1, `fallback` is returned unchanged.
 *
 * @param {*} value - Candidate value (number, numeric string, undefined, ...).
 * @param {*} fallback - Returned as-is when `value` is unusable.
 * @returns {*} The floored positive integer, or `fallback`.
 */
function toPositiveInteger(value, fallback) {
  // Floor BEFORE validating: 0.5 is finite and > 0 but floors to 0, which is
  // not a positive integer and must fall back.
  const whole = Math.floor(Number(value));
  if (!Number.isFinite(whole) || whole < 1) {
    return fallback;
  }
  return whole;
}
|
|
|
|
/**
 * Produces the canonical form used for matching: the input converted to a
 * string and lower-cased.
 *
 * @param {*} [input=''] - Any value; it is stringified with String().
 * @returns {string} Lower-cased string form of `input`.
 */
function normalizeText(input = '') {
  const asString = String(input);
  return asString.toLowerCase();
}
|
|
|
|
/**
 * Turns a free-text query into a de-duplicated list of search tokens.
 *
 * - Runs of two or more `[a-z0-9_]` characters (after normalizeText) become
 *   tokens as-is.
 * - Each run of CJK ideographs (U+4E00-U+9FFF) of up to four characters is a
 *   single token. Longer runs contribute their overlapping 2-character
 *   windows (at most 40 per run) plus their first four characters.
 *
 * Tokens shorter than two characters and tokens present in STOPWORDS are
 * discarded. At most 180 tokens are returned, in first-seen order.
 *
 * @param {*} [input=''] - Query text.
 * @returns {string[]} Unique tokens.
 */
function tokenizeQuery(input = '') {
  const lowered = normalizeText(input);

  // Start with the Latin/digit words, then append the CJK-derived tokens.
  const collected = [...(lowered.match(/[a-z0-9_]{2,}/g) || [])];

  const cjkOnly = lowered.replace(/[^\u4e00-\u9fff]+/g, ' ');
  const cjkRuns = cjkOnly.split(/\s+/).filter(Boolean);

  cjkRuns.forEach((run) => {
    if (run.length > 4) {
      const windowCount = Math.min(run.length - 1, 40);
      for (let start = 0; start < windowCount; start += 1) {
        collected.push(run.slice(start, start + 2));
      }
      collected.push(run.slice(0, 4));
    } else {
      collected.push(run);
    }
  });

  // Single pass: de-duplicate, drop short tokens / stopwords, cap at 180.
  const seen = new Set();
  const result = [];
  for (const token of collected) {
    if (result.length >= 180) break;
    if (seen.has(token)) continue;
    seen.add(token);
    if (token.length < 2 || STOPWORDS.has(token)) continue;
    result.push(token);
  }
  return result;
}
|
|
|
|
/**
 * Cuts `text` into consecutive pieces of at most `maxLen` UTF-16 code units.
 *
 * Concatenating the returned pieces always reproduces the input.
 *
 * @param {string} text - Text to split; null/undefined is treated as empty.
 * @param {number} maxLen - Maximum piece length. It is rounded down; if the
 *   result is not a finite number >= 1 the text is returned as one piece
 *   rather than looping forever.
 * @returns {string[]} The pieces, in order (empty array for empty text).
 */
function splitLongText(text, maxLen) {
  const source = text == null ? '' : String(text);
  const size = Math.floor(Number(maxLen));

  // A zero/negative/NaN step would never advance the cursor below.
  if (!Number.isFinite(size) || size < 1) {
    return source ? [source] : [];
  }

  const chunks = [];
  let index = 0;
  while (index < source.length) {
    let end = Math.min(index + size, source.length);

    // Do not cut between the two halves of a surrogate pair: back off by one
    // code unit so the pair stays together in the next piece. Only possible
    // when this piece would still be non-empty afterwards.
    if (end < source.length && end - index > 1) {
      const before = source.charCodeAt(end - 1);
      const after = source.charCodeAt(end);
      const isHighSurrogate = before >= 0xd800 && before <= 0xdbff;
      const isLowSurrogate = after >= 0xdc00 && after <= 0xdfff;
      if (isHighSurrogate && isLowSurrogate) {
        end -= 1;
      }
    }

    chunks.push(source.slice(index, end));
    index = end;
  }
  return chunks;
}
|
|
|
|
/**
 * Collects the distinct four-digit years of the form 20xx that occur in
 * `text`, in order of first appearance.
 *
 * @param {*} [text=''] - Text to scan; it is stringified with String().
 * @returns {number[]} Unique years between 2000 and 2100.
 */
function extractYearsFromText(text = '') {
  const found = new Set();
  const candidates = String(text).match(/20\d{2}/g) || [];

  for (const candidate of candidates) {
    const year = Number(candidate);
    const inRange = year >= 2000 && year <= 2100;
    if (Number.isInteger(year) && inRange) {
      found.add(year);
    }
  }

  return Array.from(found);
}
|
|
|
|
/**
 * Splits one Markdown document into retrieval chunks.
 *
 * The content is divided into blocks at blank lines. Heading lines (`#` to
 * `######`) at the start of a block close the current chunk and become the
 * heading recorded on the chunks that follow; any text after them in the same
 * block is treated as body text. Consecutive body blocks are merged (joined
 * by a blank line) while the merged text stays within `maxChunkChars`; a
 * single block longer than the limit is cut with splitLongText().
 *
 * A heading line in the middle of a block (not preceded by a blank line) is
 * not detected and stays part of the chunk text.
 *
 * @param {string} fileName - Source file name; used for chunk ids and, with a
 *   trailing `.md` removed, as the heading until the first heading is seen.
 * @param {*} content - Document text; null/undefined is treated as empty.
 * @param {number} maxChunkChars - Maximum chunk length. A value that is not a
 *   finite number >= 1 disables the limit.
 * @returns {Array<{id: string, fileName: string, heading: string,
 *   text: string, normalized: string, charLength: number}>} Chunks in
 *   document order; ids are `${fileName}:${n}` with n starting at 1.
 */
function splitDocumentIntoChunks(fileName, content, maxChunkChars) {
  const requestedLimit = Number(maxChunkChars);
  const limit =
    Number.isFinite(requestedLimit) && requestedLimit >= 1
      ? Math.floor(requestedLimit)
      : Infinity;

  const blocks = (content == null ? '' : String(content))
    // Normalise CRLF / lone CR first; otherwise "\r\n\r\n" never matches the
    // blank-line separator and the whole file becomes one block.
    .replace(/\r\n?/g, '\n')
    .split(/\n{2,}/)
    .map((block) => block.trim())
    .filter(Boolean);

  const chunks = [];
  let heading = fileName.replace(/\.md$/i, '');
  let buffer = '';

  // Appends one chunk under the current heading.
  const emitChunk = (text) => {
    chunks.push({
      id: `${fileName}:${chunks.length + 1}`,
      fileName,
      heading,
      text,
      normalized: normalizeText(text),
      charLength: text.length,
    });
  };

  // Emits the accumulated buffer (if it holds any text) and resets it.
  const flushBuffer = () => {
    const text = buffer.trim();
    buffer = '';
    if (text) {
      emitChunk(text);
    }
  };

  for (const block of blocks) {
    // Consume heading lines at the top of the block. Checking line by line
    // (instead of the whole block) also catches "# Title\nbody" where the
    // heading is not followed by a blank line.
    const lines = block.split('\n');
    let cursor = 0;
    while (cursor < lines.length) {
      const headingMatch = lines[cursor].match(/^#{1,6}\s+(.+)$/);
      if (!headingMatch) break;
      flushBuffer(); // text gathered so far belongs to the previous heading
      heading = headingMatch[1].trim() || heading;
      cursor += 1;
    }

    const body = cursor === 0 ? block : lines.slice(cursor).join('\n').trim();
    if (!body) {
      continue;
    }

    // +2 accounts for the "\n\n" separator inserted between merged blocks.
    if (buffer && buffer.length + body.length + 2 <= limit) {
      buffer += `\n\n${body}`;
      continue;
    }

    flushBuffer();

    if (body.length <= limit) {
      buffer = body;
      continue;
    }

    // Oversized block: emit its pieces directly, never merging with neighbours.
    for (const part of splitLongText(body, limit)) {
      emitChunk(part);
    }
  }

  flushBuffer();
  return chunks;
}
|
|
|
|
/**
 * Scores how well a chunk matches a set of query tokens.
 *
 * For every non-empty token:
 *   - found in `chunk.normalized`: +4 (length >= 4), +2 (length 3), +1 (shorter)
 *   - found in the heading (compared case-insensitively): +2
 *
 * When `options.preferredYear` is a positive integer and the chunk's heading
 * or text mentions at least one year, the score gets +4 if the preferred year
 * is among them and -2 otherwise. A missing, null, empty or non-positive
 * preferredYear means "no preference" and never changes the score.
 *
 * @param {{normalized: string, heading: string, text: string}} chunk
 * @param {string[]} queryTokens - Tokens, expected to be lower-case.
 * @param {{preferredYear?: (number|string|null)}} [options]
 * @returns {number} Relevance score (may be negative).
 */
function scoreChunk(chunk, queryTokens, options = {}) {
  const settings = options || {};

  // chunk.normalized is already lower-cased; lower-case the heading as well so
  // lower-case Latin tokens can match headings such as "BaZi".
  const heading = String(chunk.heading || '').toLowerCase();

  let score = 0;

  for (const token of queryTokens) {
    if (!token) continue;

    if (chunk.normalized.includes(token)) {
      if (token.length >= 4) {
        score += 4;
      } else if (token.length === 3) {
        score += 2;
      } else {
        score += 1;
      }
    }

    if (heading.includes(token)) {
      score += 2;
    }
  }

  // Number(null) and Number('') are 0, which is an integer. Requiring > 0
  // keeps "no preferred year" from penalising every chunk that mentions one.
  const preferredYear = Number(settings.preferredYear);
  if (Number.isInteger(preferredYear) && preferredYear > 0) {
    const years = extractYearsFromText(`${chunk.heading}\n${chunk.text}`);
    if (years.length) {
      score += years.includes(preferredYear) ? 4 : -2;
    }
  }

  return score;
}
|
|
|
|
/**
 * Keyword-scored knowledge base over the Markdown files of one directory.
 *
 * Files are read and chunked on first use (see ensureLoaded) and kept in
 * memory; buildContext() ranks the chunks against a query and returns the
 * best ones as a single text block plus a list of references.
 */
class KnowledgeBase {
  /**
   * @param {object} [options]
   * @param {boolean} [options.enabled] - Anything other than `false` enables the knowledge base.
   * @param {string} [options.documentsDir] - Directory whose `.md` files are indexed.
   * @param {number} [options.chunkSize=1200] - Maximum characters per chunk.
   * @param {number} [options.topK=5] - Default number of chunks returned by buildContext.
   * @param {number} [options.maxContextChars=7000] - Default character budget for buildContext.
   * @param {number} [options.minScore=2] - Minimum score for a chunk to be ranked.
   * @param {string[]} [options.defaultKeywords] - Keywords appended to every query;
   *   a built-in list is used when this is missing, empty or not an array.
   */
  constructor(options = {}) {
    const settings = options || {};

    this.enabled = settings.enabled !== false;
    this.documentsDir = settings.documentsDir;
    this.chunkSize = toPositiveInteger(settings.chunkSize, 1200);
    this.topK = toPositiveInteger(settings.topK, 5);
    this.maxContextChars = toPositiveInteger(settings.maxContextChars, 7000);
    this.minScore = toPositiveInteger(settings.minScore, 2);
    // Array.isArray: a plain string also has a length but would break the
    // `.join(' ')` in buildContext.
    this.defaultKeywords =
      Array.isArray(settings.defaultKeywords) && settings.defaultKeywords.length
        ? settings.defaultKeywords
        : ['八字', '命理', '运势', '五行', '开运', '风水'];

    this.docs = [];
    this.chunks = [];
    this.lastLoadedAt = null;
    this.lastError = null;
    this.loadingPromise = null;
  }

  /**
   * Loads the documents unless they are already in memory.
   *
   * Does nothing when disabled, or when a previous load produced chunks and
   * `force` is false. Calls made while a load is in flight wait for that
   * load instead of starting another one.
   *
   * @param {boolean} [force=false] - Reload even if chunks are present.
   * @returns {Promise<void>} Rejects with the error thrown by loadFromDisk.
   */
  async ensureLoaded(force = false) {
    if (!this.enabled) {
      return;
    }

    if (!force && this.lastLoadedAt && this.chunks.length) {
      return;
    }

    if (this.loadingPromise) {
      await this.loadingPromise;
      return;
    }

    this.loadingPromise = this.loadFromDisk();

    try {
      await this.loadingPromise;
    } finally {
      // Clear even on failure so a later call can retry.
      this.loadingPromise = null;
    }
  }

  /**
   * Reads every `.md` file directly inside `documentsDir` (extension matched
   * case-insensitively, names sorted with zh-CN collation), chunks them and
   * replaces the in-memory index.
   *
   * The index is only replaced after all files were read successfully. On
   * failure the message is stored in `lastError`, logged, and the error is
   * rethrown.
   *
   * @returns {Promise<void>}
   */
  async loadFromDisk() {
    try {
      const entries = await fs.readdir(this.documentsDir, { withFileTypes: true });
      const markdownFiles = entries
        .filter((entry) => entry.isFile() && entry.name.toLowerCase().endsWith('.md'))
        .map((entry) => entry.name)
        .sort((a, b) => a.localeCompare(b, 'zh-CN'));

      const docs = [];
      const chunks = [];

      for (const fileName of markdownFiles) {
        const filePath = path.join(this.documentsDir, fileName);
        const content = await fs.readFile(filePath, 'utf8');
        docs.push({ fileName, charLength: content.length });
        chunks.push(...splitDocumentIntoChunks(fileName, content, this.chunkSize));
      }

      this.docs = docs;
      this.chunks = chunks;
      this.lastLoadedAt = new Date();
      this.lastError = null;

      console.log(
        `[knowledge] loaded ${docs.length} docs, ${chunks.length} chunks from ${this.documentsDir}`
      );
    } catch (error) {
      this.lastError = error.message;
      console.error(`[knowledge] load failed: ${error.message}`);
      throw error;
    }
  }

  /**
   * Reports the state of the index, loading it first if necessary.
   *
   * @returns {Promise<{enabled: boolean, documents_dir: string, docs_count: number,
   *   chunks_count: number, last_loaded_at: (string|null), last_error: (string|null)}>}
   *   Rejects if the load triggered by this call fails.
   */
  async getStatus() {
    if (!this.enabled) {
      return {
        enabled: false,
        documents_dir: this.documentsDir,
        docs_count: 0,
        chunks_count: 0,
        last_loaded_at: null,
        last_error: null,
      };
    }

    await this.ensureLoaded();

    return {
      enabled: true,
      documents_dir: this.documentsDir,
      docs_count: this.docs.length,
      chunks_count: this.chunks.length,
      last_loaded_at: this.lastLoadedAt ? this.lastLoadedAt.toISOString() : null,
      last_error: this.lastError,
    };
  }

  /**
   * Selects the chunks most relevant to `queryText` and formats them as one
   * context string.
   *
   * Chunks are filtered by file name and (optionally) by year, scored with
   * scoreChunk() against the query plus the default keywords, and taken in
   * descending score order until `topK` chunks are selected; a chunk that
   * would push the total past `maxContextChars` is skipped, except that the
   * first selected chunk is always accepted. If no chunk reaches `minScore`,
   * the leading candidate chunks are used instead (score 0) under the same
   * topK and size limits.
   *
   * @param {string} queryText - User query; may be empty.
   * @param {object} [options]
   * @param {string[]} [options.includeFiles] - Keep only chunks whose file name contains one of these.
   * @param {string[]} [options.excludeFiles] - Drop chunks whose file name contains one of these.
   * @param {number} [options.topK] - Overrides the instance default.
   * @param {number} [options.maxContextChars] - Overrides the instance default.
   * @param {number|string} [options.preferredYear] - Positive integer year to favour;
   *   anything else means no preference.
   * @param {boolean} [options.dropMismatchedYears] - When exactly `true` and a
   *   preferred year is set, drop chunks that mention years but not that one.
   * @returns {Promise<{enabled: boolean, text: string,
   *   references: Array<{file: string, heading: string, score: number}>}>}
   */
  async buildContext(queryText, options = {}) {
    if (!this.enabled) {
      return { enabled: false, text: '', references: [] };
    }

    await this.ensureLoaded();

    if (!this.chunks.length) {
      return { enabled: true, text: '', references: [] };
    }

    const settings = options || {};
    const includeFiles = Array.isArray(settings.includeFiles)
      ? settings.includeFiles.filter(Boolean)
      : [];
    const excludeFiles = Array.isArray(settings.excludeFiles)
      ? settings.excludeFiles.filter(Boolean)
      : [];
    const topK = toPositiveInteger(settings.topK, this.topK);
    const maxContextChars = toPositiveInteger(settings.maxContextChars, this.maxContextChars);

    // Number(null) and Number('') are 0 (an integer), so also require > 0;
    // otherwise "no year" would be treated as the year 0.
    const requestedYear = Number(settings.preferredYear);
    const preferredYear =
      Number.isInteger(requestedYear) && requestedYear > 0 ? requestedYear : null;
    const dropMismatchedYears = settings.dropMismatchedYears === true;

    const nameMatchesAny = (fileName, patterns) =>
      patterns.some((pattern) => fileName.includes(pattern));

    const candidateChunks = this.chunks.filter((chunk) => {
      if (includeFiles.length && !nameMatchesAny(chunk.fileName, includeFiles)) {
        return false;
      }
      if (excludeFiles.length && nameMatchesAny(chunk.fileName, excludeFiles)) {
        return false;
      }
      if (dropMismatchedYears && preferredYear !== null) {
        const years = extractYearsFromText(`${chunk.heading}\n${chunk.text}`);
        if (years.length && !years.includes(preferredYear)) {
          return false;
        }
      }
      return true;
    });

    if (!candidateChunks.length) {
      return { enabled: true, text: '', references: [] };
    }

    const queryWithFallback = `${queryText || ''} ${this.defaultKeywords.join(' ')}`;
    const queryTokens = tokenizeQuery(queryWithFallback);

    // Only forward a real year: passing `preferredYear: null` would be coerced
    // to 0 by Number() in the scorer and penalise chunks that mention any year.
    const scoringOptions = preferredYear === null ? {} : { preferredYear };

    const ranked = candidateChunks
      .map((chunk) => ({
        chunk,
        score: scoreChunk(chunk, queryTokens, scoringOptions),
      }))
      .filter((item) => item.score >= this.minScore)
      .sort((a, b) => b.score - a.score);

    // Takes items in order until topK are chosen, skipping any that would
    // exceed the character budget. The first pick is always accepted so the
    // result is never empty just because one chunk is large.
    const pickWithinBudget = (items) => {
      const picked = [];
      let totalChars = 0;
      for (const item of items) {
        if (picked.length >= topK) break;
        if (picked.length > 0 && totalChars + item.chunk.charLength > maxContextChars) {
          continue;
        }
        picked.push(item);
        totalChars += item.chunk.charLength;
      }
      return picked;
    };

    let selected = pickWithinBudget(ranked);
    if (!selected.length) {
      // Nothing scored high enough: fall back to the leading candidates,
      // still honouring topK and the character budget.
      selected = pickWithinBudget(candidateChunks.map((chunk) => ({ chunk, score: 0 })));
    }

    const references = selected.map((item) => ({
      file: item.chunk.fileName,
      heading: item.chunk.heading,
      score: item.score,
    }));

    const text = selected
      .map(
        (item, index) =>
          `[资料${index + 1}] 来源: ${item.chunk.fileName} / ${item.chunk.heading}\n${item.chunk.text}`
      )
      .join('\n\n');

    return { enabled: true, text, references };
  }
}
|
|
|
|
module.exports = {
|
|
KnowledgeBase,
|
|
};
|