aps更新
This commit is contained in:
@@ -7,6 +7,8 @@ import { config, datasets } from './config.js';
|
||||
import { sendLoginAlert, sendRuntimeErrorAlert } from './notify.js';
|
||||
import {
|
||||
closeDbPool,
|
||||
getExistingMessageIds,
|
||||
getExistingMessageFingerprints,
|
||||
getLatestBillConsumptionTimeFromDb,
|
||||
getLatestMessageTimeFromDb,
|
||||
getLatestOrderTimeFromDb,
|
||||
@@ -30,6 +32,7 @@ import {
|
||||
} from './storage.js';
|
||||
|
||||
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
|
||||
const scheduleEventFile = path.join(config.dataDir, 'runs', 'schedule-events.jsonl');
|
||||
|
||||
let _context = null;
|
||||
let _runtimeController = null;
|
||||
@@ -37,6 +40,15 @@ let _browser = null;
|
||||
let _isAttachedBrowser = false;
|
||||
const runningJobs = new Set();
|
||||
|
||||
function recordScheduleEvent(payload) {
|
||||
try {
|
||||
fs.mkdirSync(path.dirname(scheduleEventFile), { recursive: true });
|
||||
fs.appendFileSync(scheduleEventFile, `${JSON.stringify({ at: new Date().toISOString(), ...payload })}\n`, 'utf8');
|
||||
} catch (error) {
|
||||
console.warn(`[schedule-event] 写入失败: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
const AUTH_PAGE_KEYWORDS = [
|
||||
'RAM 用户登录',
|
||||
'主账号登录',
|
||||
@@ -318,6 +330,12 @@ function subtractDays(dateValue, days) {
|
||||
return next;
|
||||
}
|
||||
|
||||
function subtractMonths(dateValue, months) {
|
||||
const next = new Date(dateValue);
|
||||
next.setMonth(next.getMonth() - months);
|
||||
return next;
|
||||
}
|
||||
|
||||
function randomIntBetween(min, max) {
|
||||
return Math.floor(Math.random() * (max - min + 1)) + min;
|
||||
}
|
||||
@@ -505,7 +523,7 @@ async function captureErrorArtifacts(page, metadata = {}) {
|
||||
fs.writeFileSync(jsonPath, JSON.stringify(payload, null, 2));
|
||||
|
||||
let screenshotSaved = false;
|
||||
if (page) {
|
||||
if (page && !page.isClosed?.()) {
|
||||
try {
|
||||
await page.screenshot({ path: screenshotPath, fullPage: true, timeout: 5000, animations: 'disabled' });
|
||||
screenshotSaved = true;
|
||||
@@ -679,7 +697,9 @@ export async function syncBillsOnly(options = {}) {
|
||||
await reportRuntimeError(error, page, { label: 'syncBillsOnly', dataset: 'bills', mode: options.incremental ? 'incremental' : 'full' });
|
||||
throw error;
|
||||
} finally {
|
||||
if (config.closeBrowser) {
|
||||
if (options.keepBrowserOpen === true) {
|
||||
console.log('浏览器保持运行(schedule bills)');
|
||||
} else if (config.closeBrowser) {
|
||||
await closeContextIfNeeded();
|
||||
} else {
|
||||
console.log('浏览器保持运行');
|
||||
@@ -755,13 +775,28 @@ export async function syncMessagesOnly(options = {}) {
|
||||
}
|
||||
|
||||
export async function scheduleSync() {
|
||||
console.log(`定时任务已启动: normal=${config.cron}, hot=${config.hotCron} (${config.timezone})`);
|
||||
console.log(`定时任务已启动: bills=${config.cron}, hot=${config.hotCron} (${config.timezone})`);
|
||||
setInterval(() => {
|
||||
console.log(`[${new Date().toISOString()}] 定时守护存活中: bills=${config.cron}, hot=${config.hotCron}, mode=${config.scheduleMode}`);
|
||||
}, 60 * 1000);
|
||||
cron.schedule(
|
||||
config.cron,
|
||||
async () => {
|
||||
if (config.scheduleMode === 'hot') {
|
||||
return;
|
||||
return runLockedJob('schedule-shared', async () => {
|
||||
try {
|
||||
recordScheduleEvent({ track: 'bills', status: 'started', mode: 'bills-incremental' });
|
||||
console.log(`[${new Date().toISOString()}] 开始执行账单定时同步 mode=bills-incremental`);
|
||||
const summary = await syncBillsOnly({ incremental: true, keepBrowserOpen: true });
|
||||
recordScheduleEvent({ track: 'bills', status: 'completed', mode: 'bills-incremental', summary });
|
||||
console.log(`[${new Date().toISOString()}] 账单定时同步完成`, JSON.stringify(summary, null, 2));
|
||||
} catch (error) {
|
||||
recordScheduleEvent({ track: 'bills', status: 'failed', mode: 'bills-incremental', error: error.message });
|
||||
console.error(`[${new Date().toISOString()}] 账单定时同步失败`, error);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
console.log(`[${new Date().toISOString()}] 开始执行同步 mode=${config.scheduleMode}`);
|
||||
const summary = config.scheduleMode === 'full'
|
||||
@@ -781,13 +816,22 @@ export async function scheduleSync() {
|
||||
if (config.scheduleMode !== 'hot') {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
console.log(`[${new Date().toISOString()}] 开始执行高频同步 mode=hot`);
|
||||
const summary = await syncHot();
|
||||
console.log(`[${new Date().toISOString()}] 高频同步完成`, JSON.stringify(summary, null, 2));
|
||||
} catch (error) {
|
||||
console.error(`[${new Date().toISOString()}] 高频同步失败`, error);
|
||||
}
|
||||
return runLockedJob('schedule-shared', async () => {
|
||||
try {
|
||||
recordScheduleEvent({ track: 'hot', status: 'started', mode: 'hot' });
|
||||
console.log(`[${new Date().toISOString()}] 开始执行高频同步 mode=hot`);
|
||||
const summary = await syncHot({ keepBrowserOpen: true });
|
||||
if (summary?.skipped) {
|
||||
recordScheduleEvent({ track: 'hot', status: 'skipped', mode: 'hot', reason: summary.reason || 'already_running' });
|
||||
} else {
|
||||
recordScheduleEvent({ track: 'hot', status: 'completed', mode: 'hot', summary });
|
||||
}
|
||||
console.log(`[${new Date().toISOString()}] 高频同步完成`, JSON.stringify(summary, null, 2));
|
||||
} catch (error) {
|
||||
recordScheduleEvent({ track: 'hot', status: 'failed', mode: 'hot', error: error.message });
|
||||
console.error(`[${new Date().toISOString()}] 高频同步失败`, error);
|
||||
}
|
||||
});
|
||||
},
|
||||
{ timezone: config.timezone },
|
||||
);
|
||||
@@ -826,7 +870,9 @@ export async function syncHot(options = {}) {
|
||||
await reportRuntimeError(error, page, { label: 'syncHot', dataset: 'hot', mode: 'hot' });
|
||||
throw error;
|
||||
} finally {
|
||||
if (config.closeBrowser) {
|
||||
if (options.keepBrowserOpen === true) {
|
||||
console.log('浏览器保持运行(schedule hot)');
|
||||
} else if (config.closeBrowser) {
|
||||
await closeContextIfNeeded();
|
||||
} else {
|
||||
console.log('浏览器保持运行');
|
||||
@@ -1274,6 +1320,7 @@ async function syncMessages(page, options = {}) {
|
||||
await runtimeCheckpoint('同步消息');
|
||||
const dataset = datasets.messages;
|
||||
const { incremental = false, resume = false, hot = false } = options;
|
||||
const fullSyncWatermark = !incremental && !hot ? subtractMonths(new Date(), 3) : null;
|
||||
await page.goto(dataset.url, { waitUntil: 'domcontentloaded' });
|
||||
await waitUntilReady(page, dataset.heading);
|
||||
await trySetPageSize(page, dataset.pageSize);
|
||||
@@ -1283,17 +1330,36 @@ async function syncMessages(page, options = {}) {
|
||||
let shouldContinueScrape = true;
|
||||
let allNormalizedRecords = Array.isArray(resumeCheckpoint?.records) ? resumeCheckpoint.records : [];
|
||||
|
||||
const shouldStopForFullSyncPage = (pageRows) => {
|
||||
if (!fullSyncWatermark) {
|
||||
return false;
|
||||
}
|
||||
const normalizedPageRows = normalizeDatasetRecords(dataset, pageRows, {});
|
||||
const pageTimeStats = getMessagePageTimeStats(normalizedPageRows);
|
||||
console.log(`[全量模式] 当前页时间范围: parsed=${pageTimeStats.parsed}/${pageTimeStats.total}, earliest=${pageTimeStats.earliest || 'N/A'}, latest=${pageTimeStats.latest || 'N/A'}, watermark=${formatDateTime(fullSyncWatermark)}`);
|
||||
return normalizedPageRows.length > 0
|
||||
&& normalizedPageRows.every((record) => !isAfterLatestMessageTime(record, fullSyncWatermark));
|
||||
};
|
||||
|
||||
if (resumeFromPage > 0) {
|
||||
console.log(`[消息续爬] 从 checkpoint 恢复: page=${resumeFromPage}, records=${allNormalizedRecords.length}`);
|
||||
const moved = await moveMessagesToResumeStart(page, resumeFromPage);
|
||||
if (!moved) {
|
||||
console.log('[消息续爬] checkpoint 已在最后一页,无需继续抓取');
|
||||
shouldContinueScrape = false;
|
||||
} else if (fullSyncWatermark) {
|
||||
await waitForTableRows(page);
|
||||
const resumedPageData = await extractTable(page);
|
||||
if (shouldStopForFullSyncPage(resumedPageData.rows)) {
|
||||
console.log(`[全量模式] 当前续爬页已超出近三个月范围,停止继续抓取: page=${resumeFromPage + 1}, watermark=${formatDateTime(fullSyncWatermark)}`);
|
||||
shouldContinueScrape = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let records = [];
|
||||
let hotWatermark = null;
|
||||
let stopByExistingPage = false;
|
||||
if (hot && hasDbConfig()) {
|
||||
const latestMessageTime = await getLatestMessageTimeFromDb();
|
||||
const latest = parseDbDateTime(latestMessageTime);
|
||||
@@ -1304,11 +1370,39 @@ async function syncMessages(page, options = {}) {
|
||||
if (shouldContinueScrape) {
|
||||
records = await scrapePagedTable(page, dataset, {}, {
|
||||
onPage: async ({ pageNum, pageRows }) => {
|
||||
const normalizedPageRows = normalizeDatasetRecords(dataset, pageRows, {});
|
||||
allNormalizedRecords.push(...normalizedPageRows);
|
||||
if (hasDbConfig()) {
|
||||
await upsertMessages(normalizedPageRows);
|
||||
const detailedPageRows = await enrichMessageRowsWithDetails(page, pageRows, pageNum);
|
||||
const normalizedPageRows = normalizeDatasetRecords(dataset, detailedPageRows, {});
|
||||
const filteredPageRows = fullSyncWatermark
|
||||
? normalizedPageRows.filter((record) => isAfterLatestMessageTime(record, fullSyncWatermark))
|
||||
: normalizedPageRows;
|
||||
let pageRowsToPersist = filteredPageRows;
|
||||
if (hasDbConfig() && filteredPageRows.length > 0) {
|
||||
const pageMsgIds = filteredPageRows.map((record) => record.msgId).filter(Boolean);
|
||||
const existingIds = await getExistingMessageIds(pageMsgIds);
|
||||
const fingerprintCandidates = filteredPageRows
|
||||
.map((record) => String(record.receivedAt || record.gmtModified || record.gmtCreated || '').trim())
|
||||
.filter(Boolean);
|
||||
const existingFingerprintRows = await getExistingMessageFingerprints(fingerprintCandidates);
|
||||
const existingFingerprints = new Set(
|
||||
existingFingerprintRows.map((row) => buildMessageFingerprint({ title: row.title, receivedAt: row.received_at, orderNo: row.order_no })),
|
||||
);
|
||||
stopByExistingPage = filteredPageRows.length > 0 && filteredPageRows.every((record) => {
|
||||
if (record.msgId) {
|
||||
return existingIds.has(record.msgId);
|
||||
}
|
||||
return existingFingerprints.has(buildMessageFingerprint(record));
|
||||
});
|
||||
pageRowsToPersist = filteredPageRows.filter((record) => {
|
||||
if (record.msgId) {
|
||||
return !existingIds.has(record.msgId);
|
||||
}
|
||||
return !existingFingerprints.has(buildMessageFingerprint(record));
|
||||
});
|
||||
if (pageRowsToPersist.length > 0) {
|
||||
await upsertMessages(pageRowsToPersist);
|
||||
}
|
||||
}
|
||||
allNormalizedRecords.push(...pageRowsToPersist);
|
||||
await saveMessagesCheckpoint(dataset, pageNum, allNormalizedRecords);
|
||||
},
|
||||
skipInitialPage: resumeFromPage > 0,
|
||||
@@ -1317,13 +1411,22 @@ async function syncMessages(page, options = {}) {
|
||||
if (pageNum >= config.hotMessageMaxPagesPerRun) {
|
||||
return true;
|
||||
}
|
||||
if (stopByExistingPage) {
|
||||
return true;
|
||||
}
|
||||
if (!hotWatermark) {
|
||||
return false;
|
||||
}
|
||||
const normalizedPageRows = normalizeDatasetRecords(dataset, pageRows, {});
|
||||
const detailedPageRows = await enrichMessageRowsWithDetails(page, pageRows, pageNum);
|
||||
const normalizedPageRows = normalizeDatasetRecords(dataset, detailedPageRows, {});
|
||||
return normalizedPageRows.length > 0
|
||||
&& normalizedPageRows.every((record) => !isAfterLatestMessageTime(record, hotWatermark));
|
||||
}
|
||||
: fullSyncWatermark
|
||||
? async ({ pageNum, pageRows }) => {
|
||||
const detailedPageRows = await enrichMessageRowsWithDetails(page, pageRows, pageNum);
|
||||
return stopByExistingPage || shouldStopForFullSyncPage(detailedPageRows);
|
||||
}
|
||||
: undefined,
|
||||
});
|
||||
}
|
||||
@@ -1331,6 +1434,11 @@ async function syncMessages(page, options = {}) {
|
||||
if (resumeFromPage === 0) {
|
||||
allNormalizedRecords = normalizeDatasetRecords(dataset, records, {});
|
||||
}
|
||||
if (fullSyncWatermark) {
|
||||
const before = allNormalizedRecords.length;
|
||||
allNormalizedRecords = allNormalizedRecords.filter((record) => isAfterLatestMessageTime(record, fullSyncWatermark));
|
||||
console.log(`[全量模式] 消息仅保留近三个月: ${before} -> ${allNormalizedRecords.length} (watermark=${formatDateTime(fullSyncWatermark)})`);
|
||||
}
|
||||
if ((incremental || hot) && hasDbConfig()) {
|
||||
try {
|
||||
const latestMessageTime = await getLatestMessageTimeFromDb();
|
||||
@@ -1350,7 +1458,8 @@ async function syncMessages(page, options = {}) {
|
||||
}
|
||||
}
|
||||
|
||||
return persistDataset(dataset, dedupeByHash(allNormalizedRecords), {});
|
||||
const previousState = loadCurrentState(dataset.name, dataset.uniqueKey);
|
||||
return persistNormalizedDataset(dataset, dedupeByHash([...(previousState.records || []), ...allNormalizedRecords]));
|
||||
}
|
||||
|
||||
async function saveMessagesCheckpoint(dataset, pageNum, normalizedRecords) {
|
||||
@@ -1629,15 +1738,53 @@ function isAfterLatestConsumptionDate(record, latestConsumptionDate) {
|
||||
function isAfterLatestMessageTime(record, watermarkDate) {
|
||||
const value = String(record['消息修改时间'] || record['修改时间'] || record.gmtModified || record['消息创建时间'] || record['创建时间'] || record.gmtCreated || '').trim();
|
||||
if (!value) {
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
const parsed = parseDbDateTime(value);
|
||||
if (!parsed) {
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
return parsed >= watermarkDate;
|
||||
}
|
||||
|
||||
function extractMessageTime(record) {
|
||||
const value = String(record['消息修改时间'] || record['修改时间'] || record.gmtModified || record['消息创建时间'] || record['创建时间'] || record.gmtCreated || '').trim();
|
||||
if (!value) {
|
||||
return null;
|
||||
}
|
||||
return parseDbDateTime(value);
|
||||
}
|
||||
|
||||
function getMessagePageTimeStats(records) {
|
||||
const parsedTimes = records
|
||||
.map((record) => extractMessageTime(record))
|
||||
.filter(Boolean)
|
||||
.sort((a, b) => a.getTime() - b.getTime());
|
||||
|
||||
if (parsedTimes.length === 0) {
|
||||
return {
|
||||
total: records.length,
|
||||
parsed: 0,
|
||||
earliest: '',
|
||||
latest: '',
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
total: records.length,
|
||||
parsed: parsedTimes.length,
|
||||
earliest: formatDateTime(parsedTimes[0]),
|
||||
latest: formatDateTime(parsedTimes[parsedTimes.length - 1]),
|
||||
};
|
||||
}
|
||||
|
||||
function buildMessageFingerprint(record) {
|
||||
const title = String(record.title || record.detailTitle || record.column_1 || '').trim();
|
||||
const receivedAt = String(record.receivedAt || record.gmtModified || record.gmtCreated || record.column_2 || '').trim();
|
||||
const orderNo = String(record.orderNo || record.refundOrderNo || '').trim();
|
||||
return `${title}__${receivedAt}__${orderNo}`;
|
||||
}
|
||||
|
||||
async function syncOrderDetails(page, cachedOrderIds, options = {}) {
|
||||
await runtimeCheckpoint('同步订单详情');
|
||||
const dataset = datasets.orderDetails;
|
||||
@@ -1806,6 +1953,20 @@ async function scrapePagedTable(page, dataset, context, options = {}) {
|
||||
const visited = new Set();
|
||||
let shouldSkipCurrentPage = skipInitialPage;
|
||||
|
||||
const describeStopReason = (reason) => {
|
||||
if (!reason) {
|
||||
return 'unknown';
|
||||
}
|
||||
const details = [];
|
||||
if (reason.beforePage != null) {
|
||||
details.push(`before=${reason.beforePage}`);
|
||||
}
|
||||
if (reason.afterPage != null) {
|
||||
details.push(`after=${reason.afterPage}`);
|
||||
}
|
||||
return details.length > 0 ? `${reason.code} (${details.join(', ')})` : reason.code;
|
||||
};
|
||||
|
||||
while (true) {
|
||||
await runtimeCheckpoint(`抓取 ${dataset.name} 分页`);
|
||||
await waitForTableRows(page);
|
||||
@@ -1816,9 +1977,9 @@ async function scrapePagedTable(page, dataset, context, options = {}) {
|
||||
if (shouldSkipCurrentPage) {
|
||||
console.log(`[抓取] 跳过 checkpoint 已保存页: ${pageNum}`);
|
||||
shouldSkipCurrentPage = false;
|
||||
const moved = await gotoNextPage(page);
|
||||
const { moved, reason } = await gotoNextPage(page);
|
||||
if (!moved) {
|
||||
console.log(`[抓取] checkpoint 已位于最后一页,停止`);
|
||||
console.log(`[抓取] checkpoint 已停止续爬: ${describeStopReason(reason)}`);
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
@@ -1839,9 +2000,9 @@ async function scrapePagedTable(page, dataset, context, options = {}) {
|
||||
break;
|
||||
}
|
||||
|
||||
const moved = await gotoNextPage(page);
|
||||
const { moved, reason } = await gotoNextPage(page);
|
||||
if (!moved) {
|
||||
console.log(`[抓取] 翻页失败或已到最后一页,停止`);
|
||||
console.log(`[抓取] 停止翻页: ${describeStopReason(reason)}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1875,39 +2036,101 @@ async function extractTable(page) {
|
||||
.replace(/\n\s+/g, '\n')
|
||||
.trim();
|
||||
|
||||
const headerTables = Array.from(document.querySelectorAll('table')).filter((table) => table.querySelectorAll('thead th').length > 1);
|
||||
const headerTable = headerTables.sort((a, b) => b.querySelectorAll('thead th').length - a.querySelectorAll('thead th').length)[0];
|
||||
if (!headerTable) return { headers: [], rows: [] };
|
||||
|
||||
const headers = Array.from(headerTable.querySelectorAll('thead th')).map((cell) => normalize(cell.textContent));
|
||||
const bodyTables = Array.from(document.querySelectorAll('table')).filter((table) => table.querySelectorAll('tbody tr').length > 0);
|
||||
const bodyTable = bodyTables.sort((a, b) => {
|
||||
const aSize = Math.max(...Array.from(a.querySelectorAll('tbody tr')).map((row) => row.querySelectorAll('td').length), 0);
|
||||
const bSize = Math.max(...Array.from(b.querySelectorAll('tbody tr')).map((row) => row.querySelectorAll('td').length), 0);
|
||||
return bSize - aSize;
|
||||
})[0];
|
||||
if (!bodyTable) return { headers, rows: [] };
|
||||
|
||||
const rows = Array.from(bodyTable.querySelectorAll('tbody tr'))
|
||||
.map((row) => Array.from(row.querySelectorAll('td')).map((cell) => normalize(cell.innerText || cell.textContent)))
|
||||
const toRecords = (headers, rows) => rows
|
||||
.map((cells) => cells.map((cell) => normalize(cell)))
|
||||
.filter((cells) => cells.some(Boolean))
|
||||
.map((cells) => {
|
||||
const record = {};
|
||||
headers.forEach((header, index) => {
|
||||
const keys = headers.length ? headers : cells.map((_, index) => `column_${index + 1}`);
|
||||
keys.forEach((header, index) => {
|
||||
record[header || `column_${index + 1}`] = cells[index] || '';
|
||||
});
|
||||
return record;
|
||||
});
|
||||
|
||||
return { headers, rows };
|
||||
const extractFromNativeTables = () => {
|
||||
const headerTables = Array.from(document.querySelectorAll('table')).filter((table) => table.querySelectorAll('thead th').length > 1);
|
||||
const headerTable = headerTables.sort((a, b) => b.querySelectorAll('thead th').length - a.querySelectorAll('thead th').length)[0];
|
||||
const headers = headerTable
|
||||
? Array.from(headerTable.querySelectorAll('thead th')).map((cell) => normalize(cell.textContent))
|
||||
: [];
|
||||
|
||||
const bodyTables = Array.from(document.querySelectorAll('table')).filter((table) => table.querySelectorAll('tbody tr').length > 0);
|
||||
const bodyTable = bodyTables.sort((a, b) => {
|
||||
const aSize = Math.max(...Array.from(a.querySelectorAll('tbody tr')).map((row) => row.querySelectorAll('td').length), 0);
|
||||
const bSize = Math.max(...Array.from(b.querySelectorAll('tbody tr')).map((row) => row.querySelectorAll('td').length), 0);
|
||||
return bSize - aSize;
|
||||
})[0];
|
||||
if (!bodyTable) {
|
||||
return { headers, rows: [] };
|
||||
}
|
||||
|
||||
const rows = Array.from(bodyTable.querySelectorAll('tbody tr'))
|
||||
.map((row) => Array.from(row.querySelectorAll('td')).map((cell) => normalize(cell.innerText || cell.textContent)));
|
||||
return { headers, rows: toRecords(headers, rows) };
|
||||
};
|
||||
|
||||
const extractFromNextTable = () => {
|
||||
const container = document.querySelector('.next-table, .next-table-inner, [class*="next-table"]');
|
||||
if (!container) {
|
||||
return { headers: [], rows: [] };
|
||||
}
|
||||
|
||||
const headers = Array.from(container.querySelectorAll('.next-table-header .next-table-cell, .next-table-header th, [role="columnheader"]'))
|
||||
.map((cell) => normalize(cell.innerText || cell.textContent))
|
||||
.filter(Boolean);
|
||||
|
||||
const rowCandidates = Array.from(container.querySelectorAll('.next-table-body .next-table-row, .next-table-row, [role="row"]'));
|
||||
const rows = rowCandidates
|
||||
.map((row) => {
|
||||
const cells = Array.from(row.querySelectorAll('.next-table-cell, [role="gridcell"], [role="cell"], td'))
|
||||
.map((cell) => normalize(cell.innerText || cell.textContent));
|
||||
return cells;
|
||||
})
|
||||
.filter((cells) => cells.length > 0 && cells.some(Boolean));
|
||||
|
||||
return { headers, rows: toRecords(headers, rows) };
|
||||
};
|
||||
|
||||
const nativeResult = extractFromNativeTables();
|
||||
if (nativeResult.rows.length > 0) {
|
||||
return nativeResult;
|
||||
}
|
||||
|
||||
const nextTableResult = extractFromNextTable();
|
||||
if (nextTableResult.rows.length > 0) {
|
||||
return nextTableResult;
|
||||
}
|
||||
|
||||
return nextTableResult.headers.length > 0 ? nextTableResult : nativeResult;
|
||||
});
|
||||
}
|
||||
|
||||
function isTargetClosedError(error) {
|
||||
const message = String(error?.message || error || '');
|
||||
return message.includes('Target page, context or browser has been closed');
|
||||
}
|
||||
|
||||
function assertPageAvailable(page, label) {
|
||||
if (!page || page.isClosed?.()) {
|
||||
throw new Error(`页面在${label}前已被关闭。请检查是否手动关闭了浏览器,或浏览器是否异常退出,然后重新执行同步。`);
|
||||
}
|
||||
}
|
||||
|
||||
async function waitForTableRows(page) {
|
||||
await runtimeCheckpoint('等待表格加载');
|
||||
assertPageAvailable(page, '等待表格加载');
|
||||
try {
|
||||
await page.waitForFunction(() => document.querySelectorAll('table tbody tr').length > 0, null, { timeout: 120000 });
|
||||
await page.waitForFunction(() => {
|
||||
const nativeRows = document.querySelectorAll('table tbody tr').length;
|
||||
const nextRows = document.querySelectorAll('.next-table-body .next-table-row, .next-table-row, [role="row"]').length;
|
||||
const emptyState = document.querySelector('.next-table-empty, .next-empty, [class*="empty"], [class*="no-data"]');
|
||||
return nativeRows > 0 || nextRows > 0 || Boolean(emptyState);
|
||||
}, null, { timeout: 120000 });
|
||||
} catch (error) {
|
||||
if (isTargetClosedError(error)) {
|
||||
throw new Error('等待消息表格加载时,浏览器页面已被关闭。请勿手动关闭浏览器窗口,并检查浏览器是否异常退出后重试。');
|
||||
}
|
||||
await raiseIfSessionExpired(page, '等待表格加载');
|
||||
throw error;
|
||||
}
|
||||
@@ -1915,12 +2138,21 @@ async function waitForTableRows(page) {
|
||||
}
|
||||
|
||||
async function currentPageNumber(page) {
|
||||
const active = page.locator('.next-pagination-item.next-current');
|
||||
if ((await active.count()) === 0) return 1;
|
||||
return Number.parseInt((await active.first().innerText()).trim(), 10) || 1;
|
||||
assertPageAvailable(page, '读取当前页码');
|
||||
try {
|
||||
const active = page.locator('.next-pagination-item.next-current');
|
||||
if ((await active.count()) === 0) return 1;
|
||||
return Number.parseInt((await active.first().innerText()).trim(), 10) || 1;
|
||||
} catch (error) {
|
||||
if (isTargetClosedError(error)) {
|
||||
throw new Error('读取分页页码时,浏览器页面已被关闭。请勿手动关闭浏览器窗口,并检查浏览器是否异常退出后重试。');
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
async function jumpToPage(page, targetPage) {
|
||||
async function jumpToPage(page, targetPage, options = {}) {
|
||||
const { allowSequentialFallback = true } = options;
|
||||
if (targetPage <= 1) {
|
||||
return true;
|
||||
}
|
||||
@@ -1949,19 +2181,24 @@ async function jumpToPage(page, targetPage) {
|
||||
await sleep(1500);
|
||||
const afterJump = await currentPageNumber(page);
|
||||
if (afterJump === targetPage) {
|
||||
console.log(`[账单续爬] 已跳转到第 ${targetPage} 页`);
|
||||
console.log(`[跳页] 已跳转到第 ${targetPage} 页`);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
console.warn(`[账单续爬] 未找到可用跳页输入框,尝试顺序翻到第 ${targetPage} 页`);
|
||||
if (!allowSequentialFallback) {
|
||||
console.warn(`[跳页] 未找到可用跳页输入框,且当前模式禁止顺序兜底: target=${targetPage}`);
|
||||
return false;
|
||||
}
|
||||
|
||||
console.warn(`[跳页] 未找到可用跳页输入框,尝试顺序翻到第 ${targetPage} 页`);
|
||||
let guard = 0;
|
||||
while (guard < targetPage + 5) {
|
||||
const currentPage = await currentPageNumber(page);
|
||||
if (currentPage >= targetPage) {
|
||||
return currentPage === targetPage;
|
||||
}
|
||||
const moved = await gotoNextPage(page);
|
||||
const { moved } = await gotoNextPage(page);
|
||||
if (!moved) {
|
||||
return false;
|
||||
}
|
||||
@@ -1972,38 +2209,71 @@ async function jumpToPage(page, targetPage) {
|
||||
|
||||
async function gotoNextPage(page) {
|
||||
await runtimeCheckpoint('翻页');
|
||||
assertPageAvailable(page, '翻页');
|
||||
const before = await currentPageNumber(page);
|
||||
|
||||
// 用 Playwright locator 定位"下一页"按钮
|
||||
const nextBtn = page.locator('button.next-pagination-item.next-next');
|
||||
if ((await nextBtn.count()) === 0) {
|
||||
console.log('[翻页] 未找到下一页按钮');
|
||||
return false;
|
||||
try {
|
||||
// 用 Playwright locator 定位"下一页"按钮
|
||||
const nextBtn = page.locator('button.next-pagination-item.next-next');
|
||||
if ((await nextBtn.count()) === 0) {
|
||||
return {
|
||||
moved: false,
|
||||
reason: { code: 'next_button_missing', beforePage: before },
|
||||
};
|
||||
}
|
||||
|
||||
const disabled = (await nextBtn.getAttribute('disabled')) != null;
|
||||
if (disabled) {
|
||||
return {
|
||||
moved: false,
|
||||
reason: { code: 'next_button_disabled', beforePage: before },
|
||||
};
|
||||
}
|
||||
|
||||
// 用 Playwright click(而非 DOM click),确保 React 事件正常触发
|
||||
await nextBtn.click();
|
||||
await sleep(2000);
|
||||
await raiseIfSessionExpired(page, `翻页 ${before} -> next`);
|
||||
|
||||
const after = await currentPageNumber(page);
|
||||
console.log(`[翻页] ${before} -> ${after}`);
|
||||
|
||||
if (before > 1 && after === 1) {
|
||||
throw new Error(`分页从第 ${before} 页异常回退到第 1 页,疑似登录态失效或页面会话已重置。请重新执行 npm run login 后再继续同步。`);
|
||||
}
|
||||
|
||||
if (after < before) {
|
||||
throw new Error(`分页从第 ${before} 页异常回退到第 ${after} 页,疑似登录态失效或页面状态被重置。请重新执行 npm run login 后再继续同步。`);
|
||||
}
|
||||
|
||||
if (before === after) {
|
||||
const fallbackTarget = before + 1;
|
||||
console.warn(`[翻页] next 点击后页码未推进,尝试跳页到 ${fallbackTarget}`);
|
||||
const jumped = await jumpToPage(page, fallbackTarget, { allowSequentialFallback: false });
|
||||
if (jumped) {
|
||||
const afterJump = await currentPageNumber(page);
|
||||
console.log(`[翻页] fallback jump ${before} -> ${afterJump}`);
|
||||
return {
|
||||
moved: true,
|
||||
reason: { code: 'advanced_via_jump', beforePage: before, afterPage: afterJump },
|
||||
};
|
||||
}
|
||||
return {
|
||||
moved: false,
|
||||
reason: { code: 'page_number_not_advanced', beforePage: before, afterPage: after },
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
moved: true,
|
||||
reason: { code: 'advanced', beforePage: before, afterPage: after },
|
||||
};
|
||||
} catch (error) {
|
||||
if (isTargetClosedError(error)) {
|
||||
throw new Error(`翻页到下一页时,浏览器页面在第 ${before} 页之后被关闭。请勿手动关闭浏览器窗口,并检查浏览器是否异常退出后重试。`);
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
|
||||
const disabled = (await nextBtn.getAttribute('disabled')) != null;
|
||||
if (disabled) {
|
||||
console.log('[翻页] 下一页按钮已禁用');
|
||||
return false;
|
||||
}
|
||||
|
||||
// 用 Playwright click(而非 DOM click),确保 React 事件正常触发
|
||||
await nextBtn.click();
|
||||
await sleep(2000);
|
||||
await raiseIfSessionExpired(page, `翻页 ${before} -> next`);
|
||||
|
||||
const after = await currentPageNumber(page);
|
||||
console.log(`[翻页] ${before} -> ${after}`);
|
||||
|
||||
if (before > 1 && after === 1) {
|
||||
throw new Error(`分页从第 ${before} 页异常回退到第 1 页,疑似登录态失效或页面会话已重置。请重新执行 npm run login 后再继续同步。`);
|
||||
}
|
||||
|
||||
if (after < before) {
|
||||
throw new Error(`分页从第 ${before} 页异常回退到第 ${after} 页,疑似登录态失效或页面状态被重置。请重新执行 npm run login 后再继续同步。`);
|
||||
}
|
||||
|
||||
return before !== after;
|
||||
}
|
||||
|
||||
async function trySetPageSize(page, pageSize) {
|
||||
@@ -2455,6 +2725,217 @@ async function waitForStableOrderList(page) {
|
||||
await waitForTableRows(page).catch(() => null);
|
||||
}
|
||||
|
||||
async function clickMessageDetailButton(page, rowText, rowIndex) {
|
||||
const clicked = await page.evaluate(({ rowTextValue, rowIndexValue }) => {
|
||||
const normalize = (value) => String(value || '').replace(/\s+/g, ' ').trim();
|
||||
const target = normalize(rowTextValue);
|
||||
|
||||
const rows = Array.from(document.querySelectorAll('.next-table-row, table tbody tr, [role="row"]'))
|
||||
.filter((row) => normalize(row.innerText || row.textContent || ''));
|
||||
const row = rows[rowIndexValue];
|
||||
if (!row) {
|
||||
return { clicked: false, reason: 'row_not_found', rowCount: rows.length };
|
||||
}
|
||||
|
||||
const rowTextActual = normalize(row.innerText || row.textContent || '');
|
||||
const clickableNodes = Array.from(row.querySelectorAll('button, a, [role="button"], .next-btn-text'));
|
||||
const preferred = clickableNodes.find((node) => {
|
||||
const text = normalize(node.innerText || node.textContent || '');
|
||||
return text && rowTextActual.includes(text);
|
||||
}) || clickableNodes[0];
|
||||
|
||||
if (!preferred) {
|
||||
return { clicked: false, reason: 'clickable_node_not_found', rowTextActual };
|
||||
}
|
||||
|
||||
preferred.scrollIntoView({ block: 'center', inline: 'center', behavior: 'instant' });
|
||||
preferred.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
|
||||
return {
|
||||
clicked: true,
|
||||
buttonText: normalize(preferred.innerText || preferred.textContent || ''),
|
||||
rowTextActual,
|
||||
matched: rowTextActual.includes(target),
|
||||
};
|
||||
}, { rowTextValue: rowText, rowIndexValue: rowIndex }).catch(() => ({ clicked: false, reason: 'evaluate_failed' }));
|
||||
|
||||
if (clicked.clicked) {
|
||||
await sleep(1200);
|
||||
}
|
||||
return clicked;
|
||||
}
|
||||
|
||||
async function waitForMessageDetailDrawer(page) {
|
||||
await page.waitForFunction(() => {
|
||||
const header = document.querySelector('.next-drawer-header');
|
||||
const body = document.querySelector('.next-drawer-body');
|
||||
return !!header && !!body && String(header.textContent || '').includes('消息详情');
|
||||
}, null, { timeout: 15000 });
|
||||
await sleep(600);
|
||||
}
|
||||
|
||||
async function extractMessageDetail(page) {
|
||||
return page.evaluate(() => {
|
||||
const normalize = (value) => String(value || '').replace(/\u00a0/g, ' ').trim();
|
||||
const header = normalize(document.querySelector('.next-drawer-header')?.innerText || '');
|
||||
const body = normalize(document.querySelector('.next-drawer-body')?.innerText || '');
|
||||
const lines = body.split(/\r?\n/).map((line) => normalize(line)).filter(Boolean);
|
||||
const firstLine = lines[0] || '';
|
||||
|
||||
const extract = (label) => {
|
||||
const line = lines.find((item) => item.startsWith(`${label}:`) || item.startsWith(`${label}:`));
|
||||
if (!line) return '';
|
||||
return normalize(line.replace(`${label}:`, '').replace(`${label}:`, ''));
|
||||
};
|
||||
|
||||
const match = (pattern) => {
|
||||
const matched = body.match(pattern);
|
||||
return matched?.[1] ? normalize(matched[1]) : '';
|
||||
};
|
||||
|
||||
const accountIdMatches = Array.from(body.matchAll(/账号ID[::]?(\d{6,})/g)).map((item) => normalize(item[1])).filter(Boolean);
|
||||
|
||||
const classification = (() => {
|
||||
if (/退款/.test(header) || /退款/.test(body)) return 'refund';
|
||||
if (/释放预警/.test(header) || /预计于【.*】释放/.test(body)) return 'release_warning';
|
||||
if (/释放通知/.test(header) || /已释放/.test(body)) return 'release_notice';
|
||||
if (/未支付提醒/.test(header) || /未支付/.test(body)) return 'unpaid_reminder';
|
||||
if (/取消通知/.test(header) || /取消了一笔未支付订单/.test(body)) return 'order_cancel';
|
||||
if (/余额-预警通知/.test(header) || /账户现金余额/.test(body)) return 'balance_warning';
|
||||
if (/关联成功/.test(header) || /关联关系已完成建立/.test(body)) return 'association_success';
|
||||
if (/注册成功/.test(header) || /受邀注册UID/.test(body)) return 'registration_success';
|
||||
if (/变更已超期/.test(header) || /变更申请已超期/.test(body)) return 'change_overdue';
|
||||
return 'general';
|
||||
})();
|
||||
|
||||
const detailContent = lines.filter((line) => !/^(接收时间|客户账号|订单号|退款订单号|订单金额|退款金额|客户下单时间|退款时间|受邀注册UID)[::]/.test(line));
|
||||
|
||||
return {
|
||||
detailTitle: firstLine || header,
|
||||
detailContent: body,
|
||||
receivedAt: extract('接收时间'),
|
||||
customerName: extract('客户账号'),
|
||||
customerNo: extract('客户账号') || match(/贵司的代付(?:关联)?客户【[^/]+\/(\d{6,})】/) || match(/受邀注册UID[::]?(\d{6,})/) || accountIdMatches[0] || '',
|
||||
orderNo: extract('订单号') || extract('退款订单号'),
|
||||
orderAmount: extract('订单金额'),
|
||||
customerOrderTime: extract('客户下单时间'),
|
||||
refundOrderNo: extract('退款订单号'),
|
||||
refundAmount: extract('退款金额'),
|
||||
refundTime: extract('退款时间'),
|
||||
invitedRegisterUid: extract('受邀注册UID') || match(/受邀注册UID[::]?(\d{6,})/),
|
||||
accountIds: accountIdMatches.join(','),
|
||||
messageClassification: classification,
|
||||
status: '未读',
|
||||
title: firstLine || header,
|
||||
content: detailContent.join('\n'),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
async function closeMessageDetailDrawer(page) {
|
||||
const closeButton = page.locator('.next-drawer-close, .next-dialog-close, .next-icon-close').first();
|
||||
if ((await closeButton.count()) > 0) {
|
||||
await closeButton.click().catch(() => null);
|
||||
} else {
|
||||
await page.keyboard.press('Escape').catch(() => null);
|
||||
}
|
||||
await page.waitForFunction(() => !document.querySelector('.next-drawer-header'), null, { timeout: 10000 }).catch(() => null);
|
||||
await sleep(400);
|
||||
}
|
||||
|
||||
async function fetchMessageApiRows(page, pageNum, pageSize) {
|
||||
return page.evaluate(async ({ currentPage, currentPageSize }) => {
|
||||
const response = await fetch(`/api/taskapi/msgbox/queryUserMsg.json?lv2CategoryId=0&pageNo=${currentPage}&pageSize=${currentPageSize}`, {
|
||||
credentials: 'include',
|
||||
});
|
||||
const payload = await response.json();
|
||||
return Array.isArray(payload?.data?.list) ? payload.data.list : [];
|
||||
}, { currentPage: pageNum, currentPageSize: pageSize }).catch(() => []);
|
||||
}
|
||||
|
||||
function stripHtmlTags(value) {
|
||||
return String(value || '')
|
||||
.replace(/<br\s*\/?>(\r?\n)?/gi, '\n')
|
||||
.replace(/<\/div>/gi, '\n')
|
||||
.replace(/<\/p>/gi, '\n')
|
||||
.replace(/<[^>]+>/g, '')
|
||||
.replace(/ /gi, ' ')
|
||||
.replace(/\r/g, '')
|
||||
.split('\n')
|
||||
.map((line) => line.trim())
|
||||
.filter(Boolean)
|
||||
.join('\n');
|
||||
}
|
||||
|
||||
function mapApiMessageRecord(record) {
|
||||
if (!record || typeof record !== 'object') {
|
||||
return null;
|
||||
}
|
||||
return {
|
||||
msgId: String(record.id || '').trim(),
|
||||
title: String(record.title || '').trim(),
|
||||
detailTitle: String(record.title || '').trim(),
|
||||
detailContent: stripHtmlTags(record.htmlContent || record.content || ''),
|
||||
content: stripHtmlTags(record.htmlContent || record.content || ''),
|
||||
fromApp: String(record.fromApp || '').trim(),
|
||||
bizCode: String(record.bizCode || '').trim(),
|
||||
msgChannel: String(record.msgChannel || '').trim(),
|
||||
categoryName: String(record.categoryName || '').trim(),
|
||||
categoryId: String(record.lv3CategoryId || '').trim(),
|
||||
lv1CategoryId: String(record.lv1CategoryId || '').trim(),
|
||||
lv2CategoryId: String(record.lv2CategoryId || '').trim(),
|
||||
lv3CategoryId: String(record.lv3CategoryId || '').trim(),
|
||||
gmtCreated: record.createDate ? formatDateTime(new Date(record.createDate)) : '',
|
||||
gmtModified: record.updateDate ? formatDateTime(new Date(record.updateDate)) : '',
|
||||
status: Number(record.isRead) === 1 ? '已读' : '未读',
|
||||
};
|
||||
}
|
||||
|
||||
async function enrichMessageRowsWithDetails(page, pageRows, pageNum) {
|
||||
const enrichedRows = [];
|
||||
let detailSuccess = 0;
|
||||
let detailFailed = 0;
|
||||
const apiRows = await fetchMessageApiRows(page, pageNum, datasets.messages.pageSize);
|
||||
for (let index = 0; index < pageRows.length; index += 1) {
|
||||
const row = pageRows[index];
|
||||
const rowText = String(row['消息标题'] || row['标题'] || row.title || row.column_1 || '').trim();
|
||||
if (!rowText) {
|
||||
enrichedRows.push(row);
|
||||
continue;
|
||||
}
|
||||
|
||||
const apiDetail = mapApiMessageRecord(apiRows[index]);
|
||||
if (apiDetail?.msgId) {
|
||||
detailSuccess += 1;
|
||||
enrichedRows.push({ ...row, ...apiDetail });
|
||||
continue;
|
||||
}
|
||||
|
||||
const clicked = await clickMessageDetailButton(page, rowText, index);
|
||||
if (!clicked.clicked) {
|
||||
detailFailed += 1;
|
||||
console.warn(`[消息详情] 打开失败: pageRow=${index + 1}, title="${rowText}", reason=${clicked.reason || 'unknown'}`);
|
||||
enrichedRows.push(row);
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
await waitForMessageDetailDrawer(page);
|
||||
const detail = await extractMessageDetail(page);
|
||||
detailSuccess += 1;
|
||||
enrichedRows.push({ ...row, ...detail });
|
||||
} catch (error) {
|
||||
detailFailed += 1;
|
||||
console.warn(`[消息详情] 提取失败: pageRow=${index + 1}, title="${rowText}", error=${error.message}`);
|
||||
enrichedRows.push(row);
|
||||
} finally {
|
||||
await closeMessageDetailDrawer(page);
|
||||
await waitForTableRows(page).catch(() => null);
|
||||
}
|
||||
}
|
||||
console.log(`[消息详情] 本页详情提取: success=${detailSuccess}, failed=${detailFailed}, total=${pageRows.length}`);
|
||||
return enrichedRows;
|
||||
}
|
||||
|
||||
async function restoreOrderWindow(page, windowStart, windowEnd) {
|
||||
await waitUntilReady(page, datasets.orders.heading).catch(() => null);
|
||||
await setDateRange(page, windowStart, windowEnd);
|
||||
@@ -2587,6 +3068,9 @@ async function extractCustomerDetail(page) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!department) {
|
||||
department = extract('所属部门');
|
||||
}
|
||||
|
||||
return {
|
||||
customerAccount: extract('客户账号'),
|
||||
@@ -2595,10 +3079,10 @@ async function extractCustomerDetail(page) {
|
||||
tradeMode: extract('交易模式'),
|
||||
customerSource: extract('客户来源'),
|
||||
realNameStatus: extract('实名状态'),
|
||||
email: extract('邮箱'),
|
||||
email: extract('邮箱') || extract('Email') || extract('电子邮箱'),
|
||||
relationDate: extract('关联日期'),
|
||||
phone: extract('手机号'),
|
||||
remark: extract('备注'),
|
||||
phone: extract('手机号') || extract('手机') || extract('联系电话') || extract('联系手机'),
|
||||
remark: extract('备注') || extract('客户备注'),
|
||||
paymentNoticeStatus: extract('代为支付告知状态'),
|
||||
department,
|
||||
lastMonthPayableTotalCny: extractAmountFromSection(lastMonthSection, '上月应付总金额(CNY)'),
|
||||
|
||||
Reference in New Issue
Block a user