断点爬取
This commit is contained in:
@@ -160,7 +160,7 @@ export const datasets = {
|
|||||||
name: 'bills',
|
name: 'bills',
|
||||||
url: `${config.baseUrl}/#/detail/bill/~/costCenter/bill`,
|
url: `${config.baseUrl}/#/detail/bill/~/costCenter/bill`,
|
||||||
heading: '账单查询',
|
heading: '账单查询',
|
||||||
pageSize: 100,
|
pageSize: 20,
|
||||||
uniqueKey: (record) => record.__hash,
|
uniqueKey: (record) => record.__hash,
|
||||||
normalize: (record, context) => ({
|
normalize: (record, context) => ({
|
||||||
billingMonth: record['账期'] || '',
|
billingMonth: record['账期'] || '',
|
||||||
|
|||||||
@@ -43,6 +43,10 @@ export function saveRunSummary(stamp, summary) {
|
|||||||
writeJson(path.join(config.dataDir, 'runs', `${stamp}.json`), summary);
|
writeJson(path.join(config.dataDir, 'runs', `${stamp}.json`), summary);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function saveCheckpoint(dataset, name, payload) {
|
||||||
|
writeJson(path.join(config.dataDir, 'checkpoints', dataset, `${name}.json`), payload);
|
||||||
|
}
|
||||||
|
|
||||||
export function diffRecords(previousState, records, uniqueKey) {
|
export function diffRecords(previousState, records, uniqueKey) {
|
||||||
const previousIndex = previousState.index || {};
|
const previousIndex = previousState.index || {};
|
||||||
const nextIndex = {};
|
const nextIndex = {};
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import {
|
|||||||
diffRecords,
|
diffRecords,
|
||||||
loadCurrentState,
|
loadCurrentState,
|
||||||
nowStamp,
|
nowStamp,
|
||||||
|
saveCheckpoint,
|
||||||
saveDatasetRun,
|
saveDatasetRun,
|
||||||
saveDelta,
|
saveDelta,
|
||||||
saveRunSummary,
|
saveRunSummary,
|
||||||
@@ -450,7 +451,17 @@ async function syncBills(page) {
|
|||||||
await setMonthValue(page, month);
|
await setMonthValue(page, month);
|
||||||
await clickQuery(page);
|
await clickQuery(page);
|
||||||
await trySetPageSize(page, dataset.pageSize);
|
await trySetPageSize(page, dataset.pageSize);
|
||||||
let records = await scrapePagedTable(page, dataset, { month });
|
const monthRecords = [];
|
||||||
|
let records = await scrapePagedTable(page, dataset, { month }, {
|
||||||
|
onPage: async ({ pageData, pageNum }) => {
|
||||||
|
monthRecords.push(...pageData.rows.map((row) => ({ ...row, __context: { month } })));
|
||||||
|
let checkpointRecords = monthRecords;
|
||||||
|
if (latestConsumptionDate) {
|
||||||
|
checkpointRecords = monthRecords.filter((record) => isAfterLatestConsumptionDate(record, latestConsumptionDate));
|
||||||
|
}
|
||||||
|
await saveBillsCheckpoint(dataset, month, pageNum, checkpointRecords);
|
||||||
|
},
|
||||||
|
});
|
||||||
if (latestConsumptionDate) {
|
if (latestConsumptionDate) {
|
||||||
const before = records.length;
|
const before = records.length;
|
||||||
records = records.filter((record) => isAfterLatestConsumptionDate(record, latestConsumptionDate));
|
records = records.filter((record) => isAfterLatestConsumptionDate(record, latestConsumptionDate));
|
||||||
@@ -462,6 +473,21 @@ async function syncBills(page) {
|
|||||||
return persistDataset(dataset, dedupeByHash(allRecords), {});
|
return persistDataset(dataset, dedupeByHash(allRecords), {});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function saveBillsCheckpoint(dataset, month, pageNum, rawRecords) {
|
||||||
|
const normalized = dedupeByHash(rawRecords.map((record) => dataset.normalize(record, record.__context || {})).map(withHash));
|
||||||
|
const checkpointName = `${month}-latest`;
|
||||||
|
saveCheckpoint(dataset.name, checkpointName, {
|
||||||
|
month,
|
||||||
|
pageNum,
|
||||||
|
savedAt: new Date().toISOString(),
|
||||||
|
stats: {
|
||||||
|
total: normalized.length,
|
||||||
|
},
|
||||||
|
records: normalized,
|
||||||
|
});
|
||||||
|
console.log(`[账单检查点] 已落盘: month=${month}, page=${pageNum}, records=${normalized.length}`);
|
||||||
|
}
|
||||||
|
|
||||||
function getLatestBillConsumptionDate() {
|
function getLatestBillConsumptionDate() {
|
||||||
const scriptPath = path.resolve(config.rootDir, config.dbSyncScript);
|
const scriptPath = path.resolve(config.rootDir, config.dbSyncScript);
|
||||||
try {
|
try {
|
||||||
@@ -602,7 +628,8 @@ async function waitUntilReady(page, heading, timeout = 120000, options = {}) {
|
|||||||
await sleep(1500);
|
await sleep(1500);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapePagedTable(page, dataset, context) {
|
async function scrapePagedTable(page, dataset, context, options = {}) {
|
||||||
|
const { onPage } = options;
|
||||||
const pages = [];
|
const pages = [];
|
||||||
const visited = new Set();
|
const visited = new Set();
|
||||||
|
|
||||||
@@ -618,7 +645,11 @@ async function scrapePagedTable(page, dataset, context) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
visited.add(pageKey);
|
visited.add(pageKey);
|
||||||
pages.push(...pageData.rows.map((row) => ({ ...row, __context: context })));
|
const pageRows = pageData.rows.map((row) => ({ ...row, __context: context }));
|
||||||
|
pages.push(...pageRows);
|
||||||
|
if (onPage) {
|
||||||
|
await onPage({ pageData, pageNum, pageRows });
|
||||||
|
}
|
||||||
|
|
||||||
const moved = await gotoNextPage(page);
|
const moved = await gotoNextPage(page);
|
||||||
if (!moved) {
|
if (!moved) {
|
||||||
@@ -631,6 +662,22 @@ async function scrapePagedTable(page, dataset, context) {
|
|||||||
return pages;
|
return pages;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function raiseIfSessionExpired(page, label) {
|
||||||
|
const { currentUrl, bodyText, isAuthPage } = await detectAuthRedirect(page);
|
||||||
|
if (!isAuthPage) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.error(`[鉴权] ${label} 时检测到登录页/鉴权页: ${currentUrl}`);
|
||||||
|
console.error(`[鉴权] 页面内容前500字: ${bodyText}`);
|
||||||
|
try {
|
||||||
|
await sendLoginAlert(currentUrl);
|
||||||
|
} catch (notifyErr) {
|
||||||
|
console.error('[通知] 发送登录提醒失败:', notifyErr.message);
|
||||||
|
}
|
||||||
|
throw new Error(`运行过程中登录态失效(${label})。请重新执行 npm run login 后再继续同步。`);
|
||||||
|
}
|
||||||
|
|
||||||
async function extractTable(page) {
|
async function extractTable(page) {
|
||||||
return page.evaluate(() => {
|
return page.evaluate(() => {
|
||||||
const normalize = (value) =>
|
const normalize = (value) =>
|
||||||
@@ -670,7 +717,12 @@ async function extractTable(page) {
|
|||||||
|
|
||||||
async function waitForTableRows(page) {
|
async function waitForTableRows(page) {
|
||||||
await runtimeCheckpoint('等待表格加载');
|
await runtimeCheckpoint('等待表格加载');
|
||||||
await page.waitForFunction(() => document.querySelectorAll('table tbody tr').length > 0, null, { timeout: 120000 });
|
try {
|
||||||
|
await page.waitForFunction(() => document.querySelectorAll('table tbody tr').length > 0, null, { timeout: 120000 });
|
||||||
|
} catch (error) {
|
||||||
|
await raiseIfSessionExpired(page, '等待表格加载');
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
await sleep(800);
|
await sleep(800);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -700,9 +752,19 @@ async function gotoNextPage(page) {
|
|||||||
// 用 Playwright click(而非 DOM click),确保 React 事件正常触发
|
// 用 Playwright click(而非 DOM click),确保 React 事件正常触发
|
||||||
await nextBtn.click();
|
await nextBtn.click();
|
||||||
await sleep(2000);
|
await sleep(2000);
|
||||||
|
await raiseIfSessionExpired(page, `翻页 ${before} -> next`);
|
||||||
|
|
||||||
const after = await currentPageNumber(page);
|
const after = await currentPageNumber(page);
|
||||||
console.log(`[翻页] ${before} -> ${after}`);
|
console.log(`[翻页] ${before} -> ${after}`);
|
||||||
|
|
||||||
|
if (before > 1 && after === 1) {
|
||||||
|
throw new Error(`分页从第 ${before} 页异常回退到第 1 页,疑似登录态失效或页面会话已重置。请重新执行 npm run login 后再继续同步。`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (after < before) {
|
||||||
|
throw new Error(`分页从第 ${before} 页异常回退到第 ${after} 页,疑似登录态失效或页面状态被重置。请重新执行 npm run login 后再继续同步。`);
|
||||||
|
}
|
||||||
|
|
||||||
return before !== after;
|
return before !== after;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user