|
|
|
|
@@ -239,6 +239,37 @@ function loadLatestBillsCheckpoint() {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function loadLatestOrdersCheckpoint() {
|
|
|
|
|
const checkpointDir = path.join(config.dataDir, 'checkpoints', 'orders');
|
|
|
|
|
if (!fs.existsSync(checkpointDir)) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const candidates = fs.readdirSync(checkpointDir)
|
|
|
|
|
.filter((fileName) => fileName.endsWith('.json'))
|
|
|
|
|
.map((fileName) => {
|
|
|
|
|
const filePath = path.join(checkpointDir, fileName);
|
|
|
|
|
const stat = fs.statSync(filePath);
|
|
|
|
|
return { fileName, filePath, mtimeMs: stat.mtimeMs };
|
|
|
|
|
})
|
|
|
|
|
.sort((a, b) => b.mtimeMs - a.mtimeMs);
|
|
|
|
|
|
|
|
|
|
if (candidates.length === 0) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
const latest = JSON.parse(fs.readFileSync(candidates[0].filePath, 'utf-8'));
|
|
|
|
|
if (!latest || typeof latest !== 'object') {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
return latest;
|
|
|
|
|
} catch (error) {
|
|
|
|
|
console.warn(`[订单检查点] 读取失败,忽略断点续爬: ${error.message}`);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function subtractDays(dateValue, days) {
|
|
|
|
|
const next = new Date(dateValue);
|
|
|
|
|
next.setDate(next.getDate() - days);
|
|
|
|
|
@@ -393,11 +424,12 @@ export async function login() {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export async function syncAll() {
|
|
|
|
|
export async function syncAll(options = {}) {
|
|
|
|
|
const runtimeController = getRuntimeController();
|
|
|
|
|
runtimeController.bind();
|
|
|
|
|
const context = await getContext();
|
|
|
|
|
let page = null;
|
|
|
|
|
const { resume = false } = options;
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
const summary = { startedAt: new Date().toISOString(), datasets: {} };
|
|
|
|
|
@@ -408,14 +440,14 @@ export async function syncAll() {
|
|
|
|
|
summary.datasets.customerDetails = await syncCustomerDetails(page);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
summary.datasets.orders = await syncOrders(page, { incremental: !config.fullSync });
|
|
|
|
|
summary.datasets.orders = await syncOrders(page, { incremental: !config.fullSync, resume });
|
|
|
|
|
|
|
|
|
|
// syncOrders 完成后,从最新的 orders.json 读取 orderId 列表
|
|
|
|
|
const latestOrders = loadCurrentState('orders', datasets.orders.uniqueKey);
|
|
|
|
|
const orderIdsForDetail = collectValidOrderIds(latestOrders.records || []);
|
|
|
|
|
|
|
|
|
|
summary.datasets.orderDetails = await syncOrderDetails(page, orderIdsForDetail);
|
|
|
|
|
summary.datasets.bills = await syncBills(page, { incremental: !config.fullSync });
|
|
|
|
|
summary.datasets.bills = await syncBills(page, { incremental: !config.fullSync, resume });
|
|
|
|
|
summary.datasets.messages = await syncMessages(page, { incremental: !config.fullSync });
|
|
|
|
|
summary.finishedAt = new Date().toISOString();
|
|
|
|
|
|
|
|
|
|
@@ -614,7 +646,7 @@ async function syncCustomerDetails(page) {
|
|
|
|
|
async function syncOrders(page, options = {}) {
|
|
|
|
|
await runtimeCheckpoint('同步订单');
|
|
|
|
|
const dataset = datasets.orders;
|
|
|
|
|
const { incremental = false } = options;
|
|
|
|
|
const { incremental = false, resume = false } = options;
|
|
|
|
|
let windows;
|
|
|
|
|
|
|
|
|
|
if (!incremental) {
|
|
|
|
|
@@ -623,7 +655,16 @@ async function syncOrders(page, options = {}) {
|
|
|
|
|
windows = await buildIncrementalOrderWindows();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const allRecords = [];
|
|
|
|
|
const resumeCheckpoint = resume ? loadLatestOrdersCheckpoint() : null;
|
|
|
|
|
if (resumeCheckpoint?.windowStart) {
|
|
|
|
|
const resumeIndex = windows.findIndex((window) => window.start === resumeCheckpoint.windowStart && window.end === resumeCheckpoint.windowEnd);
|
|
|
|
|
if (resumeIndex >= 0) {
|
|
|
|
|
windows = windows.slice(resumeIndex);
|
|
|
|
|
console.log(`[订单续爬] 从 checkpoint 恢复: ${resumeCheckpoint.windowStart} ~ ${resumeCheckpoint.windowEnd}, page=${resumeCheckpoint.pageNum || 1}, records=${(resumeCheckpoint.records || []).length}`);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const allNormalizedRecords = [];
|
|
|
|
|
|
|
|
|
|
for (const window of windows) {
|
|
|
|
|
await runtimeCheckpoint(`订单窗口 ${window.start} ~ ${window.end}`);
|
|
|
|
|
@@ -632,15 +673,46 @@ async function syncOrders(page, options = {}) {
|
|
|
|
|
await setDateRange(page, window.start, window.end);
|
|
|
|
|
await clickQuery(page);
|
|
|
|
|
await trySetPageSize(page, dataset.pageSize);
|
|
|
|
|
const records = await scrapePagedTable(page, dataset, window);
|
|
|
|
|
allRecords.push(...records);
|
|
|
|
|
if (hasDbConfig()) {
|
|
|
|
|
const normalizedWindowRecords = dedupeByHash(normalizeDatasetRecords(dataset, records, window));
|
|
|
|
|
await upsertOrders(normalizedWindowRecords);
|
|
|
|
|
let windowNormalizedRecords = [];
|
|
|
|
|
let resumeFromPage = 0;
|
|
|
|
|
let shouldContinueScrape = true;
|
|
|
|
|
if (resumeCheckpoint?.windowStart === window.start && resumeCheckpoint?.windowEnd === window.end) {
|
|
|
|
|
windowNormalizedRecords = Array.isArray(resumeCheckpoint.records) ? resumeCheckpoint.records : [];
|
|
|
|
|
resumeFromPage = Number.parseInt(String(resumeCheckpoint.pageNum || 0), 10) || 0;
|
|
|
|
|
if (resumeFromPage > 0) {
|
|
|
|
|
const moved = await moveOrdersToResumeStart(page, resumeFromPage);
|
|
|
|
|
if (!moved) {
|
|
|
|
|
console.log(`[订单续爬] checkpoint 已在最后一页,无需继续抓取 window=${window.start}~${window.end}`);
|
|
|
|
|
shouldContinueScrape = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let records = [];
|
|
|
|
|
if (shouldContinueScrape) {
|
|
|
|
|
records = await scrapePagedTable(page, dataset, window, {
|
|
|
|
|
onPage: async ({ pageNum, pageRows }) => {
|
|
|
|
|
const normalizedPageRows = normalizeDatasetRecords(dataset, pageRows, window);
|
|
|
|
|
windowNormalizedRecords.push(...normalizedPageRows);
|
|
|
|
|
if (hasDbConfig()) {
|
|
|
|
|
await upsertOrders(normalizedPageRows);
|
|
|
|
|
}
|
|
|
|
|
await saveOrdersCheckpoint(dataset, window, pageNum, windowNormalizedRecords);
|
|
|
|
|
},
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (resumeFromPage === 0) {
|
|
|
|
|
windowNormalizedRecords = normalizeDatasetRecords(dataset, records, window);
|
|
|
|
|
if (hasDbConfig()) {
|
|
|
|
|
await upsertOrders(dedupeByHash(windowNormalizedRecords));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
allNormalizedRecords.push(...windowNormalizedRecords);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return persistDataset(dataset, dedupeByHash(allRecords), {});
|
|
|
|
|
return persistNormalizedDataset(dataset, dedupeByHash(allNormalizedRecords));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function buildIncrementalOrderWindows() {
|
|
|
|
|
@@ -828,6 +900,22 @@ async function saveBillsCheckpoint(dataset, month, pageNum, normalizedRecords) {
|
|
|
|
|
console.log(`[账单检查点] 已落盘: month=${month}, page=${pageNum}, records=${normalized.length}`);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function saveOrdersCheckpoint(dataset, window, pageNum, normalizedRecords) {
|
|
|
|
|
const normalized = dedupeByHash(normalizedRecords);
|
|
|
|
|
const checkpointName = `${window.start}_${window.end}`.replace(/[^0-9_-]/g, '-');
|
|
|
|
|
saveCheckpoint(dataset.name, checkpointName, {
|
|
|
|
|
windowStart: window.start,
|
|
|
|
|
windowEnd: window.end,
|
|
|
|
|
pageNum,
|
|
|
|
|
savedAt: new Date().toISOString(),
|
|
|
|
|
stats: {
|
|
|
|
|
total: normalized.length,
|
|
|
|
|
},
|
|
|
|
|
records: normalized,
|
|
|
|
|
});
|
|
|
|
|
console.log(`[订单检查点] 已落盘: ${window.start} ~ ${window.end}, page=${pageNum}, records=${normalized.length}`);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function normalizeDatasetRecords(dataset, records, context) {
|
|
|
|
|
return records.map((record) => withHash(dataset.normalize(record, record.__context || context)));
|
|
|
|
|
}
|
|
|
|
|
@@ -846,6 +934,20 @@ async function moveBillsToResumeStart(page, resumeFromPage) {
|
|
|
|
|
return moved;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function moveOrdersToResumeStart(page, resumeFromPage) {
|
|
|
|
|
if (resumeFromPage <= 0) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const reached = await jumpToPage(page, resumeFromPage);
|
|
|
|
|
if (!reached) {
|
|
|
|
|
throw new Error(`订单续爬失败:无法定位到 checkpoint 页码 ${resumeFromPage}`);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const moved = await gotoNextPage(page);
|
|
|
|
|
return moved;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function getLatestBillConsumptionDate() {
|
|
|
|
|
if (!hasDbConfig()) {
|
|
|
|
|
console.warn('[增量模式] 未配置数据库连接,无法读取账单水位,回退到当前日期');
|
|
|
|
|
|