订单详情调整抓取方式

This commit is contained in:
ray
2026-05-07 15:01:00 +08:00
parent a06cdc70f1
commit 3a7f91419e
2 changed files with 130 additions and 16 deletions

View File

@@ -107,6 +107,7 @@ export const datasets = {
uniqueKey: (record) => record.orderId || record.__hash,
normalize: (record, context) => ({
orderId: record['订单号'] || '',
listPageNum: context.pageNum || '',
customerAccount: (record['客户账号'] || '').replace(/\s+/g, ''),
customerCategory: record['客户分类'] || '',
orderType: record['订单类型'] || '',

View File

@@ -1337,31 +1337,45 @@ async function syncOrderDetails(page, cachedOrderIds, options = {}) {
const dataset = datasets.orderDetails;
const resumeCheckpoint = options.resume ? loadLatestOrderDetailsCheckpoint() : null;
// 使用传入的 orderId 列表(在 syncOrders 覆盖 orders.json 之前缓存的)
const allOrderIds = cachedOrderIds || [];
const ordersState = loadCurrentState('orders', datasets.orders.uniqueKey);
const orderTargets = collectOrderDetailTargets(ordersState.records || [], cachedOrderIds || []);
if (allOrderIds.length === 0) {
console.log('[订单详情] 本地无订单数据,跳过');
if (orderTargets.length === 0) {
console.log('[订单详情] 本地无订单定位数据,跳过');
return persistDataset(dataset, [], {});
}
console.log(`[订单详情] 共 ${allOrderIds.length} 个订单需要获取详情`);
console.log(`[订单详情] 共 ${orderTargets.length} 个订单需要获取详情`);
const allDetails = Array.isArray(resumeCheckpoint?.records) ? resumeCheckpoint.records : [];
const startIndex = Number.parseInt(String(resumeCheckpoint?.currentIndex || 0), 10) || 0;
if (startIndex > 0) {
console.log(`[订单详情续爬] 从 checkpoint 恢复: index=${startIndex}, records=${allDetails.length}`);
}
const detailBaseUrl = 'https://aps.aliyun.com/?spm=5176.12818093.top-nav.ditem-fx.785716d0LKDpKT#/detail/order/~/costCenter/order/detail/';
let currentListPage = 0;
for (let index = startIndex; index < allOrderIds.length; index += 1) {
await runtimeCheckpoint(`订单详情 ${index + 1}/${allOrderIds.length}`);
const orderId = allOrderIds[index];
console.log(`[订单详情] ${index + 1}/${allOrderIds.length} orderId=${orderId}`);
await page.goto(datasets.orders.url, { waitUntil: 'domcontentloaded' });
await waitUntilReady(page, datasets.orders.heading);
await trySetPageSize(page, datasets.orders.pageSize);
// 先跳 about:blank 再跳详情URL(强制 SPA 完整重新加载)
await page.goto('about:blank');
await sleep(300);
await page.goto(`${detailBaseUrl}${orderId}?projectId=`, { waitUntil: 'domcontentloaded' });
for (let index = startIndex; index < orderTargets.length; index += 1) {
await runtimeCheckpoint(`订单详情 ${index + 1}/${orderTargets.length}`);
const target = orderTargets[index];
console.log(`[订单详情] ${index + 1}/${orderTargets.length} orderId=${target.orderId} page=${target.pageNum}`);
if (target.pageNum > 0 && currentListPage !== target.pageNum) {
const reached = await jumpToOrderPage(page, target.pageNum);
if (!reached) {
console.warn(`[订单详情] 无法跳到第 ${target.pageNum} 页,跳过 ${target.orderId}`);
continue;
}
currentListPage = target.pageNum;
}
const clicked = await clickOrderDetailFromListWithRetry(page, target);
if (!clicked) {
console.warn(`[订单详情] 列表中未找到 orderId=${target.orderId},跳过`);
continue;
}
try {
await page.waitForFunction(
@@ -1371,13 +1385,15 @@ async function syncOrderDetails(page, cachedOrderIds, options = {}) {
);
await sleep(1000);
} catch {
console.warn(`[订单详情] ${orderId} 详情页加载超时,跳过`);
console.warn(`[订单详情] ${target.orderId} 详情页加载超时,跳过`);
await page.goBack({ waitUntil: 'domcontentloaded' }).catch(() => null);
await recoverOrderListState(page, currentListPage).catch(() => null);
continue;
}
const detail = await extractOrderDetail(page);
if (!isValidOrderId(detail.orderId)) {
detail.orderId = orderId;
detail.orderId = target.orderId;
}
allDetails.push({ ...detail, __context: {} });
await saveOrderDetailsCheckpoint(dataset, index + 1, allDetails);
@@ -1385,6 +1401,10 @@ async function syncOrderDetails(page, cachedOrderIds, options = {}) {
const normalizedDetail = normalizeDatasetRecords(dataset, [{ ...detail, __context: {} }], {});
await upsertOrderDetails(normalizedDetail);
}
await page.goBack({ waitUntil: 'domcontentloaded' }).catch(() => null);
await recoverOrderListState(page, currentListPage).catch(() => null);
currentListPage = target.pageNum;
}
return persistDataset(dataset, dedupeByHash(allDetails), {});
@@ -1958,6 +1978,28 @@ function collectCustomerDetailTargets(records) {
return targets.sort((a, b) => a.pageNum - b.pageNum);
}
/**
 * Builds the list of order-detail targets to visit, ordered by list page.
 *
 * A record is kept only when its trimmed `orderId` is accepted by
 * `isValidOrderId` and its `listPageNum` parses to a positive integer.
 * When `cachedOrderIds` holds at least one non-blank id it works as an
 * allow-list; otherwise no id filtering is applied. Only the first record
 * seen for each order id is kept.
 *
 * @param {Iterable<object>|null|undefined} records - Order records; each is
 *   read for `orderId` and `listPageNum`. A missing list and null/undefined
 *   entries are ignored instead of throwing.
 * @param {Array<string|number>|null} [cachedOrderIds=[]] - Optional allow-list
 *   of order ids (compared after string conversion and trimming).
 * @returns {Array<{orderId: string, pageNum: number}>} Targets sorted by
 *   ascending `pageNum`.
 */
function collectOrderDetailTargets(records, cachedOrderIds = []) {
  const allowedIds = new Set(
    (cachedOrderIds || []).map((value) => String(value || '').trim()).filter(Boolean),
  );
  // Map keeps insertion order, so it doubles as the "first occurrence wins" de-duplicator.
  const pageByOrderId = new Map();

  for (const record of records ?? []) {
    if (record == null) {
      continue;
    }
    const orderId = String(record.orderId || '').trim();
    const pageNum = Number.parseInt(String(record.listPageNum || 0), 10) || 0;
    if (!orderId || !isValidOrderId(orderId) || pageNum <= 0) {
      continue;
    }
    if (allowedIds.size > 0 && !allowedIds.has(orderId)) {
      continue;
    }
    if (!pageByOrderId.has(orderId)) {
      pageByOrderId.set(orderId, pageNum);
    }
  }

  return Array.from(pageByOrderId, ([orderId, pageNum]) => ({ orderId, pageNum }))
    .sort((a, b) => a.pageNum - b.pageNum);
}
async function clickCustomerDetailFromList(page, target) {
const clicked = await page.evaluate(({ accountId, loginName }) => {
const normalize = (value) => String(value || '').replace(/\s+/g, '').trim();
@@ -2006,6 +2048,54 @@ async function clickCustomerDetailFromListWithRetry(page, target) {
return false;
}
/**
 * Finds the row for `target.orderId` in the order table currently rendered in
 * the page and dispatches a click on that row's "详情" control.
 *
 * Row lookup prefers a row that has a `td` whose whitespace-stripped text is
 * exactly the order id. Only when no such row exists does it fall back to a
 * substring search over the whole row text. The substring search alone can hit
 * the wrong row, because stripping whitespace fuses neighbouring cells
 * together (e.g. cells "9912" and "3456" read as "99123456").
 *
 * @param {object} page - Browser page handle exposing `evaluate(fn, arg)`.
 * @param {{orderId: string}} target - Target whose `orderId` is looked up.
 * @returns {Promise<boolean>} `true` when a click was dispatched, `false` when
 *   the row or its detail control was not found, the order id is empty, or the
 *   in-page evaluation failed.
 */
async function clickOrderDetailFromList(page, target) {
  const clicked = await page.evaluate(({ orderId }) => {
    // Everything in this callback runs inside the page; it must not close over outer variables.
    const normalize = (value) => String(value || '').replace(/\s+/g, '').trim();
    const wanted = normalize(orderId);
    if (!wanted) {
      // An empty needle would "match" every row and click an arbitrary order.
      return false;
    }
    const readText = (node) => normalize(node.innerText || node.textContent || '');
    const rows = Array.from(document.querySelectorAll('table tbody tr'));
    const hasExactCell = (row) => Array.from(row.querySelectorAll('td'))
      .some((cell) => readText(cell) === wanted);
    const targetRow = rows.find(hasExactCell)
      || rows.find((row) => readText(row).includes(wanted));
    if (!targetRow) {
      return false;
    }
    const detailButton = Array.from(targetRow.querySelectorAll('button, a, span'))
      .find((node) => /详情/.test(String(node.textContent || '').trim()));
    if (!detailButton) {
      return false;
    }
    detailButton.scrollIntoView({ block: 'center', inline: 'center', behavior: 'instant' });
    detailButton.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
    return true;
  }, target).catch((error) => {
    // Still best-effort, but leave a trace so an evaluation failure is
    // distinguishable from "row not on this page".
    console.warn(`[订单详情] 点击详情按钮失败 orderId=${target?.orderId}: ${error?.message || error}`);
    return false;
  });
  if (clicked) {
    // Pause after the click before the caller inspects the page.
    await sleep(1200);
  }
  return clicked;
}
/**
 * Tries to open an order's detail view from the list, probing the page the
 * order was recorded on and then its neighbouring pages.
 *
 * Candidate pages are tried in this order: `target.pageNum`, the previous page
 * (clamped to 1), the next page. The first candidate never triggers a jump —
 * the list is expected to already show `target.pageNum`. When the target page
 * is 1 the clamped "previous" candidate is page 1 again, so it acts as a plain
 * second attempt on the same page.
 *
 * If every attempt fails after the list was moved to a neighbouring page, the
 * list is moved back to `target.pageNum`. The caller tracks the current list
 * page itself and does not re-jump for the next target on the same page, so
 * leaving the list on a neighbour would make it search the wrong page.
 *
 * @param {object} page - Browser page handle.
 * @param {{orderId: string, pageNum: number}} target - Order to open.
 * @returns {Promise<boolean>} `true` once a detail click was dispatched,
 *   `false` when the order was not found on any candidate page.
 */
async function clickOrderDetailFromListWithRetry(page, target) {
  const candidatePages = [target.pageNum, Math.max(1, target.pageNum - 1), target.pageNum + 1];
  let leftTargetPage = false;

  for (const pageNum of candidatePages) {
    if (pageNum > 0 && pageNum !== target.pageNum) {
      // Even a jump that reports failure may have moved the list, so record the attempt itself.
      leftTargetPage = true;
      const reached = await jumpToOrderPage(page, pageNum);
      if (!reached) {
        continue;
      }
      await waitForStableOrderList(page);
    }
    const clicked = await clickOrderDetailFromList(page, target);
    if (clicked) {
      return true;
    }
  }

  if (leftTargetPage && target.pageNum > 0) {
    // Best-effort restore; a failure here must not turn "not found" into a crash.
    try {
      const restored = await jumpToOrderPage(page, target.pageNum);
      if (restored) {
        await waitForStableOrderList(page);
      } else {
        console.warn(`[订单详情] 无法回到第 ${target.pageNum} 页`);
      }
    } catch (error) {
      console.warn(`[订单详情] 回到第 ${target.pageNum} 页失败: ${error?.message || error}`);
    }
  }
  return false;
}
async function jumpToCustomerPage(page, pageNum) {
const reached = await jumpToPage(page, pageNum);
if (reached) {
@@ -2014,12 +2104,26 @@ async function jumpToCustomerPage(page, pageNum) {
return reached;
}
/**
 * Moves the order list to the given page by delegating to `jumpToPage`, and
 * logs the page number when the jump is reported as successful.
 *
 * @param {object} page - Browser page handle, forwarded to `jumpToPage`.
 * @param {number} pageNum - List page to jump to.
 * @returns {Promise<*>} Whatever `jumpToPage` resolved to (truthy on success).
 */
async function jumpToOrderPage(page, pageNum) {
  const arrived = await jumpToPage(page, pageNum);
  if (!arrived) {
    return arrived;
  }
  console.log(`[订单详情] 已跳转到第 ${pageNum}`);
  return arrived;
}
/**
 * Waits for the customer list table twice with a pause in between: calls
 * `waitForTableRows`, then `sleep(600)`, then `waitForTableRows` again.
 * A rejection from either `waitForTableRows` call is ignored.
 *
 * @param {object} page - Browser page handle, forwarded to `waitForTableRows`.
 * @returns {Promise<void>}
 */
async function waitForStableCustomerList(page) {
  const ignoreRejection = () => null;
  const firstPass = waitForTableRows(page).catch(ignoreRejection);
  await firstPass;
  await sleep(600);
  const secondPass = waitForTableRows(page).catch(ignoreRejection);
  await secondPass;
}
/**
 * Waits for the order list table twice with a pause in between: calls
 * `waitForTableRows`, then `sleep(600)`, then `waitForTableRows` again.
 * A rejection from either `waitForTableRows` call is ignored.
 *
 * @param {object} page - Browser page handle, forwarded to `waitForTableRows`.
 * @returns {Promise<void>}
 */
async function waitForStableOrderList(page) {
  // One tolerant probe, reused before and after the pause.
  const probeRows = () => waitForTableRows(page).catch(() => null);
  await probeRows();
  await sleep(600);
  await probeRows();
}
async function recoverCustomerListState(page, pageNum) {
await waitUntilReady(page, datasets.customers.heading).catch(() => null);
await trySetPageSize(page, datasets.customers.pageSize).catch(() => null);
@@ -2029,6 +2133,15 @@ async function recoverCustomerListState(page, pageNum) {
}
}
/**
 * Brings the order list back to a usable state: waits for the orders heading,
 * re-applies the orders page size and, when `pageNum` is positive, jumps to
 * that page and waits for the list to settle. Every step is best-effort — a
 * rejection from any of the helpers is ignored.
 *
 * @param {object} page - Browser page handle, forwarded to every helper.
 * @param {number} pageNum - List page to return to; skipped unless `> 0`.
 * @returns {Promise<void>}
 */
async function recoverOrderListState(page, pageNum) {
  const ignoreFailure = () => null;
  const ordersConfig = datasets.orders;

  await waitUntilReady(page, ordersConfig.heading).catch(ignoreFailure);
  await trySetPageSize(page, ordersConfig.pageSize).catch(ignoreFailure);

  if (!(pageNum > 0)) {
    return;
  }
  await jumpToOrderPage(page, pageNum).catch(ignoreFailure);
  await waitForStableOrderList(page).catch(ignoreFailure);
}
function isValidOrderId(orderId) {
const value = String(orderId || '').trim();
if (!value) return false;