订单详情调整抓取方式
This commit is contained in:
@@ -107,6 +107,7 @@ export const datasets = {
|
|||||||
uniqueKey: (record) => record.orderId || record.__hash,
|
uniqueKey: (record) => record.orderId || record.__hash,
|
||||||
normalize: (record, context) => ({
|
normalize: (record, context) => ({
|
||||||
orderId: record['订单号'] || '',
|
orderId: record['订单号'] || '',
|
||||||
|
listPageNum: context.pageNum || '',
|
||||||
customerAccount: (record['客户账号'] || '').replace(/\s+/g, ''),
|
customerAccount: (record['客户账号'] || '').replace(/\s+/g, ''),
|
||||||
customerCategory: record['客户分类'] || '',
|
customerCategory: record['客户分类'] || '',
|
||||||
orderType: record['订单类型'] || '',
|
orderType: record['订单类型'] || '',
|
||||||
|
|||||||
@@ -1337,31 +1337,45 @@ async function syncOrderDetails(page, cachedOrderIds, options = {}) {
|
|||||||
const dataset = datasets.orderDetails;
|
const dataset = datasets.orderDetails;
|
||||||
const resumeCheckpoint = options.resume ? loadLatestOrderDetailsCheckpoint() : null;
|
const resumeCheckpoint = options.resume ? loadLatestOrderDetailsCheckpoint() : null;
|
||||||
|
|
||||||
// 使用传入的 orderId 列表(在 syncOrders 覆盖 orders.json 之前缓存的)
|
const ordersState = loadCurrentState('orders', datasets.orders.uniqueKey);
|
||||||
const allOrderIds = cachedOrderIds || [];
|
const orderTargets = collectOrderDetailTargets(ordersState.records || [], cachedOrderIds || []);
|
||||||
|
|
||||||
if (allOrderIds.length === 0) {
|
if (orderTargets.length === 0) {
|
||||||
console.log('[订单详情] 本地无订单数据,跳过');
|
console.log('[订单详情] 本地无订单定位数据,跳过');
|
||||||
return persistDataset(dataset, [], {});
|
return persistDataset(dataset, [], {});
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`[订单详情] 共 ${allOrderIds.length} 个订单需要获取详情`);
|
console.log(`[订单详情] 共 ${orderTargets.length} 个订单需要获取详情`);
|
||||||
const allDetails = Array.isArray(resumeCheckpoint?.records) ? resumeCheckpoint.records : [];
|
const allDetails = Array.isArray(resumeCheckpoint?.records) ? resumeCheckpoint.records : [];
|
||||||
const startIndex = Number.parseInt(String(resumeCheckpoint?.currentIndex || 0), 10) || 0;
|
const startIndex = Number.parseInt(String(resumeCheckpoint?.currentIndex || 0), 10) || 0;
|
||||||
if (startIndex > 0) {
|
if (startIndex > 0) {
|
||||||
console.log(`[订单详情续爬] 从 checkpoint 恢复: index=${startIndex}, records=${allDetails.length}`);
|
console.log(`[订单详情续爬] 从 checkpoint 恢复: index=${startIndex}, records=${allDetails.length}`);
|
||||||
}
|
}
|
||||||
const detailBaseUrl = 'https://aps.aliyun.com/?spm=5176.12818093.top-nav.ditem-fx.785716d0LKDpKT#/detail/order/~/costCenter/order/detail/';
|
let currentListPage = 0;
|
||||||
|
|
||||||
for (let index = startIndex; index < allOrderIds.length; index += 1) {
|
await page.goto(datasets.orders.url, { waitUntil: 'domcontentloaded' });
|
||||||
await runtimeCheckpoint(`订单详情 ${index + 1}/${allOrderIds.length}`);
|
await waitUntilReady(page, datasets.orders.heading);
|
||||||
const orderId = allOrderIds[index];
|
await trySetPageSize(page, datasets.orders.pageSize);
|
||||||
console.log(`[订单详情] ${index + 1}/${allOrderIds.length} orderId=${orderId}`);
|
|
||||||
|
|
||||||
// 先跳 about:blank 再跳详情URL(强制 SPA 完整重新加载)
|
for (let index = startIndex; index < orderTargets.length; index += 1) {
|
||||||
await page.goto('about:blank');
|
await runtimeCheckpoint(`订单详情 ${index + 1}/${orderTargets.length}`);
|
||||||
await sleep(300);
|
const target = orderTargets[index];
|
||||||
await page.goto(`${detailBaseUrl}${orderId}?projectId=`, { waitUntil: 'domcontentloaded' });
|
console.log(`[订单详情] ${index + 1}/${orderTargets.length} orderId=${target.orderId} page=${target.pageNum}`);
|
||||||
|
|
||||||
|
if (target.pageNum > 0 && currentListPage !== target.pageNum) {
|
||||||
|
const reached = await jumpToOrderPage(page, target.pageNum);
|
||||||
|
if (!reached) {
|
||||||
|
console.warn(`[订单详情] 无法跳到第 ${target.pageNum} 页,跳过 ${target.orderId}`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
currentListPage = target.pageNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
const clicked = await clickOrderDetailFromListWithRetry(page, target);
|
||||||
|
if (!clicked) {
|
||||||
|
console.warn(`[订单详情] 列表中未找到 orderId=${target.orderId},跳过`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await page.waitForFunction(
|
await page.waitForFunction(
|
||||||
@@ -1371,13 +1385,15 @@ async function syncOrderDetails(page, cachedOrderIds, options = {}) {
|
|||||||
);
|
);
|
||||||
await sleep(1000);
|
await sleep(1000);
|
||||||
} catch {
|
} catch {
|
||||||
console.warn(`[订单详情] ${orderId} 详情页加载超时,跳过`);
|
console.warn(`[订单详情] ${target.orderId} 详情页加载超时,跳过`);
|
||||||
|
await page.goBack({ waitUntil: 'domcontentloaded' }).catch(() => null);
|
||||||
|
await recoverOrderListState(page, currentListPage).catch(() => null);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const detail = await extractOrderDetail(page);
|
const detail = await extractOrderDetail(page);
|
||||||
if (!isValidOrderId(detail.orderId)) {
|
if (!isValidOrderId(detail.orderId)) {
|
||||||
detail.orderId = orderId;
|
detail.orderId = target.orderId;
|
||||||
}
|
}
|
||||||
allDetails.push({ ...detail, __context: {} });
|
allDetails.push({ ...detail, __context: {} });
|
||||||
await saveOrderDetailsCheckpoint(dataset, index + 1, allDetails);
|
await saveOrderDetailsCheckpoint(dataset, index + 1, allDetails);
|
||||||
@@ -1385,6 +1401,10 @@ async function syncOrderDetails(page, cachedOrderIds, options = {}) {
|
|||||||
const normalizedDetail = normalizeDatasetRecords(dataset, [{ ...detail, __context: {} }], {});
|
const normalizedDetail = normalizeDatasetRecords(dataset, [{ ...detail, __context: {} }], {});
|
||||||
await upsertOrderDetails(normalizedDetail);
|
await upsertOrderDetails(normalizedDetail);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
await page.goBack({ waitUntil: 'domcontentloaded' }).catch(() => null);
|
||||||
|
await recoverOrderListState(page, currentListPage).catch(() => null);
|
||||||
|
currentListPage = target.pageNum;
|
||||||
}
|
}
|
||||||
|
|
||||||
return persistDataset(dataset, dedupeByHash(allDetails), {});
|
return persistDataset(dataset, dedupeByHash(allDetails), {});
|
||||||
@@ -1958,6 +1978,28 @@ function collectCustomerDetailTargets(records) {
|
|||||||
return targets.sort((a, b) => a.pageNum - b.pageNum);
|
return targets.sort((a, b) => a.pageNum - b.pageNum);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds the list of order-detail crawl targets from order list records.
 *
 * A record becomes a target only when its trimmed `orderId` is non-empty and
 * accepted by `isValidOrderId`, and its `listPageNum` parses to a positive
 * integer. Duplicate order ids keep their first occurrence.
 *
 * @param {Array<object>} records - Order list records; `orderId` and
 *   `listPageNum` are read from each. Non-array input and null/undefined
 *   entries are ignored instead of throwing.
 * @param {Array<string|number>} [cachedOrderIds=[]] - Optional allow-list.
 *   When it holds at least one non-blank id, only records whose id is in the
 *   list are kept; when it is empty, every valid record is kept.
 * @returns {Array<{orderId: string, pageNum: number}>} Targets sorted by
 *   ascending page number.
 */
function collectOrderDetailTargets(records, cachedOrderIds = []) {
  const allowSet = new Set(
    (Array.isArray(cachedOrderIds) ? cachedOrderIds : [])
      .map((value) => String(value || '').trim())
      .filter(Boolean),
  );
  const targets = [];
  const seen = new Set();

  for (const record of Array.isArray(records) ? records : []) {
    // Tolerate holes in the cached state rather than aborting the whole crawl.
    if (record == null) {
      continue;
    }

    const orderId = String(record.orderId || '').trim();
    const pageNum = Number.parseInt(String(record.listPageNum || 0), 10) || 0;
    if (!orderId || !isValidOrderId(orderId) || pageNum <= 0) {
      continue;
    }
    if (allowSet.size > 0 && !allowSet.has(orderId)) {
      continue;
    }
    if (seen.has(orderId)) {
      continue;
    }

    seen.add(orderId);
    targets.push({ orderId, pageNum });
  }

  // Grouping by page lets the caller walk the list pages in order.
  return targets.sort((a, b) => a.pageNum - b.pageNum);
}
|
||||||
|
|
||||||
async function clickCustomerDetailFromList(page, target) {
|
async function clickCustomerDetailFromList(page, target) {
|
||||||
const clicked = await page.evaluate(({ accountId, loginName }) => {
|
const clicked = await page.evaluate(({ accountId, loginName }) => {
|
||||||
const normalize = (value) => String(value || '').replace(/\s+/g, '').trim();
|
const normalize = (value) => String(value || '').replace(/\s+/g, '').trim();
|
||||||
@@ -2006,6 +2048,54 @@ async function clickCustomerDetailFromListWithRetry(page, target) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Locates the target order's row in the currently rendered list table and
 * clicks the first control in that row whose text contains "详情".
 *
 * Row selection prefers a row that has a cell whose whitespace-stripped text
 * equals the order id, and only then falls back to a substring match on the
 * whole row text. A pure substring match could open a different order whose
 * id merely contains the requested one.
 *
 * @param {object} page - Browser page handle exposing `evaluate(fn, arg)`.
 * @param {{orderId: string}} target - Order to open; only `orderId` is used.
 * @returns {Promise<boolean>} `true` when a click was dispatched, `false` when
 *   the id is blank, the row or its detail control is missing, or the in-page
 *   evaluation failed.
 */
async function clickOrderDetailFromList(page, target) {
  const orderId = String(target?.orderId || '').replace(/\s+/g, '').trim();
  if (!orderId) {
    // An empty needle matches every row ("abc".includes('') is true), which
    // would open an arbitrary order.
    return false;
  }

  // The callback runs inside the page, so it must not close over outer names.
  const clicked = await page.evaluate(({ orderId: wantedId }) => {
    const normalize = (value) => String(value || '').replace(/\s+/g, '').trim();
    const textOf = (node) => normalize(node.innerText || node.textContent || '');
    const rows = Array.from(document.querySelectorAll('table tbody tr'));

    const exactRow = rows.find((row) => Array.from(row.querySelectorAll('td, th'))
      .some((cell) => textOf(cell) === wantedId));
    const targetRow = exactRow || rows.find((row) => textOf(row).includes(wantedId));
    if (!targetRow) {
      return false;
    }

    const detailButton = Array.from(targetRow.querySelectorAll('button, a, span'))
      .find((node) => /详情/.test(String(node.textContent || '').trim()));
    if (!detailButton) {
      return false;
    }

    detailButton.scrollIntoView({ block: 'center', inline: 'center', behavior: 'instant' });
    detailButton.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
    return true;
  }, { orderId }).catch((error) => {
    // Still best-effort, but leave a trace so a broken page is diagnosable.
    console.warn(`[订单详情] 点击详情失败 orderId=${orderId}: ${error?.message || error}`);
    return false;
  });

  if (clicked) {
    // Give the click time to take effect before the caller inspects the page.
    await sleep(1200);
  }
  return clicked;
}
|
||||||
|
|
||||||
|
/**
 * Tries to open an order's detail from the list, first on the page the list
 * is assumed to be showing (`target.pageNum`), then on the previous and the
 * next page, in case rows shifted between pages since the list was cached.
 *
 * If every attempt fails after the list was moved to a neighbouring page, the
 * list is moved back to `target.pageNum` (best effort). Callers that track
 * the current list page then stay in sync with what the browser shows.
 *
 * @param {object} page - Browser page handle forwarded to the list helpers.
 * @param {{orderId: string, pageNum: number}} target - Order to open and the
 *   list page it was recorded on.
 * @returns {Promise<boolean>} `true` as soon as one attempt clicked the detail
 *   control, `false` when all attempts failed.
 */
async function clickOrderDetailFromListWithRetry(page, target) {
  const homePage = target.pageNum;
  // A Set keeps insertion order and drops the duplicate produced for page 1
  // (1, max(1, 0), 2), which would otherwise retry the same page immediately.
  const attempts = [...new Set([homePage, Math.max(1, homePage - 1), homePage + 1])];
  let leftHomePage = false;

  for (const pageNum of attempts) {
    if (pageNum > 0 && pageNum !== homePage) {
      const reached = await jumpToOrderPage(page, pageNum);
      if (!reached) {
        continue;
      }
      leftHomePage = true;
      await waitForStableOrderList(page);
    }

    const clicked = await clickOrderDetailFromList(page, target);
    if (clicked) {
      return true;
    }
  }

  if (leftHomePage && homePage > 0) {
    // Restore the page the caller believes is displayed; failure here must
    // not mask the "not found" result.
    const restored = await jumpToOrderPage(page, homePage).catch(() => false);
    if (restored) {
      await waitForStableOrderList(page);
    }
  }
  return false;
}
|
||||||
|
|
||||||
async function jumpToCustomerPage(page, pageNum) {
|
async function jumpToCustomerPage(page, pageNum) {
|
||||||
const reached = await jumpToPage(page, pageNum);
|
const reached = await jumpToPage(page, pageNum);
|
||||||
if (reached) {
|
if (reached) {
|
||||||
@@ -2014,12 +2104,26 @@ async function jumpToCustomerPage(page, pageNum) {
|
|||||||
return reached;
|
return reached;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Moves the order list to the given page via `jumpToPage` and logs a
 * confirmation line when that helper reports success.
 *
 * @param {object} page - Browser page handle forwarded to `jumpToPage`.
 * @param {number} pageNum - List page to jump to.
 * @returns {Promise<*>} Whatever `jumpToPage` resolved to.
 */
async function jumpToOrderPage(page, pageNum) {
  const didReach = await jumpToPage(page, pageNum);
  if (!didReach) {
    return didReach;
  }

  console.log(`[订单详情] 已跳转到第 ${pageNum} 页`);
  return didReach;
}
|
||||||
|
|
||||||
/**
 * Waits for the customer list table to settle: waits for rows, pauses 600 ms,
 * then waits for rows again. Failures of either row wait are ignored.
 *
 * @param {object} page - Browser page handle forwarded to `waitForTableRows`.
 * @returns {Promise<void>}
 */
async function waitForStableCustomerList(page) {
  // Each wait is best-effort; a rejected wait resolves to null instead.
  const awaitRows = () => waitForTableRows(page).catch(() => null);

  await awaitRows();
  await sleep(600);
  await awaitRows();
}
|
||||||
|
|
||||||
|
/**
 * Waits for the order list table to settle: waits for rows, pauses 600 ms,
 * then waits for rows again. Failures of either row wait are ignored.
 *
 * @param {object} page - Browser page handle forwarded to `waitForTableRows`.
 * @returns {Promise<void>}
 */
async function waitForStableOrderList(page) {
  // Each wait is best-effort; a rejected wait resolves to null instead.
  const awaitRows = () => waitForTableRows(page).catch(() => null);

  await awaitRows();
  await sleep(600);
  await awaitRows();
}
|
||||||
|
|
||||||
async function recoverCustomerListState(page, pageNum) {
|
async function recoverCustomerListState(page, pageNum) {
|
||||||
await waitUntilReady(page, datasets.customers.heading).catch(() => null);
|
await waitUntilReady(page, datasets.customers.heading).catch(() => null);
|
||||||
await trySetPageSize(page, datasets.customers.pageSize).catch(() => null);
|
await trySetPageSize(page, datasets.customers.pageSize).catch(() => null);
|
||||||
@@ -2029,6 +2133,15 @@ async function recoverCustomerListState(page, pageNum) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Brings the order list back to a known state: waits for the list heading,
 * re-applies the configured page size and, for a positive page number, jumps
 * to that page and waits for the table to settle. Every step is best-effort;
 * a failing step is ignored and the next one still runs.
 *
 * @param {object} page - Browser page handle forwarded to the list helpers.
 * @param {number} pageNum - List page to return to; skipped unless > 0.
 * @returns {Promise<void>}
 */
async function recoverOrderListState(page, pageNum) {
  const ignoreFailure = () => null;

  await waitUntilReady(page, datasets.orders.heading).catch(ignoreFailure);
  await trySetPageSize(page, datasets.orders.pageSize).catch(ignoreFailure);

  if (!(pageNum > 0)) {
    return;
  }

  await jumpToOrderPage(page, pageNum).catch(ignoreFailure);
  await waitForStableOrderList(page).catch(ignoreFailure);
}
|
||||||
|
|
||||||
function isValidOrderId(orderId) {
|
function isValidOrderId(orderId) {
|
||||||
const value = String(orderId || '').trim();
|
const value = String(orderId || '').trim();
|
||||||
if (!value) return false;
|
if (!value) return false;
|
||||||
|
|||||||
Reference in New Issue
Block a user