@@ -154,6 +154,37 @@ async function saveStorageState(context) {
console . log ( ` [storageState] 已保存登录态快照: ${ config . storageStateFile } ` ) ;
}
/**
 * Load the most recently modified bills checkpoint, if a usable one exists.
 *
 * Looks in `<config.dataDir>/checkpoints/bills` for regular files whose name
 * ends with `.json`, picks the one with the newest mtime and parses it.
 * The function is best-effort: any problem with the checkpoint results in
 * `null` (the caller then scrapes from scratch) rather than an exception.
 *
 * @returns {object|null} The parsed checkpoint (a plain JSON object), or
 *   `null` when the directory is missing, holds no usable `.json` file, or
 *   the newest file cannot be read / parsed / is not a JSON object.
 */
function loadLatestBillsCheckpoint() {
  const checkpointDir = path.join(config.dataDir, 'checkpoints', 'bills');
  if (!fs.existsSync(checkpointDir)) {
    return null;
  }

  const candidates = fs
    .readdirSync(checkpointDir)
    .filter((fileName) => fileName.endsWith('.json'))
    .flatMap((fileName) => {
      const filePath = path.join(checkpointDir, fileName);
      try {
        const stat = fs.statSync(filePath);
        // Only regular files can hold a checkpoint; a directory that happens
        // to end with ".json" must not shadow a real checkpoint file.
        return stat.isFile() ? [{ fileName, filePath, mtimeMs: stat.mtimeMs }] : [];
      } catch (error) {
        // The entry vanished between readdir and stat, or is a dangling
        // symlink. Skip it instead of aborting the whole sync.
        console.warn(`[账单检查点] 无法读取文件状态,已跳过 ${fileName}: ${error.message}`);
        return [];
      }
    })
    .sort((a, b) => b.mtimeMs - a.mtimeMs);

  if (candidates.length === 0) {
    return null;
  }

  try {
    const latest = JSON.parse(fs.readFileSync(candidates[0].filePath, 'utf-8'));
    // Callers read named fields (month / pageNum / records), so anything that
    // is not a plain JSON object — including arrays — is unusable.
    if (latest === null || typeof latest !== 'object' || Array.isArray(latest)) {
      return null;
    }
    return latest;
  } catch (error) {
    console.warn(` [账单检查点] 读取失败,忽略断点续爬: ${error.message} `);
    return null;
  }
}
async function getPageBodyPreview ( page ) {
return page
. evaluate ( ( ) => document . body ? . innerText ? . substring ( 0 , 500 ) || '(空)' )
@@ -237,7 +268,7 @@ export async function syncAll() {
summary . datasets . orders = await syncOrders ( page ) ;
// syncOrders 完成后,从最新的 orders.json 读取 orderId 列表
const latestOrders = loadCurrentState ( 'orders' ) ;
const latestOrders = loadCurrentState ( 'orders' , datasets . orders . uniqueKey );
const orderIdsForDetail = collectValidOrderIds ( latestOrders . records || [ ] ) ;
summary . datasets . orderDetails = await syncOrderDetails ( page , orderIdsForDetail ) ;
@@ -257,7 +288,7 @@ export async function syncAll() {
}
}
export async function syncBillsOnly ( ) {
export async function syncBillsOnly ( options = { } ) {
const runtimeController = getRuntimeController ( ) ;
runtimeController . bind ( ) ;
const context = await getContext ( ) ;
@@ -266,7 +297,7 @@ export async function syncBillsOnly() {
const summary = { startedAt : new Date ( ) . toISOString ( ) , datasets : { } } ;
const page = context . pages ( ) [ 0 ] || ( await context . newPage ( ) ) ;
summary . datasets . bills = await syncBills ( page ) ;
summary . datasets . bills = await syncBills ( page , options );
summary . finishedAt = new Date ( ) . toISOString ( ) ;
const stamp = nowStamp ( ) ;
@@ -325,7 +356,7 @@ async function syncCustomers(page) {
async function syncCustomerDetails ( page ) {
await runtimeCheckpoint ( '同步客户详情' ) ;
const dataset = datasets . customerDetails ;
const customersState = loadCurrentState ( 'customers' ) ;
const customersState = loadCurrentState ( 'customers' , datasets . customers . uniqueKey );
const allAccountIds = collectValidAccountIds ( customersState . records || [ ] ) ;
if ( allAccountIds . length === 0 ) {
@@ -426,9 +457,10 @@ function normalizeConfiguredDate(value) {
return normalized ;
}
async function syncBills ( page ) {
async function syncBills ( page , options = { } ) {
await runtimeCheckpoint ( '同步账单' ) ;
const dataset = datasets . bills ;
const { resume = false } = options ;
let months ;
let latestConsumptionDate = null ;
@@ -442,7 +474,16 @@ async function syncBills(page) {
console . log ( ` [增量模式] 账单仅查询: ${ incrementalMonth } ${ latestConsumptionDate ? ` , 数据库最新消费时间: ${ latestConsumptionDate } ` : '' } ` ) ;
}
const allRecords = [ ] ;
const resumeCheckpoint = resume ? loadLatestBillsCheckpoint ( ) : null ;
if ( resumeCheckpoint ? . month ) {
const resumeIndex = months . indexOf ( resumeCheckpoint . month ) ;
if ( resumeIndex >= 0 ) {
months = months . slice ( resumeIndex ) ;
console . log ( ` [账单续爬] 从 checkpoint 恢复: month= ${ resumeCheckpoint . month } , page= ${ resumeCheckpoint . pageNum || 1 } , records= ${ ( resumeCheckpoint . records || [ ] ) . length } ` ) ;
}
}
const allNormalizedRecords = [ ] ;
for ( const month of months ) {
await runtimeCheckpoint ( ` 账单月份 ${ month } ` ) ;
@@ -451,30 +492,54 @@ async function syncBills(page) {
await setMonthValue ( page , month ) ;
await clickQuery ( page ) ;
await trySetPageSize ( page , dataset . pageSize ) ;
const monthRecords = [ ] ;
let records = await scrapePagedTable ( page , dataset , { month } , {
onPage : async ( { pageData , pageNum } ) => {
monthRecords . push ( ... pageData . rows . map ( ( row ) => ( { ... row , _ _context : { month } } ) ) ) ;
let checkpointRecords = monthRecords ;
if ( latestConsumptionDate ) {
checkpointRecords = monthRecords . filter ( ( record ) => isAfterLatestConsumptionDate ( record , latestConsumptionDate ) ) ;
let monthNormalizedRecords = [ ] ;
let resumeFromPage = 0 ;
let shouldContinueScrape = true ;
if ( resumeCheckpoint ? . month === month) {
monthNormalizedRecords = Array . isArray ( resumeCheckpoint . records ) ? resumeCheckpoint . records : [ ] ;
resumeFromPage = Number . parseInt ( String ( resumeCheckpoint . pageNum || 0 ) , 10 ) || 0 ;
if ( resumeFromPage > 0 ) {
const moved = await moveBillsToResumeStart ( page , resumeFromPage ) ;
if ( ! moved ) {
console . log ( ` [账单续爬] checkpoint 已在最后一页,无需继续抓取 month= ${ month } ` ) ;
shouldContinueScrape = false ;
}
await saveBillsCheckpoint ( dataset , month , pageNum , checkpointRecords ) ;
} ,
} ) ;
if ( latestConsumptionDate ) {
const before = records . length ;
records = records . filter ( ( record ) => isAfterLatestConsumptionDate ( record , latestConsumptionDate ) ) ;
console . log ( ` [增量模式] 账单按消费时间过滤: ${ before } -> ${ records . length } ` ) ;
}
}
allRecords . push ( ... records ) ;
let rawRecords = [ ] ;
if ( shouldContinueScrape ) {
rawRecords = await scrapePagedTable ( page , dataset , { month } , {
onPage : async ( { pageNum , pageRows } ) => {
const normalizedPageRows = normalizeDatasetRecords ( dataset , pageRows , { month } ) ;
monthNormalizedRecords . push ( ... normalizedPageRows ) ;
let checkpointRecords = monthNormalizedRecords ;
if ( latestConsumptionDate ) {
checkpointRecords = monthNormalizedRecords . filter ( ( record ) => isAfterLatestConsumptionDate ( record , latestConsumptionDate ) ) ;
}
await saveBillsCheckpoint ( dataset , month , pageNum , checkpointRecords ) ;
} ,
} ) ;
}
if ( resumeFromPage === 0 ) {
monthNormalizedRecords = normalizeDatasetRecords ( dataset , rawRecords , { month } ) ;
}
if ( latestConsumptionDate ) {
const before = monthNormalizedRecords . length ;
monthNormalizedRecords = monthNormalizedRecords . filter ( ( record ) => isAfterLatestConsumptionDate ( record , latestConsumptionDate ) ) ;
console . log ( ` [增量模式] 账单按消费时间过滤: ${ before } -> ${ monthNormalizedRecords . length } ` ) ;
}
allNormalizedRecords . push ( ... monthNormalizedRecords ) ;
}
return persistDataset ( dataset , dedupeByHash ( allRecords ), { } ) ;
return persistNormalized Dataset ( dataset , dedupeByHash ( allNormalized Records ) ) ;
}
async function saveBillsCheckpoint ( dataset , month , pageNum , raw Records) {
const normalized = dedupeByHash ( rawRecords . map ( ( record ) => dataset . normalize ( record , record . _ _context || { } ) ) . map ( withHash ) ) ;
async function saveBillsCheckpoint ( dataset , month , pageNum , normalized Records) {
const normalized = dedupeByHash ( normalizedRecords ) ;
const checkpointName = ` ${ month } -latest ` ;
saveCheckpoint ( dataset . name , checkpointName , {
month ,
@@ -488,6 +553,24 @@ async function saveBillsCheckpoint(dataset, month, pageNum, rawRecords) {
console . log ( ` [账单检查点] 已落盘: month= ${ month } , page= ${ pageNum } , records= ${ normalized . length } ` ) ;
}
/**
 * Normalize raw scraped records for a dataset and attach a hash to each one.
 *
 * Each record is passed through `dataset.normalize` and the result through
 * `withHash`. A record that carries its own `__context` is normalized with
 * that context; otherwise the shared `context` argument is used.
 *
 * @param {{ normalize: Function }} dataset - Dataset definition providing `normalize(record, context)`.
 * @param {Array<object>} records - Raw records to normalize.
 * @param {object} context - Fallback context for records without `__context`.
 * @returns {Array<object>} One normalized, hashed record per input record, in input order.
 */
function normalizeDatasetRecords(dataset, records, context) {
  return records.map((record) => withHash(dataset.normalize(record, record.__context || context)));
}
/**
 * Position the bills table on the first page that still needs scraping when
 * resuming from a checkpoint.
 *
 * The checkpoint page itself is already saved, so after jumping to it the
 * pager is advanced by one more page.
 *
 * @param {object} page - Browser page handle passed through to the paging helpers.
 * @param {number} resumeFromPage - Page number recorded in the checkpoint; values <= 0 mean "nothing to skip".
 * @returns {Promise<boolean>} `true` when there is nothing to skip or the pager advanced past the
 *   checkpoint page; otherwise whatever `gotoNextPage` reported (falsy when it could not advance).
 * @throws {Error} When the checkpoint page itself cannot be reached.
 */
async function moveBillsToResumeStart(page, resumeFromPage) {
  const nothingToSkip = resumeFromPage <= 0;
  if (nothingToSkip) {
    return true;
  }

  const onCheckpointPage = await jumpToPage(page, resumeFromPage);
  if (onCheckpointPage) {
    // The checkpoint page is already persisted; step to the page after it.
    return gotoNextPage(page);
  }

  throw new Error(` 账单续爬失败:无法定位到 checkpoint 页码 ${resumeFromPage} `);
}
function getLatestBillConsumptionDate ( ) {
const scriptPath = path . resolve ( config . rootDir , config . dbSyncScript ) ;
try {
@@ -561,9 +644,13 @@ async function syncOrderDetails(page, cachedOrderIds) {
}
/**
 * Normalize raw records for a dataset and persist the result.
 *
 * Thin wrapper: normalization is delegated to `normalizeDatasetRecords` and
 * the diff/save work to `persistNormalizedDataset`, so callers that already
 * hold normalized records can call the latter directly.
 *
 * @param {object} dataset - Dataset definition.
 * @param {Array<object>} records - Raw (not yet normalized) records.
 * @param {object} context - Fallback normalization context for records without their own `__context`.
 * @returns {*} Whatever `persistNormalizedDataset` returns for the normalized records.
 */
function persistDataset(dataset, records, context) {
  const normalized = normalizeDatasetRecords(dataset, records, context);
  return persistNormalizedDataset(dataset, normalized);
}
function persistNormalizedDataset ( dataset , normalizedRecords ) {
const previousState = loadCurrentState ( dataset . name , dataset . uniqueKey ) ;
const nextState = diffRecords ( previousState , normalizedRecords , dataset . uniqueKey ) ;
const stamp = saveDatasetRun ( dataset . name , nextState ) ;
saveDelta ( dataset . name , stamp , nextState . delta ) ;
return {
@@ -629,9 +716,10 @@ async function waitUntilReady(page, heading, timeout = 120000, options = {}) {
}
async function scrapePagedTable ( page , dataset , context , options = { } ) {
const { onPage } = options ;
const { onPage , skipInitialPage = false } = options ;
const pages = [ ] ;
const visited = new Set ( ) ;
let shouldSkipCurrentPage = skipInitialPage ;
while ( true ) {
await runtimeCheckpoint ( ` 抓取 ${ dataset . name } 分页 ` ) ;
@@ -640,6 +728,16 @@ async function scrapePagedTable(page, dataset, context, options = {}) {
const pageNum = await currentPageNumber ( page ) ;
const pageKey = ` ${ pageNum } - ${ pageData . rows . length } ` ;
console . log ( ` [抓取] 第 ${ pageNum } 页, ${ pageData . rows . length } 行, key=" ${ pageKey } " ` ) ;
if ( shouldSkipCurrentPage ) {
console . log ( ` [抓取] 跳过 checkpoint 已保存页: ${ pageNum } ` ) ;
shouldSkipCurrentPage = false ;
const moved = await gotoNextPage ( page ) ;
if ( ! moved ) {
console . log ( ` [抓取] checkpoint 已位于最后一页,停止 ` ) ;
break ;
}
continue ;
}
if ( visited . has ( pageKey ) ) {
console . log ( ` [抓取] 重复页面key, 停止翻页 ` ) ;
break ;
@@ -732,6 +830,56 @@ async function currentPageNumber(page) {
return Number . parseInt ( ( await active . first ( ) . innerText ( ) ) . trim ( ) , 10 ) || 1 ;
}
/**
 * Move the paginated table to `targetPage`.
 *
 * Strategy:
 *   1. Nothing to do when the target is page 1 (or lower) or the pager already
 *      reports the target page.
 *   2. Try each known "jump to page" input selector in turn: focus it, replace
 *      its content with the target number, press Enter and re-read the pager.
 *   3. If no input got us there, click "next" page by page until the target is
 *      reached, the pager stops advancing, or the step budget is used up.
 *
 * Input interactions are best-effort (their errors are ignored); success is
 * judged only by the page number read back afterwards.
 *
 * @param {object} page - Browser page handle (presumably a Playwright Page — confirm against callers).
 * @param {number} targetPage - 1-based page number to reach.
 * @returns {Promise<boolean>} `true` when the pager ends up on `targetPage`
 *   (or the target is <= 1), `false` otherwise.
 */
async function jumpToPage(page, targetPage) {
  if (targetPage <= 1) {
    return true;
  }
  if ((await currentPageNumber(page)) === targetPage) {
    return true;
  }

  const jumpInputSelectors = [
    '.next-pagination-jump-input input',
    'input[aria-label*="页码"]',
    'input[aria-label*="页"]',
  ];

  for (const selector of jumpInputSelectors) {
    const jumpInput = page.locator(selector).first();
    const matchCount = await jumpInput.count();
    if (matchCount !== 0) {
      // Focus the input, select its current content and overwrite it.
      await jumpInput.click().catch(() => null);
      await sleep(100);
      await page.keyboard.press('Control+A').catch(() => null);
      await page.keyboard.type(String(targetPage), { delay: 20 }).catch(() => null);
      await page.keyboard.press('Enter').catch(() => null);
      // Wait before re-reading the pager so the table has time to update.
      await sleep(1500);

      const pageAfterJump = await currentPageNumber(page);
      if (pageAfterJump === targetPage) {
        console.log(` [账单续爬] 已跳转到第 ${ targetPage } 页 `);
        return true;
      }
    }
  }

  console.warn(` [账单续爬] 未找到可用跳页输入框,尝试顺序翻到第 ${ targetPage } 页 `);

  // Sequential fallback, bounded so a pager that never advances cannot loop forever.
  for (let steps = 0; steps < targetPage + 5; steps += 1) {
    const here = await currentPageNumber(page);
    if (here >= targetPage) {
      // Landing past the target counts as failure.
      return here === targetPage;
    }
    const advanced = await gotoNextPage(page);
    if (!advanced) {
      return false;
    }
  }
  return false;
}
async function gotoNextPage ( page ) {
await runtimeCheckpoint ( '翻页' ) ;
const before = await currentPageNumber ( page ) ;