断点爬取

This commit is contained in:
ray
2026-04-27 09:16:07 +08:00
parent 5c1d0f3fad
commit 2e4ce07340
6 changed files with 283 additions and 38 deletions

View File

@@ -491,7 +491,27 @@ class APSSyncer:
return [cast(JsonDict, record) for record in data_list if isinstance(record, dict)]
return []
def resolve_data_files(self, data_dir: str, sync_target: SyncTarget = SYNC_TARGET_ALL) -> tuple[Path, Path, Path, Path, Path]:
def resolve_latest_bills_checkpoint(self, data_dir: str) -> Path:
    """Return the most recently modified bills checkpoint file.

    Checkpoints are looked up in ``<parent of data_dir>/checkpoints/bills``
    and must carry a ``.json`` suffix.

    Args:
        data_dir: Path of the current data directory; its parent is the
            root under which the ``checkpoints/bills`` directory lives.

    Returns:
        Path of the ``*.json`` regular file with the greatest modification
        time in the checkpoint directory.

    Raises:
        FileNotFoundError: If the checkpoint directory is missing (or is not
            a directory), or if it contains no ``*.json`` file.
    """
    checkpoint_root = Path(data_dir).parent / "checkpoints" / "bills"
    # is_dir() is already False for a missing path, so a separate exists()
    # check is not needed.
    if not checkpoint_root.is_dir():
        raise FileNotFoundError(f"Bills checkpoint directory not found: {checkpoint_root}")

    # Only regular files qualify: a directory (or dangling symlink) that
    # happens to match "*.json" must not be picked as the checkpoint.
    # max() selects the newest entry in one pass instead of sorting them all.
    latest = max(
        (entry for entry in checkpoint_root.glob("*.json") if entry.is_file()),
        key=lambda entry: entry.stat().st_mtime,
        default=None,
    )
    if latest is None:
        raise FileNotFoundError(f"No bills checkpoint file found in: {checkpoint_root}")
    return latest
def resolve_data_files(
self,
data_dir: str,
sync_target: SyncTarget = SYNC_TARGET_ALL,
from_checkpoint: bool = False,
) -> tuple[Path, Path, Path, Path, Path]:
root = Path(data_dir)
if not root.exists() or not root.is_dir():
raise FileNotFoundError(f"Data directory not found: {root}")
@@ -499,9 +519,12 @@ class APSSyncer:
customers_file = root / "customers.json"
orders_file = root / "orders.json"
order_details_file = root / "orderDetails.json"
bills_file = root / "bills.json"
bills_file = self.resolve_latest_bills_checkpoint(data_dir) if from_checkpoint else root / "bills.json"
customer_details_file = root / "customerDetails.json"
if from_checkpoint and sync_target != SYNC_TARGET_BILLS:
raise ValueError("--from-checkpoint 目前仅支持 --sync-target bills")
required_files_by_target = {
SYNC_TARGET_ALL: (customers_file, orders_file, order_details_file, bills_file),
SYNC_TARGET_CUSTOMER: (customers_file,),
@@ -954,12 +977,19 @@ class APSSyncer:
self.stats["bills"] += 1
# ---- Main sync entry ----
def sync_from_json(self, data_dir: str, incremental: bool = False, sync_target: str = SYNC_TARGET_ALL) -> StatsDict:
def sync_from_json(
self,
data_dir: str,
incremental: bool = False,
sync_target: str = SYNC_TARGET_ALL,
from_checkpoint: bool = False,
) -> StatsDict:
start = datetime.now()
normalized_sync_target = normalize_sync_target(sync_target)
customers_file, orders_file, order_details_file, bills_file, customer_details_file = self.resolve_data_files(
data_dir,
normalized_sync_target,
from_checkpoint,
)
logger.info(
"Loading source files from %s%s%s",
@@ -1129,11 +1159,18 @@ def main() -> None:
default=SYNC_TARGET_ALL,
help="选择同步对象: all/customer/order/orderdetails/bills",
)
_ = parser.add_argument(
"--from-checkpoint",
action="store_true",
default=False,
help="仅对 bills 生效:直接从 data/checkpoints/bills 最新 checkpoint 文件入库",
)
args = parser.parse_args()
data_dir = cast(str, args.dir)
incremental = cast(bool, args.incremental)
latest_bill_consumption_time = cast(bool, args.latest_bill_consumption_time)
sync_target = cast(str, args.sync_target)
from_checkpoint = cast(bool, args.from_checkpoint)
syncer = APSSyncer(db_config=DB_CONFIG)
if latest_bill_consumption_time:
@@ -1145,7 +1182,7 @@ def main() -> None:
return
finally:
syncer.close()
_ = syncer.sync_from_json(data_dir, incremental=incremental, sync_target=sync_target)
_ = syncer.sync_from_json(data_dir, incremental=incremental, sync_target=sync_target, from_checkpoint=from_checkpoint)
if __name__ == "__main__":