断点爬取
This commit is contained in:
@@ -491,7 +491,27 @@ class APSSyncer:
|
||||
return [cast(JsonDict, record) for record in data_list if isinstance(record, dict)]
|
||||
return []
|
||||
|
||||
def resolve_data_files(self, data_dir: str, sync_target: SyncTarget = SYNC_TARGET_ALL) -> tuple[Path, Path, Path, Path, Path]:
|
||||
def resolve_latest_bills_checkpoint(self, data_dir: str) -> Path:
    """Return the most recently modified bills checkpoint file.

    Checkpoints are looked up in ``<parent of data_dir>/checkpoints/bills``
    and must carry a ``.json`` suffix.

    Args:
        data_dir: Path of the current data directory. Only its parent is
            used, to locate the sibling ``checkpoints/bills`` directory.

    Returns:
        The ``*.json`` regular file with the newest modification time.

    Raises:
        FileNotFoundError: If the checkpoint directory is missing or is not
            a directory, or if it contains no ``*.json`` files.
    """
    checkpoint_root = Path(data_dir).parent / "checkpoints" / "bills"
    # is_dir() is already False for a missing path, so one check covers both.
    if not checkpoint_root.is_dir():
        raise FileNotFoundError(f"Bills checkpoint directory not found: {checkpoint_root}")

    # Keep regular files only: a directory that happens to be named
    # "*.json" must never be handed back as a checkpoint to load.
    candidates = [item for item in checkpoint_root.glob("*.json") if item.is_file()]
    if not candidates:
        raise FileNotFoundError(f"No bills checkpoint file found in: {checkpoint_root}")

    # Only the newest entry is needed, so take the max instead of sorting.
    # The file name breaks mtime ties so the choice does not depend on the
    # arbitrary order in which glob() yields entries.
    return max(candidates, key=lambda item: (item.stat().st_mtime, item.name))
|
||||
|
||||
def resolve_data_files(
|
||||
self,
|
||||
data_dir: str,
|
||||
sync_target: SyncTarget = SYNC_TARGET_ALL,
|
||||
from_checkpoint: bool = False,
|
||||
) -> tuple[Path, Path, Path, Path, Path]:
|
||||
root = Path(data_dir)
|
||||
if not root.exists() or not root.is_dir():
|
||||
raise FileNotFoundError(f"Data directory not found: {root}")
|
||||
@@ -499,9 +519,12 @@ class APSSyncer:
|
||||
customers_file = root / "customers.json"
|
||||
orders_file = root / "orders.json"
|
||||
order_details_file = root / "orderDetails.json"
|
||||
bills_file = root / "bills.json"
|
||||
bills_file = self.resolve_latest_bills_checkpoint(data_dir) if from_checkpoint else root / "bills.json"
|
||||
customer_details_file = root / "customerDetails.json"
|
||||
|
||||
if from_checkpoint and sync_target != SYNC_TARGET_BILLS:
|
||||
raise ValueError("--from-checkpoint 目前仅支持 --sync-target bills")
|
||||
|
||||
required_files_by_target = {
|
||||
SYNC_TARGET_ALL: (customers_file, orders_file, order_details_file, bills_file),
|
||||
SYNC_TARGET_CUSTOMER: (customers_file,),
|
||||
@@ -954,12 +977,19 @@ class APSSyncer:
|
||||
self.stats["bills"] += 1
|
||||
|
||||
# ---- Main sync entry ----
|
||||
def sync_from_json(self, data_dir: str, incremental: bool = False, sync_target: str = SYNC_TARGET_ALL) -> StatsDict:
|
||||
def sync_from_json(
|
||||
self,
|
||||
data_dir: str,
|
||||
incremental: bool = False,
|
||||
sync_target: str = SYNC_TARGET_ALL,
|
||||
from_checkpoint: bool = False,
|
||||
) -> StatsDict:
|
||||
start = datetime.now()
|
||||
normalized_sync_target = normalize_sync_target(sync_target)
|
||||
customers_file, orders_file, order_details_file, bills_file, customer_details_file = self.resolve_data_files(
|
||||
data_dir,
|
||||
normalized_sync_target,
|
||||
from_checkpoint,
|
||||
)
|
||||
logger.info(
|
||||
"Loading source files from %s%s%s",
|
||||
@@ -1129,11 +1159,18 @@ def main() -> None:
|
||||
default=SYNC_TARGET_ALL,
|
||||
help="选择同步对象: all/customer/order/orderdetails/bills",
|
||||
)
|
||||
_ = parser.add_argument(
|
||||
"--from-checkpoint",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="仅对 bills 生效:直接从 data/checkpoints/bills 最新 checkpoint 文件入库",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
data_dir = cast(str, args.dir)
|
||||
incremental = cast(bool, args.incremental)
|
||||
latest_bill_consumption_time = cast(bool, args.latest_bill_consumption_time)
|
||||
sync_target = cast(str, args.sync_target)
|
||||
from_checkpoint = cast(bool, args.from_checkpoint)
|
||||
|
||||
syncer = APSSyncer(db_config=DB_CONFIG)
|
||||
if latest_bill_consumption_time:
|
||||
@@ -1145,7 +1182,7 @@ def main() -> None:
|
||||
return
|
||||
finally:
|
||||
syncer.close()
|
||||
_ = syncer.sync_from_json(data_dir, incremental=incremental, sync_target=sync_target)
|
||||
_ = syncer.sync_from_json(data_dir, incremental=incremental, sync_target=sync_target, from_checkpoint=from_checkpoint)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user