利用龙虾 Claw 自动提取电商订单信息并生成发货清单：高效场景实践

利用龙虾 Claw 自动提取电商订单信息并生成发货清单：高效场景实践
利用龙虾 Claw 自动提取电商订单信息并生成发货清单：高效场景实践

一、背景与需求分析

在电商运营中，订单处理是核心业务环节。无论是淘宝、京东、拼多多还是自建商城，每天都会产生大量订单数据。传统的订单处理方式面临以下痛点：

- 平台分散：多平台订单需要分别登录查看
- 数据格式不一：不同平台导出的订单格式各异
- 人工整理耗时：手动复制粘贴订单信息，效率低
- 发货清单制作繁琐：需要逐条整理生成发货单
- 信息核对困难：地址、商品信息容易出错

龙虾 Claw 结合 OCR 识别和数据处理能力，可以实现电商订单信息的自动提取与发货清单生成，大幅提升电商运营效率。

二、整体解决方案

2.1 技术架构

订单数据源 → 数据采集 → 信息提取 → 数据清洗 → 清单生成 → 打印输出
    ↓          ↓          ↓          ↓          ↓          ↓
平台截图    OCR识别    字段解析    格式统一    模板渲染    PDF/Excel
导出文件    表格解析    商品匹配    地址校验    批量处理    快递对接
API接口     数据抓取    库存关联    异常标记    自动分单    物流推送

2.2 支持的电商平台

| 平台 | 数据来源 | 特殊处理 |
| 淘宝/天猫 | 订单截图、导出Excel | 淘宝订单号格式 |
| 京东 | 订单导出、商家后台 | JD订单号识别 |
| 拼多多 | 商家后台截图 | 拼多多订单格式 |
| 抖音电商 | 订单导出 | 抖音订单特征 |
| 自建商城 | 数据库/API | 自定义字段映射 |

三、环境准备

3.1 依赖安装

# requirements.txt
pdfplumber0.9.0
pandas2.0.0
openpyxl3.1.0
Pillow9.0.0
jinja23.1.0
python-dotenv1.0.0
requests2.28.0
qrcode7.4.0

3.2 项目结构

order_processor/
├── config/
│   ├── settings.py # 全局配置
│   └── platform_rules.yaml # 平台规则配置
├── collectors/
│   ├── screenshot_collector.py # 截图采集
│   └── excel_collector.py # Excel采集
├── extractors/
│   ├── order_extractor.py # 订单提取
│   └── product_extractor.py # 商品提取
├── processors/
│   ├── data_cleaner.py # 数据清洗
│   └── address_parser.py # 地址解析
├── generators/
│   ├── shipping_list.py # 发货清单生成
│   └── label_generator.py # 快递单生成
└── main.py # 主程序入口

四、核心代码实现

4.1 配置管理

# config/settings.py import os from dataclasses import dataclass from typing import Dict, List dataclass class PlatformConfig: 平台配置 name: str order_id_pattern: str fields: Dict[str, str] date_format: str class Settings: 全局配置 # 平台配置 PLATFORMS { taobao: PlatformConfig( name淘宝/天猫, order_id_patternr\d{15,20}, fields{ order_id: 订单编号, buyer: 买家昵称, product: 商品名称, quantity: 数量, price: 单价, total: 实付款, address: 收货地址, phone: 联系电话, status: 订单状态 }, date_format%Y-%m-%d %H:%M:%S ), jd: PlatformConfig( name京东, order_id_patternr\d{11,15}, fields{ order_id: 订单号, buyer: 下单账号, product: 商品, quantity: 购买数量, price: 商品金额, total: 订单金额, address: 收货人地址, phone: 收货人电话, status: 订单状态 }, date_format%Y-%m-%d %H:%M:%S ), pdd: PlatformConfig( name拼多多, order_id_patternr\d{13,18}, fields{ order_id: 订单编号, buyer: 买家信息, product: 商品信息, quantity: 商品数量, price: 
商品单价, total: 订单金额, address: 收货地址, phone: 手机号, status: 订单状态 }, date_format%Y-%m-%d %H:%M:%S ) } # 地址解析规则 ADDRESS_PATTERNS { province: r(北京|天津|上海|重庆|河北|山西|辽宁|吉林|黑龙江|江苏|浙江|安徽|福建|江西|山东|河南|湖北|湖南|广东|海南|四川|贵州|云南|陕西|甘肃|青海|台湾|内蒙古|广西|西藏|宁夏|新疆|香港|澳门), city: r([\u4e00-\u9fa5]{2,}(?:市|地区|自治州)), district: r([\u4e00-\u9fa5]{2,}(?:区|县|市)), detail: r([\u4e00-\u9fa5\d\-#号路街道小区楼栋单元室]) } # 输出配置 OUTPUT_DIR os.getenv(OUTPUT_DIR, ./output) # 快递配置 EXPRESS_COMPANIES { SF: 顺丰速运, YTO: 圆通速递, ZTO: 中通快递, STO: 申通快递, YD: 韵达快递, EMS: EMS, JD: 京东物流 }4.2 订单信息提取器# extractors/order_extractor.py import re from typing import Dict, List, Optional from dataclasses import dataclass from datetime import datetime dataclass class Order: 订单信息 order_id: str platform: str buyer: str products: List[Dict] total_amount: float address: str phone: str status: str create_time: str remarks: str class OrderExtractor: 订单信息提取器 def __init__(self, platform: str taobao): from config.settings import Settings self.platform_config Settings.PLATFORMS.get(platform) self.platform platform def extract_from_text(self, text: str) - List[Order]: 从文本提取订单信息 orders [] # 分割订单块 order_blocks self._split_order_blocks(text) for block in order_blocks: order self._parse_order_block(block) if order: orders.append(order) return orders def extract_from_table(self, table_data: List[List[str]]) - List[Order]: 从表格数据提取订单 orders [] if not table_data: return orders # 获取表头 headers table_data[0] col_map self._create_column_map(headers) # 解析数据行 for row in table_data[1:]: order self._parse_table_row(row, col_map) if order: orders.append(order) return orders def _split_order_blocks(self, text: str) - List[str]: 分割订单块 # 使用订单号作为分割标识 pattern self.platform_config.order_id_pattern matches list(re.finditer(pattern, text)) blocks [] for i, match in enumerate(matches): start match.start() end matches[i 1].start() if i 1 len(matches) else len(text) blocks.append(text[start:end]) return blocks def _parse_order_block(self, block: str) - Optional[Order]: 解析订单块 try: # 
提取订单号 order_id re.search(self.platform_config.order_id_pattern, block) if not order_id: return None order_id order_id.group() # 提取其他字段 fields self.platform_config.fields buyer self._extract_field(block, fields.get(buyer, )) address self._extract_field(block, fields.get(address, )) phone self._extract_phone(block) total self._extract_amount(block, fields.get(total, )) status self._extract_field(block, fields.get(status, )) # 提取商品信息 products self._extract_products(block) return Order( order_idorder_id, platformself.platform, buyerbuyer, productsproducts, total_amounttotal, addressaddress, phonephone, statusstatus, create_time, remarks ) except Exception as e: print(f解析订单失败: {e}) return None def _create_column_map(self, headers: List[str]) - Dict[str, int]: 创建列映射 col_map {} fields self.platform_config.fields for i, header in enumerate(headers): header_lower str(header).lower().strip() for field_name, field_label in fields.items(): if field_label in header_lower or field_name in header_lower: col_map[field_name] i break return col_map def _parse_table_row(self, row: List[str], col_map: Dict) - Optional[Order]: 解析表格行 try: order_id row[col_map.get(order_id, 0)] if col_map.get(order_id, 0) len(row) else if not order_id or not re.match(self.platform_config.order_id_pattern, str(order_id)): return None buyer row[col_map.get(buyer, 1)] if col_map.get(buyer, 1) len(row) else address row[col_map.get(address, 6)] if col_map.get(address, 6) len(row) else phone row[col_map.get(phone, 7)] if col_map.get(phone, 7) len(row) else total self._parse_amount(row[col_map.get(total, 5)] if col_map.get(total, 5) len(row) else 0) status row[col_map.get(status, 8)] if col_map.get(status, 8) len(row) else # 商品信息 product_name row[col_map.get(product, 2)] if col_map.get(product, 2) len(row) else quantity self._parse_amount(row[col_map.get(quantity, 3)] if col_map.get(quantity, 3) len(row) else 1) price self._parse_amount(row[col_map.get(price, 4)] if col_map.get(price, 4) len(row) else 0) 
products [{ name: product_name, quantity: int(quantity), price: price }] return Order( order_idstr(order_id), platformself.platform, buyerstr(buyer), productsproducts, total_amounttotal, addressstr(address), phonestr(phone), statusstr(status), create_time, remarks ) except Exception as e: return None def _extract_field(self, text: str, field_label: str) - str: 提取字段值 if not field_label: return # 尝试多种模式 patterns [ rf{field_label}[:]\s*([^\n]), rf{field_label}\s*[:]\s*([^\n]), rf{field_label}[:]?\s*([^\n]?)(?\s{field_label}|$) ] for pattern in patterns: match re.search(pattern, text) if match: return match.group(1).strip() return def _extract_phone(self, text: str) - str: 提取电话号码 # 手机号 match re.search(r1[3-9]\d{9}, text) if match: return match.group() # 固定电话 match re.search(r\d{3,4}-\d{7,8}, text) if match: return match.group() return def _extract_amount(self, text: str, field_label: str) - float: 提取金额 value self._extract_field(text, field_label) return self._parse_amount(value) def _parse_amount(self, value: str) - float: 解析金额 if not value: return 0.0 # 移除货币符号 value re.sub(r[¥$€,\s], , str(value)) try: return float(value) except ValueError: return 0.0 def _extract_products(self, text: str) - List[Dict]: 提取商品信息 products [] # 简单的商品提取逻辑 # 实际应用中需要更复杂的解析 # 查找商品名称模式 product_patterns [ r商品[名称]*[:]\s*([^\n]), r货品[:]\s*([^\n]) ] for pattern in product_patterns: matches re.findall(pattern, text) for match in matches: products.append({ name: match.strip(), quantity: 1, price: 0.0 }) return products if products else [{name: , quantity: 1, price: 0.0}] # processors/address_parser.py import re from typing import Dict, Optional from dataclasses import dataclass dataclass class ParsedAddress: 解析后的地址 province: str city: str district: str detail: str receiver: str phone: str full_address: str class AddressParser: 地址解析器 def __init__(self): from config.settings import Settings self.patterns Settings.ADDRESS_PATTERNS def parse(self, address_text: str) - ParsedAddress: 解析地址 # 提取省份 province 
self._extract_province(address_text) # 提取城市 city self._extract_city(address_text) # 提取区县 district self._extract_district(address_text) # 提取详细地址 detail self._extract_detail(address_text, province, city, district) # 提取收件人和电话 receiver, phone self._extract_receiver_info(address_text) return ParsedAddress( provinceprovince, citycity, districtdistrict, detaildetail, receiverreceiver, phonephone, full_addressaddress_text ) def _extract_province(self, text: str) - str: 提取省份 match re.search(self.patterns[province], text) return match.group(1) if match else def _extract_city(self, text: str) - str: 提取城市 match re.search(self.patterns[city], text) return match.group(1) if match else def _extract_district(self, text: str) - str: 提取区县 match re.search(self.patterns[district], text) return match.group(1) if match else def _extract_detail(self, text: str, province: str, city: str, district: str) - str: 提取详细地址 # 移除已提取的部分 detail text for part in [province, city, district]: if part: detail detail.replace(part, , 1) # 清理 detail re.sub(r[\s,], , detail).strip() # 移除收件人信息 detail re.sub(r[(].*?[)], , detail) detail re.sub(r1[3-9]\d{9}, , detail) detail re.sub(r\d{3,4}-\d{7,8}, , detail) return detail.strip() def _extract_receiver_info(self, text: str) - tuple: 提取收件人信息 # 提取电话 phone_match re.search(r1[3-9]\d{9}, text) phone phone_match.group() if phone_match else # 提取收件人通常在电话前面 receiver if phone: before_phone text[:phone_match.start()] # 查找最后一个中文名字 name_match re.search(r([\u4e00-\u9fa5]{2,4})\s*$, before_phone) if name_match: receiver name_match.group(1) return receiver, phone def validate(self, address: ParsedAddress) - Dict: 验证地址完整性 issues [] if not address.province: issues.append(缺少省份信息) if not address.city: issues.append(缺少城市信息) if not address.detail: issues.append(缺少详细地址) if not address.phone: issues.append(缺少联系电话) if not address.receiver: issues.append(缺少收件人姓名) return { valid: len(issues) 0, issues: issues }4.3 发货清单生成器# generators/shipping_list.py import pandas as pd from openpyxl 
import Workbook from openpyxl.styles import Font, Alignment, Border, Side, PatternFill from openpyxl.utils import get_column_letter from typing import List from datetime import datetime from pathlib import Path from jinja2 import Template class ShippingListGenerator: 发货清单生成器 def __init__(self, output_dir: str ./output): self.output_dir Path(output_dir) self.output_dir.mkdir(parentsTrue, exist_okTrue) def generate_excel(self, orders: List, filename: str None) - str: 生成Excel发货清单 wb Workbook() ws wb.active ws.title 发货清单 # 标题 ws.merge_cells(A1:K1) ws[A1] f发货清单 - {datetime.now().strftime(%Y-%m-%d)} ws[A1].font Font(size16, boldTrue) ws[A1].alignment Alignment(horizontalcenter) # 表头 headers [序号, 订单号, 平台, 买家, 商品名称, 数量, 金额, 收件人, 电话, 地址, 备注] for col, header in enumerate(headers, 1): cell ws.cell(row3, columncol, valueheader) cell.font Font(boldTrue, colorFFFFFF) cell.fill PatternFill(start_color4472C4, end_color4472C4, fill_typesolid) cell.alignment Alignment(horizontalcenter) # 数据 for row_idx, order in enumerate(orders, 4): ws.cell(rowrow_idx, column1, valuerow_idx - 3) ws.cell(rowrow_idx, column2, valueorder.order_id) ws.cell(rowrow_idx, column3, valueorder.platform) ws.cell(rowrow_idx, column4, valueorder.buyer) # 商品信息 product_names ; .join([p[name] for p in order.products if p.get(name)]) quantities sum([p.get(quantity, 1) for p in order.products]) ws.cell(rowrow_idx, column5, valueproduct_names) ws.cell(rowrow_idx, column6, valuequantities) ws.cell(rowrow_idx, column7, valueorder.total_amount) ws.cell(rowrow_idx, column8, value) # 收件人需要从地址解析 ws.cell(rowrow_idx, column9, valueorder.phone) ws.cell(rowrow_idx, column10, valueorder.address) ws.cell(rowrow_idx, column11, valueorder.remarks) # 设置样式 self._apply_style(ws, len(orders) 3, len(headers)) # 保存 if not filename: filename f发货清单_{datetime.now().strftime(%Y%m%d_%H%M%S)}.xlsx output_path self.output_dir / filename wb.save(output_path) return str(output_path) def generate_pdf(self, orders: List, filename: str None) - str: 
生成PDF发货清单 # 使用HTML模板生成PDF html_content self._render_html(orders) # 这里可以使用wkhtmltopdf或其他工具转换 # 简化处理保存为HTML if not filename: filename f发货清单_{datetime.now().strftime(%Y%m%d_%H%M%S)}.html output_path self.output_dir / filename with open(output_path, w, encodingutf-8) as f: f.write(html_content) return str(output_path) def _render_html(self, orders: List) - str: 渲染HTML模板 template_str !DOCTYPE html html head meta charsetutf-8 title发货清单/title style body { font-family: Arial, sans-serif; padding: 20px; } h1 { text-align: center; } table { width: 100%; border-collapse: collapse; margin-top: 20px; } th, td { border: 1px solid #ddd; padding: 8px; text-align: left; } th { background-color: #4472C4; color: white; } tr:nth-child(even) { background-color: #f2f2f2; } .summary { margin-top: 20px; font-weight: bold; } /style /head body h1发货清单/h1 p生成时间{{ date }}/p table tr th序号/th th订单号/th th平台/th th买家/th th商品/th th数量/th th金额/th th电话/th th地址/th /tr {% for order in orders %} tr td{{ loop.index }}/td td{{ order.order_id }}/td td{{ order.platform }}/td td{{ order.buyer }}/td td{{ order.products | map(attributename) | join(; ) }}/td td{{ order.products | sum(attributequantity) }}/td td{{ order.total_amount }}/td td{{ order.phone }}/td td{{ order.address }}/td /tr {% endfor %} /table div classsummary p订单总数{{ orders | length }} 单/p p商品总数{{ total_quantity }} 件/p p金额合计¥{{ total_amount | round(2) }}/p /div /body /html template Template(template_str) total_quantity sum(sum(p.get(quantity, 1) for p in o.products) for o in orders) total_amount sum(o.total_amount for o in orders) return template.render( ordersorders, datedatetime.now().strftime(%Y-%m-%d %H:%M:%S), total_quantitytotal_quantity, total_amounttotal_amount ) def _apply_style(self, ws, rows: int, cols: int): 应用样式 thin_border Border( leftSide(stylethin), rightSide(stylethin), topSide(stylethin), bottomSide(stylethin) ) for row in range(3, rows 1): for col in range(1, cols 1): cell ws.cell(rowrow, columncol) cell.border thin_border 
cell.alignment Alignment( horizontalcenter, verticalcenter, wrap_textTrue ) # 设置列宽 widths [6, 18, 8, 12, 25, 6, 10, 10, 12, 30, 15] for i, width in enumerate(widths): ws.column_dimensions[get_column_letter(i 1)].width width # generators/label_generator.py import qrcode from PIL import Image, ImageDraw, ImageFont from typing import List from pathlib import Path from datetime import datetime class ExpressLabelGenerator: 快递单标签生成器 def __init__(self, output_dir: str ./output): self.output_dir Path(output_dir) self.output_dir.mkdir(parentsTrue, exist_okTrue) def generate_labels(self, orders: List, express_company: str SF) - List[str]: 生成快递单标签 labels [] for order in orders: label_path self._generate_single_label(order, express_company) labels.append(label_path) return labels def _generate_single_label(self, order, express_company: str) - str: 生成单个快递单标签 # 创建标签图像 width, height 400, 300 image Image.new(RGB, (width, height), white) draw ImageDraw.Draw(image) # 尝试加载字体 try: font_large ImageFont.truetype(/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf, 16) font_small ImageFont.truetype(/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf, 12) except: font_large ImageFont.load_default() font_small ImageFont.load_default() # 绘制边框 draw.rectangle([5, 5, width-5, height-5], outlineblack, width2) # 快递公司 draw.text((20, 15), f快递公司: {express_company}, fillblack, fontfont_large) # 收件人信息 y 50 draw.text((20, y), f收件人: {order.buyer}, fillblack, fontfont_small) y 25 draw.text((20, y), f电话: {order.phone}, fillblack, fontfont_small) y 25 # 地址换行处理 address order.address if len(address) 30: address address[:30] \n address[30:] draw.text((20, y), f地址: {address}, fillblack, fontfont_small) # 订单号 y 180 draw.text((20, y), f订单号: {order.order_id}, fillblack, fontfont_small) # 生成二维码 qr qrcode.QRCode(version1, box_size3, border1) qr.add_data(order.order_id) qr.make(fitTrue) qr_img qr.make_image(fill_colorblack, back_colorwhite) # 粘贴二维码 qr_pos (width - 100, height - 100) image.paste(qr_img, qr_pos) # 保存 
filename flabel_{order.order_id}.png output_path self.output_dir / filename image.save(output_path) return str(output_path)4.4 Claw 工作流配置# claw_workflows/order_processor.yaml name: 电商订单自动处理 version: 1.0 description: 自动提取电商订单信息并生成发货清单 triggers: - type: file_watcher path: ./orders/input extensions: [.xlsx, .xls, .csv, .png, .jpg] - type: schedule cron: 0 9,15,18 * * * # 每天9点、15点、18点 - type: manual variables: input_dir: ./orders/input output_dir: ./orders/output steps: - name: 扫描订单文件 action: scan_files config: path: ${input_dir} extensions: [.xlsx, .xls, .csv, .png, .jpg] output: order_files - name: 识别平台类型 action: detect_platform config: files: ${order_files} output: platform_info - name: 提取订单信息 action: extract_orders config: files: ${order_files} platform_info: ${platform_info} output: orders - name: 解析地址信息 action: parse_address config: orders: ${orders} output: parsed_orders - name: 数据校验 action: validate_orders config: orders: ${parsed_orders} check_address: true check_phone: true output: validated_orders - name: 生成发货清单 action: generate_shipping_list config: orders: ${validated_orders} output_dir: ${output_dir} format: [excel, pdf] output: shipping_list - name: 生成快递标签 action: generate_labels config: orders: ${validated_orders} express_company: SF output_dir: ${output_dir} output: labels - name: 发送通知 action: notify config: type: email recipients: [warehousecompany.com] subject: 发货清单已生成 body: 共${validated_orders.count}个订单待发货 attachments: [${shipping_list.excel}]五、实战运行示例5.1 主程序入口# main.py import argparse from pathlib import Path from datetime import datetime from extractors.order_extractor import OrderExtractor from processors.address_parser import AddressParser from generators.shipping_list import ShippingListGenerator from generators.label_generator import ExpressLabelGenerator def main(): parser argparse.ArgumentParser(description龙虾Claw订单处理系统) parser.add_argument(--input, -i, requiredTrue, help输入文件或目录) parser.add_argument(--output, -o, default./output, help输出目录) 
parser.add_argument(--platform, -p, defaulttaobao, help电商平台) parser.add_argument(--labels, -l, actionstore_true, help生成快递标签) args parser.parse_args() print( 龙虾Claw订单处理系统 ) print(f输入路径: {args.input}) print(f平台: {args.platform}) print(- * 40) # 初始化组件 extractor OrderExtractor(platformargs.platform) address_parser AddressParser() list_generator ShippingListGenerator(args.output) label_generator ExpressLabelGenerator(args.output) # 收集文件 input_path Path(args.input) files [] if input_path.is_file(): files [input_path] elif input_path.is_dir(): for ext in [*.xlsx, *.xls, *.csv]: files.extend(input_path.glob(ext)) print(f\n发现 {len(files)} 个订单文件) # 提取订单 all_orders [] for file_path in files: print(f\n处理: {file_path.name}) try: if file_path.suffix.lower() in [.xlsx, .xls]: import pandas as pd df pd.read_excel(file_path) table_data [df.columns.tolist()] df.values.tolist() orders extractor.extract_from_table(table_data) else: with open(file_path, r, encodingutf-8) as f: text f.read() orders extractor.extract_from_text(text) all_orders.extend(orders) print(f - 提取 {len(orders)} 个订单) except Exception as e: print(f - 处理失败: {e}) # 解析地址 print(f\n解析地址信息...) for order in all_orders: if order.address: parsed address_parser.parse(order.address) # 更新地址信息 # 生成发货清单 print(f\n生成发货清单...) list_path list_generator.generate_excel(all_orders) print(f清单已生成: {list_path}) # 生成快递标签 if args.labels: print(f\n生成快递标签...) 
labels label_generator.generate_labels(all_orders) print(f已生成 {len(labels)} 个标签) # 统计 total_amount sum(o.total_amount for o in all_orders) total_quantity sum(sum(p.get(quantity, 1) for p in o.products) for o in all_orders) print(f\n 统计信息 ) print(f订单数量: {len(all_orders)} 单) print(f商品数量: {total_quantity} 件) print(f金额合计: ¥{total_amount:,.2f}) print(\n处理完成) if __name__ __main__: main()

5.2 运行命令

# 处理淘宝订单
python main.py -i orders.xlsx -p taobao
# 处理京东订单并生成标签
python main.py -i ./orders -p jd --labels
# 指定输出目录
python main.py -i orders.csv -o ./output

六、总结

本文详细介绍了使用龙虾 Claw 实现电商订单信息自动提取与发货清单生成的完整方案。通过多平台支持、智能地址解析和自动化清单生成，大幅提升电商运营效率。

核心优势：

- 多平台支持：淘宝、京东、拼多多等主流平台
- 智能提取：自动识别订单关键字段
- 地址解析：自动拆分省、市、区及详细地址
- 清单生成：一键生成 Excel/PDF 发货清单
- 标签打印：自动生成快递面单标签