财务/行政狂喜!用 Python 提取 PDF 发票多行明细,十几行材料瞬间「分列」进 Excel!
已关注
关注
重播 分享 赞
核心痛点:为何发票明细这么难提?
破局思路:倒着来!(逆向拆解法)
代码升级三步走,直接抄作业
第一步:重写提取逻辑,精准拆分每一列(逆向拆解)
def extract_projects_full(text: str) -> list[dict]:results = []lines = [line.strip() for line in text.splitlines() if line.strip()]i = 0while i < len(lines):line = lines[i]if line.startswith('*'):main_line = lineextra_text = ""i += 1# 1. 收集跨行文字while i < len(lines) and not lines[i].startswith('*'):if re.search(r'合\s*计|合計|总计|總計', lines[i]):before_total = re.split(r'合\s*计|合計|总计|總計', lines[i])[0].strip()extra_text += before_totalbreakextra_text += lines[i]i += 1# 2. 清洗主行合计尾巴main_clean = re.split(r'合\s*计|合計|总计|總計', main_line)[0].strip()# 3. 接驳跨行文字if extra_text.strip():first_space_idx = main_clean.find(" ")target_idx = first_space_idxif first_space_idx != -1 and first_space_idx < 15:passelse:second_space_idx = main_clean.find(" ", first_space_idx + 1)if second_space_idx != -1:target_idx = second_space_idxif target_idx != -1:combined_line = (main_clean[:target_idx] +extra_text.strip() +" " +main_clean[target_idx + 1:])else:combined_line = main_clean + " " + extra_text.strip()else:combined_line = main_clean# 4. 统一空格清洗combined_line = re.sub(r'\s+', ' ', combined_line).strip()# ================= 新增:5. 逆向拆解明细字段 =================parts = combined_line.split()item_dict = {"项目名称": "未识别", "规格型号": "", "单位": "","数量": "", "单价": "", "明细金额": "", "明细税率": "", "明细税额": ""}# 发票规范:末尾通常依次是 金额、税率、税额if len(parts) >= 3:item_dict["明细税额"] = parts.pop()item_dict["明细税率"] = parts.pop()item_dict["明细金额"] = parts.pop()# 往前找“单价”和“数量”(特征:全数字或带小数点)if parts and re.match(r'^-?\d+(\.\d+)?$', parts[-1]):item_dict["单价"] = parts.pop()if parts and re.match(r'^-?\d+(\.\d+)?$', parts[-1]):item_dict["数量"] = parts.pop()# 剩下的部分为:项目名称、规格型号、单位if parts:# 最后一个词如果不包含数字,通常是单位(如:套、台、kg、批)if len(parts) > 1 and not re.search(r'\d', parts[-1]):item_dict["单位"] = parts.pop()item_dict["项目名称"] = parts[0]# 中间夹着的剩下的就是规格型号if len(parts) > 1:item_dict["规格型号"] = " ".join(parts[1:])results.append(item_dict)continuei += 1return results
第二步:数据展平(一对多映射)
def process_all_invoices(self):pdf_files = self.collect_pdfs()total = len(pdf_files)for idx, pdf_path in enumerate(pdf_files, 1):self.log(f"解析文件: {pdf_path}")text, tables = self.extract_text_and_tables(pdf_path)if text:info = self.parse_invoice_info(text, tables)# 提取发票公用基础信息base_info = {"文件名": os.path.basename(pdf_path),"发票号码": info["发票号码"],"开票日期": info["开票日期"],"购买方名称": info["购买方名称"],"购买方税号": info["购买方税号"],"销售方名称": info["销售方名称"],"销售方税号": info["销售方税号"],"总金额": info["金额"], # 改名以免与明细金额冲突"总税额": info["税额"], # 改名以免与明细税额冲突"价税合计": info["价税合计"]}items = info.get('_full_projects', [])# 如果没识别到任何明细,保留一行基础信息兜底if not items:empty_row = base_info.copy()empty_row.update({"项目名称": info.get("项目名称", "未识别"),"规格型号": "", "单位": "", "数量": "", "单价": "","明细金额": "", "明细税率": "", "明细税额": ""})self.invoice_list.append(empty_row)else:# 将每一个项目明细作为单独的一行写入for item in items:row = base_info.copy()row.update(item)self.invoice_list.append(row)if self.progress_callback:self.progress_callback(idx, total)
第三步:重组 Excel 列顺序,强迫症福音
# 更新列顺序:加入拆分出来的规格、单位、数量、单价ordered_cols = ["文件名", "发票号码", "开票日期","购买方名称", "购买方税号","销售方名称", "销售方税号","项目名称", "规格型号", "单位", "数量", "单价", "明细金额", "明细税率", "明细税额","总金额", "总税额", "价税合计"]# 按指定顺序排列 DataFrame,然后写入 Exceldf = df[ordered_cols]
夜雨聆风
