从非结构化文本文档中提取结构化信息的电商评论案例
从产品评论中提取结构化信息:某电商平台需要自动分析用户对产品的评价。商家想要知道:用户在评论中提到了哪些产品特征(如音质、价格、物流),以及对每个特征的情感倾向是正面、负面还是中性。
这是一个典型的 NLP 任务,传统方法需要训练专门的模型,但 LangExtract 可以通过少样本学习快速实现。
import langextract as lx

# Demo script: extract (feature, sentiment) pairs from a single product review
# with LangExtract, print them grouped by feature, and write an HTML
# visualization of the annotated text.

# A realistic user review (simulated data).
review_text = """这个耳机的音质真的很棒,低音效果很震撼。但是价格有点贵,性价比不太高。另外,发货速度很快,包装也很精美,物流服务值得点赞。"""

# Task description sent to the model. Written as adjacent literals so the
# value stays exactly one line of text.
prompt_description = (
    "从产品评论中提取特征及其情感倾向。"
    "- feature: 产品特征(如音质、价格、物流等)"
    "- sentiment: 对该特征的情感(positive/negative/neutral)"
    "- 每个特征和情感必须配对出现,并关联到同一个 feature_group"
)

# Few-shot example (the key part: it teaches the model the task). Each feature
# is paired with its sentiment through a shared "feature_group" attribute.
examples = [
    lx.data.ExampleData(
        text="这款手机的拍照效果很好,但电池续航太差了,屏幕素质不错。",
        extractions=[
            # Camera - positive
            lx.data.Extraction(
                extraction_class="feature",
                extraction_text="拍照效果",
                attributes={"feature_group": "camera"},
            ),
            lx.data.Extraction(
                extraction_class="sentiment",
                extraction_text="很好",
                attributes={"feature_group": "camera", "sentiment_type": "positive"},
            ),
            # Battery - negative
            lx.data.Extraction(
                extraction_class="feature",
                extraction_text="电池续航",
                attributes={"feature_group": "battery"},
            ),
            lx.data.Extraction(
                extraction_class="sentiment",
                extraction_text="太差",
                attributes={"feature_group": "battery", "sentiment_type": "negative"},
            ),
            # Screen - positive
            lx.data.Extraction(
                extraction_class="feature",
                extraction_text="屏幕素质",
                attributes={"feature_group": "screen"},
            ),
            lx.data.Extraction(
                extraction_class="sentiment",
                extraction_text="不错",
                attributes={"feature_group": "screen", "sentiment_type": "positive"},
            ),
        ],
    )
]

# Run the extraction.
result = lx.extract(
    text_or_documents=review_text,
    prompt_description=prompt_description,
    examples=examples,
    model_id="gemini-2.5-flash",  # a fast model
    api_key="your-api-key",  # placeholder: replace with a real key
)

# Show the results.
print(f"用户评论:\n{review_text}\n")
print("=" * 50)
print("提取分析结果:\n")

# Group extractions by their "feature_group" attribute so that each feature
# ends up next to its sentiment.
feature_groups = {}
for extraction in result.extractions or []:
    # The model may return an extraction without attributes; treat as empty.
    attributes = extraction.attributes or {}
    group = attributes.get("feature_group")
    if not group:
        continue
    slot = feature_groups.setdefault(group, {"feature": None, "sentiment": None})
    # Only the "feature" and "sentiment" classes are recorded; the class name
    # doubles as the slot key.
    if extraction.extraction_class in slot:
        slot[extraction.extraction_class] = extraction

for data in feature_groups.values():
    feature = data["feature"]
    sentiment = data["sentiment"]
    # The model can return an unpaired feature or sentiment; skip such groups
    # instead of failing on a None attribute access.
    if feature is None or sentiment is None:
        continue

    # Show where the feature was found in the source text, when available.
    pos_info = ""
    if feature.char_interval:
        start = feature.char_interval.start_pos
        end = feature.char_interval.end_pos
        pos_info = f" (文本位置: {start}-{end})"

    sentiment_type = (sentiment.attributes or {}).get("sentiment_type", "unknown")
    emoji = {"positive": "👍", "negative": "👎", "neutral": "😐"}.get(sentiment_type, "")

    print(f"特征: {feature.extraction_text}{pos_info}")
    print(f"情感: {sentiment.extraction_text} ({sentiment_type}) {emoji}")
    print("-" * 40)

# Generate the HTML visualization. Depending on the environment the return
# value is either a plain string or an object exposing the markup as `.data`.
html_viz = lx.visualize(result)
# Explicit UTF-8: the page contains Chinese text, which a non-UTF-8 default
# locale encoding may fail to write.
with open("product_review_analysis.html", "w", encoding="utf-8") as f:
    f.write(html_viz.data if hasattr(html_viz, "data") else html_viz)

print("\n✅ 交互式可视化已生成:product_review_analysis.html")
预期输出:

用户评论:
这个耳机的音质真的很棒,低音效果很震撼。但是价格有点贵,性价比不太高。另外,发货速度很快,包装也很精美,物流服务值得点赞。

==================================================
提取分析结果:

特征: 音质 (文本位置: 5-7)
情感: 很棒 (positive) 👍
----------------------------------------
特征: 价格 (文本位置: 22-24)
情感: 贵 (negative) 👎
----------------------------------------
特征: 发货速度 (文本位置: 38-42)
情感: 很快 (positive) 👍
----------------------------------------
批量分析数万条产品评论:并行执行方案(适用于实时监控、紧急分析、5000 条以内的评论)
import langextract as lx
import time
import logging
from typing import Any, Dict, List
from dataclasses import dataclass

# Batch demo: run LangExtract over many product reviews with concurrent API
# calls and aggregate how often each feature is mentioned per sentiment.

# Configure logging.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)


@dataclass
class Review:
    """A single product review."""

    review_id: str
    text: str
    product_id: str


# The three canned opinions the simulated loader cycles through.
_SAMPLE_OPINIONS = ("质量很好", "价格太高", "物流很快")


def load_reviews_from_database(limit: int = 5000) -> List[Review]:
    """Return ``limit`` simulated reviews.

    Stand-in for a real database query: review ``i`` gets the id
    ``REV{i:05d}`` and one of three fixed opinions chosen by ``i % 3``.
    """
    return [
        Review(
            review_id=f"REV{i:05d}",
            text=f"这个产品{_SAMPLE_OPINIONS[i % 3]}。",
            product_id="PROD-001",
        )
        for i in range(limit)
    ]


# Task description sent to the model. Written as adjacent literals so the
# value stays exactly one line of text.
prompt_description = (
    "从产品评论中提取特征及其情感倾向。"
    "- feature: 产品特征(如音质、价格、物流、质量等)"
    "- sentiment: 对该特征的情感(positive/negative/neutral)"
    "- 每个特征和情感必须配对出现,并关联到同一个 feature_group"
)

# Few-shot example: each feature is paired with its sentiment through a shared
# "feature_group" attribute.
examples = [
    lx.data.ExampleData(
        text="这款手机的拍照效果很好,但电池续航太差了。",
        extractions=[
            lx.data.Extraction(
                extraction_class="feature",
                extraction_text="拍照效果",
                attributes={"feature_group": "camera"},
            ),
            lx.data.Extraction(
                extraction_class="sentiment",
                extraction_text="很好",
                attributes={"feature_group": "camera", "sentiment_type": "positive"},
            ),
            lx.data.Extraction(
                extraction_class="feature",
                extraction_text="电池续航",
                attributes={"feature_group": "battery"},
            ),
            lx.data.Extraction(
                extraction_class="sentiment",
                extraction_text="太差",
                attributes={"feature_group": "battery", "sentiment_type": "negative"},
            ),
        ],
    )
]


def _tally_extractions(extractions, feature_stats: Dict[str, Dict[str, int]]) -> None:
    """Add one document's (feature, sentiment) pairs to ``feature_stats`` in place."""
    feature_groups: Dict[str, Dict[str, Any]] = {}
    for extraction in extractions or []:
        # The model may return an extraction without attributes; treat as empty.
        attributes = extraction.attributes or {}
        group = attributes.get("feature_group")
        if not group:
            continue
        slot = feature_groups.setdefault(
            group, {"feature": None, "sentiment": None, "sentiment_type": None}
        )
        if extraction.extraction_class == "feature":
            slot["feature"] = extraction.extraction_text
        elif extraction.extraction_class == "sentiment":
            slot["sentiment"] = extraction.extraction_text
            slot["sentiment_type"] = attributes.get("sentiment_type", "neutral")

    for slot in feature_groups.values():
        feature_name = slot["feature"]
        sentiment_type = slot["sentiment_type"]
        # Count only complete pairs with a known sentiment: an unpaired
        # feature has no sentiment_type, and an unpaired sentiment would
        # otherwise be counted under the key None.
        if feature_name is None or sentiment_type not in feature_stats:
            continue
        bucket = feature_stats[sentiment_type]
        bucket[feature_name] = bucket.get(feature_name, 0) + 1


def analyze_reviews_parallel(reviews: List[Review], max_workers: int = 10) -> Dict[str, Any]:
    """Analyze a batch of reviews using concurrent API calls.

    Args:
        reviews: Reviews to analyze.
        max_workers: Number of concurrent workers passed to ``lx.extract``
            (10-20 suggested).

    Returns:
        A dict with ``total_reviews``, ``elapsed_time`` (seconds),
        ``feature_stats`` (sentiment -> feature -> count) and ``results``
        (the list of per-review extraction results).
    """
    start_time = time.time()
    logging.info("🚀 开始分析 %d 条评论(并行模式)...", len(reviews))

    # For multi-document input lx.extract is documented to take Document
    # objects rather than raw strings; the review id is carried along as the
    # document id.
    documents = [
        lx.data.Document(text=review.text, document_id=review.review_id)
        for review in reviews
    ]

    # Materialize the results: they may be a one-shot iterable, and the caller
    # receives them under "results" after they were consumed for the tally.
    results = (
        list(
            lx.extract(
                text_or_documents=documents,
                prompt_description=prompt_description,
                examples=examples,
                model_id="gemini-2.5-flash",  # a fast model
                max_workers=max_workers,  # number of concurrent API calls
                api_key="your-api-key",  # placeholder: replace with a real key
            )
        )
        if documents
        else []  # nothing to analyze: skip the API call
    )

    # Aggregate mention counts per sentiment.
    feature_stats: Dict[str, Dict[str, int]] = {
        "positive": {},
        "negative": {},
        "neutral": {},
    }
    for result in results:
        _tally_extractions(result.extractions, feature_stats)

    elapsed = time.time() - start_time
    logging.info("✅ 分析完成!耗时: %.2f 秒", elapsed)

    return {
        "total_reviews": len(reviews),
        "elapsed_time": elapsed,
        "feature_stats": feature_stats,
        "results": results,
    }


# Run the analysis.
if __name__ == "__main__":
    # Load the reviews.
    reviews = load_reviews_from_database(limit=5000)

    # Analyze them in parallel.
    analysis = analyze_reviews_parallel(reviews, max_workers=10)

    total = analysis['total_reviews']
    elapsed_time = analysis['elapsed_time']
    # Guard against a zero elapsed time (e.g. an empty batch on a coarse clock).
    rate = total / elapsed_time if elapsed_time > 0 else 0.0

    # Print the summary report.
    print("\n" + "=" * 60)
    print("📊 评论情感分析报告")
    print("=" * 60)
    print(f"处理评论数: {total:,}")
    print(f"处理耗时: {elapsed_time:.2f} 秒")
    print(f"平均速度: {rate:.2f} 条/秒")

    print("\n正面评价 TOP 5:")
    for feature, count in sorted(
        analysis['feature_stats']['positive'].items(),
        key=lambda x: x[1],
        reverse=True,
    )[:5]:
        print(f" • {feature}: {count} 次")

    print("\n负面评价 TOP 5:")
    for feature, count in sorted(
        analysis['feature_stats']['negative'].items(),
        key=lambda x: x[1],
        reverse=True,
    )[:5]:
        print(f" • {feature}: {count} 次")
数据规模      处理方式      耗时        成本
────────────────────────────────────────────────
100 条       并行执行      ~5 秒       $0.01
1000 条      并行执行      ~30 秒      $0.10
5000 条      并行执行      ~2 分钟     $0.50
夜雨聆风
