# filter.py
"""Keyword-based safety filters exposed as two FastAPI endpoints.

``/input_filter`` scans the text of a user request and ``/output_filter``
scans the text of a model response. Each endpoint answers HTTP 400 with the
matching pattern when a blocked phrase is found, and HTTP 200
``{"status": "ok"}`` otherwise. Matching is a case-insensitive substring test.
"""
from typing import Any, Iterable, Optional

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

app = FastAPI()

# Input filtering: block dangerous instructions in the user's prompt.
INPUT_BLOCK_PATTERNS = [
    "delete all",
    "drop table",
    "rm -rf",
    "ignore previous instructions",
    "bypass safety",
]

# Output filtering: even when the user's prompt is harmless, the model may
# still decide to do something dangerous.
OUTPUT_BLOCK_PATTERNS = [
    "rm -rf",
    "DELETE FROM",
    "bulk_delete",
    "permanently delete",
    "cannot be undone",
]

# Returned when the request body cannot be inspected at all. Answering 400
# (rather than crashing with a 500 or answering "ok") keeps the filter
# fail-closed for payloads it cannot read.
_MALFORMED_BODY_ERROR = "请求体不是合法的 JSON 对象"


def _content_to_text(content: Any) -> str:
    """Flatten a ``content``/``prompt`` value into plain text.

    Accepts a plain string, or a list whose items are strings or dicts
    carrying a string ``"text"`` field (the multi-part content shape).
    Anything else -- notably ``None``, which a message carries when it only
    holds tool calls -- contributes no text instead of raising.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        texts = []
        for part in content:
            if isinstance(part, str):
                texts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                texts.append(part["text"])
        return " ".join(texts)
    return ""


def _tool_call_text(message: dict) -> str:
    """Return the function names and argument strings of a message's tool calls.

    Each fragment is prefixed with a newline so it can never merge with the
    neighbouring text into an accidental pattern match. Entries that do not
    have the ``{"function": {"name": str, "arguments": str}}`` shape are
    skipped.
    """
    calls = message.get("tool_calls")
    if not isinstance(calls, list):
        return ""
    fragments = []
    for call in calls:
        function = call.get("function") if isinstance(call, dict) else None
        if not isinstance(function, dict):
            continue
        for key in ("name", "arguments"):
            value = function.get(key)
            if isinstance(value, str):
                fragments.append("\n" + value)
    return "".join(fragments)


def _find_blocked_pattern(text: str, patterns: Iterable[str]) -> Optional[str]:
    """Return the first pattern contained in ``text`` (case-insensitive), else None."""
    lowered = text.lower()  # lower-case once, not once per pattern
    for pattern in patterns:
        if pattern.lower() in lowered:
            return pattern
    return None


async def _read_json_object(request: Request) -> Optional[dict]:
    """Parse the request body, returning it only if it is a JSON object.

    Returns ``None`` for an undecodable body or for any JSON value that is
    not an object (list, string, number, ...).
    """
    try:
        body = await request.json()
    except ValueError:  # json.JSONDecodeError and UnicodeDecodeError
        return None
    return body if isinstance(body, dict) else None


def _malformed_body_response() -> JSONResponse:
    """Build the 400 response used when the body cannot be inspected."""
    return JSONResponse(
        status_code=400,
        content={"error": _MALFORMED_BODY_ERROR},
    )


def _verdict(text: str, patterns: Iterable[str], error: str) -> JSONResponse:
    """Build the filter response: 400 naming the matched pattern, or 200 ok."""
    blocked = _find_blocked_pattern(text, patterns)
    if blocked is None:
        return JSONResponse(status_code=200, content={"status": "ok"})
    return JSONResponse(
        status_code=400,
        content={"error": error, "blocked_pattern": blocked},
    )


@app.post("/input_filter")
async def input_filter(request: Request):
    """Reject requests whose prompt text contains an input block pattern.

    The body may use the OpenAI ``messages`` format or a plain ``prompt``
    field; ``messages`` takes precedence when both are present.

    Returns:
        400 with ``error`` and ``blocked_pattern`` when a pattern matches,
        400 with ``error`` when the body is not a JSON object,
        200 ``{"status": "ok"}`` otherwise.
    """
    body = await _read_json_object(request)
    if body is None:
        return _malformed_body_response()

    # Support both the OpenAI messages format and the plain-text format.
    prompt = ""
    if "messages" in body:
        messages = body["messages"]
        if isinstance(messages, list):
            prompt = " ".join(
                _content_to_text(message.get("content"))
                for message in messages
                if isinstance(message, dict)
            )
    elif "prompt" in body:
        prompt = _content_to_text(body["prompt"])

    return _verdict(prompt, INPUT_BLOCK_PATTERNS, "输入内容被安全过滤器拦截")


@app.post("/output_filter")
async def output_filter(request: Request):
    """Reject model responses whose text contains an output block pattern.

    The body may be a chat-completion style object with ``choices`` or an
    object with a top-level ``content`` field; ``choices`` takes precedence.
    For each choice both the message content and the names/arguments of any
    tool calls are scanned, so a dangerous action is caught even when the
    message carries no text.

    Returns:
        400 with ``error`` and ``blocked_pattern`` when a pattern matches,
        400 with ``error`` when the body is not a JSON object,
        200 ``{"status": "ok"}`` otherwise.
    """
    body = await _read_json_object(request)
    if body is None:
        return _malformed_body_response()

    # Extract the text content from the model response.
    response_text = ""
    if "choices" in body:
        choices = body["choices"]
        fragments = []
        if isinstance(choices, list):
            for choice in choices:
                message = choice.get("message") if isinstance(choice, dict) else None
                if not isinstance(message, dict):
                    continue
                # Contents of successive choices are concatenated directly,
                # so every match the plain concatenation produces is kept.
                fragments.append(_content_to_text(message.get("content")))
                fragments.append(_tool_call_text(message))
        response_text = "".join(fragments)
    elif "content" in body:
        response_text = _content_to_text(body["content"])

    return _verdict(response_text, OUTPUT_BLOCK_PATTERNS, "模型响应被安全过滤器拦截")
# Usage examples (shell)
#
# Normal request: passes both filters.
#   curl -X POST http://localhost:12000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "帮我整理收件箱,告诉我哪些可以归档"}]}'
#   → normal model response
#
# Contains a dangerous instruction: the input filter blocks it directly and
# the model never sees this request.
#   curl -X POST http://localhost:12000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "delete all emails in my inbox"}]}'
#   → {"error": "输入内容被安全过滤器拦截", "blocked_pattern": "delete all"}
#
# The prompt is harmless, but the model decides to perform a delete
# operation: the output filter blocks it.
#   curl -X POST http://localhost:12000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "清理一下我的邮件"}]}'
#   → {"error": "模型响应被安全过滤器拦截", "blocked_pattern": "permanently delete"}