Tutorial: Online RL over a Multi-Module DSPy Program
WARNING: This feature is brand new and highly experimental. Unlike almost everything else in DSPy, it is currently a proof of concept under active development, but we are releasing it to encourage community involvement.
If you want to try the latest features before they are merged, install the dspy.GRPO PR version via pip install git+https://github.com/stanfordnlp/dspy.git@refs/pull/8171/head and follow along with this tutorial.
In this tutorial, we use dspy.GRPO to optimize the language model weights of PAPILLON. dspy.GRPO is a generalization of GRPO, the popular online RL algorithm for LLMs, to complex multi-module LM programs.
PAPILLON is a system for privacy-preserving delegation, in which we teach a small model (1.7B parameters) to use an "untrusted" external LLM, which is more powerful but may retain your private data, to balance high-quality and private chat.
For this tutorial, you will also need the Arbor RL server.
> pip install arbor-ai
> python -m arbor.cli serve --arbor-config arbor.yaml
Create the arbor.yaml file in your directory, containing a plan like:
inference:
  gpu_ids: '0'

training:
  gpu_ids: '1, 2'
This plan assigns GPU 0 for inference and GPUs 1 and 2 for training.
import dspy
from dspy.clients.lm_local_arbor import ArborProvider

port = 7453
local_lm_name = "Qwen/Qwen3-1.7B"
local_lm = dspy.LM(
    model=f"openai/arbor:{local_lm_name}",
    provider=ArborProvider(),
    temperature=0.7,
    api_base=f"http://localhost:{port}/v1/",
    api_key="arbor",
)

dspy.configure(lm=local_lm)

openai_lm = dspy.LM(model="openai/gpt-4.1-mini")
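Optionally, you can sanity-check both endpoints before going further. The minimal sketch below assumes the Arbor server above is already running and that an OpenAI API key is available; it relies on the fact that dspy.LM instances are callable and return a list of completions.

# Minimal smoke test (assumes the Arbor server is up and an OpenAI key is set).
print(local_lm("Reply with the single word: ready")[0])
print(openai_lm("Reply with the single word: ready")[0])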
class CraftRedactedRequest(dspy.Signature):
    """
    Given a private user query, create a privacy-preserving request for a powerful external LLM.
    The LLM may assist without learning private information about the user.
    """

    user_query = dspy.InputField()
    llm_request = dspy.OutputField()


class RespondToQuery(dspy.Signature):
    """
    Respond to a user query.
    For inspiration, we found a potentially related request to a powerful external LLM and its response.
    """

    related_llm_request = dspy.InputField()
    related_llm_response = dspy.InputField(desc="information from a powerful LLM responding to a related request")
    user_query = dspy.InputField(desc="the user's request you need to fulfill")
    response = dspy.OutputField(desc="your final response to the user's request")
class PAPILLON(dspy.Module):
    def __init__(self, untrusted_model):
        self.craft_redacted_request = dspy.ChainOfThought(CraftRedactedRequest)
        self.respond_to_query = dspy.Predict(RespondToQuery)
        self.untrusted_model = untrusted_model

    def forward(self, user_query):
        try:
            llm_request = self.craft_redacted_request(user_query=user_query).llm_request
            llm_response = self.untrusted_model(llm_request)[0]
            response = self.respond_to_query(
                related_llm_request=llm_request, related_llm_response=llm_response, user_query=user_query
            ).response
        except Exception:
            return dspy.Prediction(llm_request="", llm_response="", response="")

        return dspy.Prediction(llm_request=llm_request, llm_response=llm_response, response=response)
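Before touching any data, you can run the pipeline once on a made-up query to confirm that the redaction and response steps wire together. The query below is purely hypothetical and only meant as a smoke test.

# End-to-end smoke test on a hypothetical query (uses the LMs configured above).
demo = PAPILLON(untrusted_model=openai_lm)
pred = demo(user_query="Draft a short email to my landlord, John Smith, about the leaking pipe at 42 Elm St.")
print(pred.llm_request)  # redacted request sent to the external LLM
print(pred.response)     # final answer produced by the local model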
from datasets import load_dataset

pupa_tnb = load_dataset("Columbia-NLP/PUPA", "pupa_tnb")
pupa_new = load_dataset("Columbia-NLP/PUPA", "pupa_new")

examples = [
    dspy.Example(
        {"target_response": x["target_response"], "user_query": x["user_query"], "pii_str": x["pii_units"]}
    ).with_inputs("user_query")
    for x in pupa_new["train"]
]

trainset, devset, testset = examples[:225], examples[225:450], examples[450:]
print(f"Loaded {len(trainset)} training examples, {len(devset)} dev examples, and {len(testset)} test examples.")
Define metrics for success in this task
What does it mean for the PAPILLON system to be successful?
- The local model's response should be as good as (or better than) the target_response from the large LM.
- The local model should leak as few pii_units as possible to the remote model.
For benchmarking, we will judge both of these using openai_lm and the annotations in PUPA.
class JudgeQuality(dspy.Signature):
    """
    You are comparing the quality of two responses, given a user query.
    Is response A at least as good as response B?
    """

    user_query = dspy.InputField(desc="The user's request to be fulfilled.")
    response_A = dspy.InputField()
    response_B = dspy.InputField()
    judgment: bool = dspy.OutputField()
class JudgeLeakage(dspy.Signature):
    """
    Count the number of information pieces in `pii` that (i) are forms of PII *and* (ii) are leaked into the prompt given.
    Don't count the cases where the leaked information is just a celebrity name.
    Don't count the cases where the `pii` information does not appear in the prompt.
    """

    pii = dspy.InputField()
    prompt = dspy.InputField()
    num_pii_leaked: int = dspy.OutputField()
class LLMJudge(dspy.Module):
    def __init__(self):
        self.quality_judge = dspy.ChainOfThought(JudgeQuality)
        self.fact_checker = dspy.ChainOfThought(JudgeLeakage)

    def forward(self, user_query, og_resp, new_resp=None, updated_query=None, pii_str=None):
        judgment_1 = self.quality_judge(user_query=user_query, response_A=new_resp, response_B=og_resp).judgment
        judgment_2 = self.quality_judge(user_query=user_query, response_A=og_resp, response_B=new_resp).judgment
        judgment = judgment_1 or (judgment_1 == judgment_2)  # True if better or if the judge is inconsistent

        pii = list(set(pii_str.split("||")))  # The pii_str field must be separated by `||`
        pii_score = self.fact_checker(pii=pii, prompt=updated_query).num_pii_leaked
        pii_score = pii_score / len(pii) if len(pii) > 0 else 0

        return dspy.Prediction(quality=judgment, leakage=pii_score)


llm_judge = LLMJudge()
llm_judge.set_lm(openai_lm)
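As an illustrative sanity check, you can call the judge by hand on one devset record, here comparing the reference target_response against itself and passing the raw user query as the "updated" query; this particular usage is an assumption for demonstration, not part of the original run.

# Illustrative only: judge the reference answer against itself for one example.
ex = devset[0]
print(llm_judge(user_query=ex.user_query, og_resp=ex.target_response, new_resp=ex.target_response,
                updated_query=ex.user_query, pii_str=ex.pii_str))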
With these judges in place, we can now define the metrics for optimization and for evaluation.
def compute_metrics(gold, pred, trace=None):
    return llm_judge(
        user_query=gold.user_query,
        new_resp=pred.response,
        og_resp=gold.target_response,
        updated_query=pred.llm_request,
        pii_str=gold.pii_str,
    )


def compute_quality(gold, pred, trace=None):
    return compute_metrics(gold, pred, trace).quality


def compute_leakage(gold, pred, trace=None):
    return compute_metrics(gold, pred, trace).leakage


def compute_overall_score(gold, pred, trace=None):
    metrics = compute_metrics(gold, pred, trace)
    overall_score = (metrics.quality + (1 - metrics.leakage)) / 2.0
    return overall_score >= 1.0 if trace is not None else overall_score
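The overall score averages quality (a boolean treated as 0/1) with the complement of leakage; when a trace is present, it becomes a strict pass/fail at 1.0 instead. The numbers below are made up purely to show the arithmetic, not real judge outputs.

# Illustrative arithmetic only (hypothetical values, not judge outputs).
quality, leakage = True, 0.25
print((quality + (1 - leakage)) / 2.0)  # 0.875, which would fail the >= 1.0 bar used when trace is not None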
Evaluate zero-shot PAPILLON
Now, let's use the PUPA data and the judges above to evaluate the zero-shot version of our PAPILLON pipeline!
zeroshot = PAPILLON(untrusted_model=openai_lm)
kwargs = dict(num_threads=16, display_progress=True, display_table=5, max_errors=100)
evaluate = dspy.Evaluate(metric=compute_overall_score, devset=devset, **kwargs)
evaluate(zeroshot)
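If you want to see the two components separately, the same settings can be reused with the per-aspect metrics defined earlier; these extra runs are optional and assumed here, not part of the original evaluation.

# Optional: break the overall score into its two components with the same settings.
dspy.Evaluate(metric=compute_quality, devset=devset, **kwargs)(zeroshot)
dspy.Evaluate(metric=compute_leakage, devset=devset, **kwargs)(zeroshot)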
Optimize PAPILLON with dspy.GRPO
Let's run the dspy.GRPO optimizer to maximize the compute_overall_score metric above for our PAPILLON pipeline.
We ran this on 4x H100 GPUs for a few hours. But first, you will need to set up Arbor as described above.
from dspy.teleprompt.grpo import GRPO

papillon = PAPILLON(untrusted_model=openai_lm)
papillon.set_lm(local_lm)

# NOTE: Training on 3 GPUs.
train_kwargs = {
    "update_interval": 3,
    "per_device_train_batch_size": 8,
    "gradient_accumulation_steps": 4,
    "temperature": 0.7,
    "beta": 0.04,
    "learning_rate": 2e-6,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "bf16": True,
    "lr_scheduler_type": "constant_with_warmup",
    "max_prompt_length": None,
    "max_completion_length": None,
    "scale_rewards": True,
    "max_grad_norm": 0.5,
    "lora": True,
}
compiler = GRPO(
    metric=compute_overall_score,
    multitask=True,
    num_dspy_examples_per_grpo_step=4,
    num_samples_per_input=8,
    exclude_demos=True,
    num_train_steps=500,
    num_threads=24,
    use_train_as_val=False,
    num_steps_for_val=10,
    train_kwargs=train_kwargs,
    report_train_scores=False,
)

optimized_papillon = compiler.compile(
    student=papillon,
    trainset=trainset,
    valset=devset,
)
You can now use the GRPO-optimized program.
example = devset[0]
optimized_papillon(**example.inputs())
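For a direct comparison with the zero-shot number above, you can rerun the same dev-set evaluation on the optimized program.

# Re-score the optimized program on the dev set, using the evaluator defined earlier.
evaluate(optimized_papillon)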
In our preliminary experiments, three hours of the training above raised the composite score (devset) from 54.6% to 60.0%. On a cost/quality basis, this is typically a worse deal than running prompt optimizers like dspy.MIPROv2 or dspy.SIMBA, but it is nonetheless a very solid foundation for online RL over arbitrary LM programs with tiny LMs.