Master test-time compute and chain-of-thought reasoning techniques for LLMs. Learn how to effectively use "thinking time" to improve model performance through parallel sampling, sequential revision, and process reward models.
Test-time compute (TTC) and Chain-of-Thought (CoT) have led to significant improvements in LLM performance. The core idea is enabling models to "think" longer before producing final answers, similar to human System 2 thinking.
def cot_prompt(problem):
"""Generate chain-of-thought prompt for math problems"""
return f"""Solve this step by step:
Problem: {problem}
Let's think step by step:
"""
# Usage
problem = "What is 12345 times 56789?"
prompt = cot_prompt(problem)
import random
def best_of_n_sampling(model, prompt, n=5, scorer=None):
"""Generate N samples and return the highest scoring one"""
samples = []
for _ in range(n):
sample = model.generate(prompt, temperature=random.uniform(0.7, 1.2))
score = scorer(sample) if scorer else len(sample) # Simple length-based scoring
samples.append((sample, score))
return max(samples, key=lambda x: x[1])[0]
def beam_search_with_prm(model, prm_model, prompt, beam_width=5, max_steps=10):
"""Beam search guided by process reward model"""
beams = [(prompt, 0.0)] # (sequence, cumulative_reward)
for step in range(max_steps):
candidates = []
for seq, reward in beams:
# Generate next tokens
next_tokens = model.generate_next_tokens(seq, top_k=beam_width)
for token in next_tokens:
new_seq = seq + token
# Get process reward for this step
step_reward = prm_model.evaluate(new_seq)
candidates.append((new_seq, reward + step_reward))
# Keep top beam_width candidates
beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_width]
return beams[0][0] # Return highest reward sequence
共 1 个版本