Is the `.backward()` context basically similar to the `tracer.invoke()` forward pass? I am trying to perform attribution patching and cache the gradients, but it keeps giving me:
```
ValueError: Execution complete but 6312352464.grad was not provided. Did you call an Envoy out of order? Investigate why this module was not called.
```

Here is the code:
```python
with model.trace() as tracer:
    # Clean forward pass: cache the attention output input at each layer.
    with tracer.invoke(prompts) as invoker_clean:
        for layer in model.transformer.h:
            attn_out = layer.attn.c_proj.input
            clean_out.append(attn_out.save())

    # Corrupted forward pass: cache activations and gradients.
    with tracer.invoke(prompts) as invoker_corrupt:
        for layer in model.transformer.h:
            attn_out = layer.attn.c_proj.input
            corrupted_out.append(attn_out.save())  # make sure gradient can flow

        logits = model.lm_head.output.save()
        value = ioi_metric(logits)
        # value.sum().backward(retain_graph=True)
        with value.sum().backward():
            value.grad.save()

        # Try to recover the gradient at each layer with a separate
        # backward context per layer -- this is where it fails.
        for layer in reversed(model.transformer.h):
            attn_back = layer.attn.c_proj.input
            with attn_back.sum().backward():
                corrupted_grads.append(attn_back.grad[:].save())
```
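For reference, this is the single-backward-pass variant I understand from the gradient examples in the nnsight docs: register `.grad.save()` on each activation proxy *before* the one `backward()` call, instead of opening a per-layer backward context. This is only a sketch, not code I know to be correct here; it assumes the same GPT-2-style module paths as above, and `corrupted_prompts` is my stand-in name for a corrupted batch (my snippet above reuses `prompts` for both invokes):

```python
clean_out, corrupted_out, corrupted_grads = [], [], []

with model.trace() as tracer:
    # Clean run: only the forward activations are needed.
    with tracer.invoke(prompts):
        for layer in model.transformer.h:
            clean_out.append(layer.attn.c_proj.input.save())

    # Corrupted run: request both the activation and its gradient
    # before the backward pass, then call backward() once.
    with tracer.invoke(corrupted_prompts):  # corrupted_prompts: assumed corrupted batch
        for layer in model.transformer.h:
            attn_out = layer.attn.c_proj.input
            corrupted_out.append(attn_out.save())
            corrupted_grads.append(attn_out.grad.save())

        logits = model.lm_head.output
        value = ioi_metric(logits)
        value.sum().backward()  # one backward pass populates every .grad above
```

If this pattern is the intended one and the per-layer `with attn_back.sum().backward():` contexts are not, that would also answer my original question about whether a backward context behaves like a second `tracer.invoke()` pass.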