class Layer(torch.nn.Module):
    """Repeated building block: Linear -> ReLU -> sigmoid -> Linear -> ReLU.

    Both linear maps go from 10 features to 10 features, so the size of the
    last dimension of the input is preserved.
    """

    def __init__(self):
        super().__init__()
        # Submodules are registered in the order in which forward() applies
        # them (the sigmoid is functional and has no submodule).
        self.linear1 = torch.nn.Linear(10, 10)
        self.relu1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(10, 10)
        self.relu2 = torch.nn.ReLU()

    def forward(self, x):
        """Run ``x`` through both stages and return the final activation."""
        # First stage: affine map, ReLU, then a sigmoid squashing.
        hidden = torch.sigmoid(self.relu1(self.linear1(x)))
        # Second stage: affine map followed by ReLU.
        return self.relu2(self.linear2(hidden))


class Model(torch.nn.Module):
    """A linear stem followed by a stack of 64 ``Layer`` blocks.

    Args:
        apply_regional_compilation: When truthy, every ``Layer`` is wrapped
            in ``torch.compile`` individually; ``self.linear`` is left
            uncompiled. When falsy, all submodules are plain eager modules.
    """

    def __init__(self, apply_regional_compilation):
        super().__init__()
        self.linear = torch.nn.Linear(10, 10)
        blocks = [Layer() for _ in range(64)]
        if apply_regional_compilation:
            # Regional compilation: compile only the repeated blocks.
            blocks = [torch.compile(block) for block in blocks]
        self.layers = torch.nn.ModuleList(blocks)

    def forward(self, x):
        """Apply the linear stem, then each block in order."""
        # With regional compilation, this stem runs outside any
        # `torch.compile` region.
        out = self.linear(x)
        for block in self.layers:
            out = block(out)
        return out

接下来，让我们回顾一下完整模型编译与区域编译之间的区别。

在完整模型编译中，整个模型被作为一个整体进行编译。这是大多数用户使用 torch.compile 的常用方法。在此示例中，我们将 torch.compile 应用于 Model 对象。这实际上会将 64 个层内联，生成一个大的图进行编译。您可以通过运行此教程并设置 TORCH_LOGS=graph_code 来查看完整的图。

# Full-model compilation: build the model with plain (uncompiled) layers,
# move it to the CUDA device, and wrap the whole module in one torch.compile.
model = Model(apply_regional_compilation=False).cuda()
full_compiled_model = torch.compile(model)

另一方面，区域编译只编译模型的某个区域。通过有策略地选择模型中的重复区域进行编译，我们可以编译一个小得多的图，然后将编译结果重用于所有相同的区域。在此示例中，torch.compile 仅应用于 layers，而不是整个模型。

# Regional compilation: Model.__init__ wraps each Layer in torch.compile;
# the outer Model object itself is not passed to torch.compile.
regional_compiled_model = Model(apply_regional_compilation=True).cuda()

将编译应用于重复区域而不是完整模型，可以大大节省编译时间。在这里，我们将只编译一个层实例，然后在 Model 对象中重用它 64 次。

请注意，使用区域编译时，模型的某些部分可能不会被编译。例如，Model 中的 self.linear 超出了区域编译的范围。

另外，请注意性能加速与编译时间之间存在权衡。完整模型编译涉及更大的图，理论上提供了更多的优化空间。然而在实践中，视具体模型而定，我们观察到在许多情况下，完整模型编译与区域编译带来的加速差异很小。

接下来，让我们测量完整模型与区域编译的编译时间。

torch.compile 是一个 JIT 编译器，这意味着它在第一次调用时进行编译。在下面的代码中，我们测量了第一次调用所花费的总时间。虽然这种方法不精确，但它提供了良好的估计，因为大部分时间都花在编译上。

def measure_latency(fn, input):
    """Return the wall-clock seconds taken by one call of ``fn(input)``.

    Compiler state is reset and a fresh Inductor cache is used, so that
    nothing compiled in an earlier measurement is reused. The timed region
    covers the call itself plus a CUDA synchronize.

    Args:
        fn: Callable invoked once with ``input``.
        input: The single argument passed to ``fn``.

    Returns:
        Elapsed time in seconds, as measured by ``perf_counter``.
    """
    # Drop existing compiler caches so different runs do not share results.
    torch.compiler.reset()
    with torch._inductor.utils.fresh_inductor_cache():
        t0 = perf_counter()
        fn(input)
        # Wait for queued GPU work before stopping the clock.
        torch.cuda.synchronize()
        elapsed = perf_counter() - t0
    return elapsed


# Random 10x10 tensor created directly on the CUDA device; used as the input
# for both measurements.
# NOTE(review): the name `input` shadows the Python builtin of the same name.
input = torch.randn(10, 10, device="cuda")
# Time a single call of the fully compiled model.
full_model_compilation_latency = measure_latency(full_compiled_model, input)
print(f"Full model compilation time = {full_model_compilation_latency:.2f} seconds")

# Time a single call of the regionally compiled model on the same input.
regional_compilation_latency = measure_latency(regional_compiled_model, input)
print(f"Regional compilation time = {regional_compilation_latency:.2f} seconds")

# The script expects the regional measurement to be strictly smaller than the
# full-model one and fails here otherwise.
assert regional_compilation_latency < full_model_compilation_latency

/usr/local/lib/python3.10/dist-packages/torch/backends/cuda/__init__.py:131: UserWarning:

Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.ac.cn/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)

/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py:312: UserWarning:

TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.

Full model compilation time = 11.40 seconds
Regional compilation time = 0.87 seconds

结论#

本教程展示了在模型包含重复区域时，如何控制其冷启动编译时间。这种方法需要用户修改代码，将 torch.compile 应用于重复区域，而不是采用更常见的完整模型编译。我们仍在持续努力减少冷启动编译时间。

脚本总运行时间： (0 分钟 13.798 秒)

通过区域编译减少 torch.compile 的冷启动编译时间#

先决条件#

设置#

步骤#

结论#

文档

教程

资源