
Weight Streaming

Weight streaming in TensorRT is a powerful feature designed to overcome GPU memory limitations when working with large models. It enables running models larger than the available GPU memory by streaming weight data from host (CPU) memory to GPU memory during inference.

Streaming larger amounts of memory will likely result in lower performance. However, if weight streaming allows the user to run larger batch sizes, it can lead to higher throughput, and this increased throughput can sometimes outweigh the slowdown caused by streaming weights. The optimal amount of memory to stream varies depending on the specific model and hardware, so experimenting with different memory limits helps find the best balance between streaming overhead and batch size benefits.
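
The sketch below is not part of the original example; it assumes the trt_model, time_generate, input_tensors, and osl objects defined later in this file and simply measures latency for a few budget fractions, so the streaming overhead can be weighed against the larger batch sizes a smaller budget makes possible.

# Hypothetical helper (a sketch, not part of the original example) that sweeps
# several weight streaming budgets and records the resulting mean latencies.
# It reuses trt_model, time_generate, input_tensors, and osl defined below.
def sweep_weight_streaming_budgets(trt_model, fractions=(0.1, 0.25, 0.5, 1.0)):
    latencies = {}
    with torch_tensorrt.runtime.weight_streaming(trt_model) as ctx:
        total = ctx.total_device_budget
        for frac in fractions:
            # A budget equal to total disables streaming; smaller values keep
            # fewer weights resident on the GPU and stream the rest at runtime.
            ctx.device_budget = int(total * frac)
            latencies[frac] = time_generate(trt_model, input_tensors, osl, 1)
    return latencies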

This example uses a pre-trained Llama-2 model and demonstrates how to use the weight streaming feature with Torch-TensorRT.

  1. Compilation option - build the TRT engine with the weight streaming feature

  2. Runtime API - control the weight streaming budget via a context manager

Imports and Model Definition

import copy
import timeit

import numpy as np
import torch
import torch_tensorrt
from transformers import AutoModelForCausalLM


def export_llm(model, inputs, min_seq_len=1, max_seq_len=16):
    """
    Exports the LLM model into an ExportedProgram with dynamic shapes.
    In the case of guard failures due to some PyTorch kernel implementations, we also
    try to re-export the graph by expressing them as runtime assert nodes
    """
    with torch.no_grad():
        # max=1024 has constraint violation error. https://github.com/pytorch/pytorch/issues/125604
        seq_len = torch.export.Dim("seq_len", min=min_seq_len, max=max_seq_len)
        position_ids = torch.arange(inputs.shape[1]).unsqueeze(0).to(inputs.device)
        try:
            print("Trying to export the model using torch.export.export()..")
            # strict=False only enables aotautograd tracing and excludes dynamo.
            ep = torch.export.export(
                model,
                args=(inputs,),
                kwargs={"position_ids": position_ids},
                dynamic_shapes=({1: seq_len}, {1: seq_len}),
                strict=False,
            )
        except:
            print(
                "Trying torch.export._trace._export to trace the graph since torch.export.export() failed"
            )
            # This API is used to express the constraint violation guards as asserts in the graph.
            ep = torch.export._trace._export(
                model,
                args=(inputs,),
                kwargs={"position_ids": position_ids},
                dynamic_shapes=({1: seq_len}, {1: seq_len}),
                strict=False,
                allow_complex_guards_as_runtime_asserts=True,
            )

    return ep


def time_generate(model, inputs, output_seq_length, iterations=10):
    """
    Measure the time for generating a sentence over a certain number of iterations
    """
    # We only support a single input (B x seq_len) for LLMs now
    input_seq = inputs[0]
    with torch.no_grad():
        timings = []
        for _ in range(iterations):
            start_time = timeit.default_timer()
            inputs_copy = copy.copy(input_seq)
            # Greedy decoding of the model. This generates up to output_seq_length tokens.
            while inputs_copy.shape[1] <= output_seq_length:
                outputs = model(inputs_copy)
                logits = outputs.logits
                next_token_logits = logits[:, -1, :]
                next_tokens = torch.argmax(next_token_logits, dim=-1)
                inputs_copy = torch.cat([inputs_copy, next_tokens[:, None]], dim=-1)
            torch.cuda.synchronize()
            end_time = timeit.default_timer()
            timings.append(end_time - start_time)

    times = np.array(timings)
    time_mean_ms = np.mean(times) * 1000

    return time_mean_ms


# Load the LLaMA-2 model
DEVICE = torch.device("cuda:0")
llama_path = "meta-llama/Llama-2-7b-chat-hf"
with torch.no_grad():
    model = AutoModelForCausalLM.from_pretrained(
        llama_path, use_cache=False, attn_implementation="eager"
    ).eval()

# Set input and output sequence lengths
isl = 128
osl = 256

# Create random input tensors
input_tensors = [torch.randint(0, 5, (1, isl), dtype=torch.int64).cuda()]
# Convert the model to half precision (FP16)
model = model.half()
# Exports the LLM model into an ExportedProgram with dynamic shapes.
llama2_ep = export_llm(model, input_tensors[0], max_seq_len=osl)

Compiler Options

To build the engine with the weight streaming feature, the enable_weight_streaming=True option and use_explicit_typing=True are required. The use_explicit_typing=True option creates a strongly typed network, and only float32 precision is allowed in the enabled_precisions option.

# Create a TensorRT-compiled model
trt_model = torch_tensorrt.dynamo.compile(
    llama2_ep,
    inputs=input_tensors,
    enabled_precisions={torch.float32},
    truncate_double=True,
    device=DEVICE,
    use_explicit_typing=True,
    enable_weight_streaming=True,
)

# Warm up for 3 iterations
_ = time_generate(trt_model, input_tensors, osl, 3)

Running with Automatic Budget Size

Once the enable_weight_streaming compile option is specified, an automatic budget size is configured. This automatically determined size may not always provide the optimal solution because it has no knowledge of the user's specific memory constraints and usage patterns.

# Weight streaming context to get current weight budget information
weight_streaming_ctx = torch_tensorrt.runtime.weight_streaming(trt_model)
# Measure the mean latency of the model with weight streaming
mean_latency = time_generate(trt_model, input_tensors, osl, 1)
# Calculate the percentage of current weight budget used
weight_budget_pct = (
    weight_streaming_ctx.device_budget / weight_streaming_ctx.total_device_budget * 100
)
print(
    f"Set weight streaming budget as {weight_budget_pct}%. {weight_streaming_ctx.device_budget} bytes out of {weight_streaming_ctx.total_device_budget}. mean latency = {mean_latency} ms"
)
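
If the budget should reflect the memory actually free on the GPU, a minimal sketch (not part of the original example) can cap the streaming budget at a fraction of the free memory reported by torch.cuda.mem_get_info. The 0.5 fraction below is purely illustrative and leaves headroom for activations and other buffers.

# Hypothetical refinement (a sketch, not part of the original example):
# cap the streaming budget at half of the currently free GPU memory.
free_bytes, _total_bytes = torch.cuda.mem_get_info(DEVICE)
with torch_tensorrt.runtime.weight_streaming(trt_model) as capped_ctx:
    capped_ctx.device_budget = min(
        capped_ctx.total_device_budget, int(free_bytes * 0.5)
    )
    capped_latency = time_generate(trt_model, input_tensors, osl, 1)
print(f"Capped-budget mean latency = {capped_latency} ms")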

Running with Weight Streaming Context Manager

The weight streaming budget can be limited by using the weight streaming context manager. The permissible range for the budget size is from 0 to ctx.total_device_budget. A value of 0 gives maximum memory savings by keeping the minimum amount of weights on the device, while a value equal to ctx.total_device_budget disables weight streaming entirely. If multiple TRT engines are created, the budget is distributed proportionally among them.

# Use a context manager for weight streaming
with torch_tensorrt.runtime.weight_streaming(trt_model) as weight_streaming_ctx:
    # Get the total size of streamable weights in the engine
    streamable_budget = weight_streaming_ctx.total_device_budget

    # Scenario 1: Automatic weight streaming budget
    # Get the automatically determined weight streaming budget
    requested_budget = weight_streaming_ctx.get_automatic_weight_streaming_budget()
    # Set the device budget to the automatically determined value
    weight_streaming_ctx.device_budget = requested_budget
    # Measure the mean latency with automatic budget
    mean_latency = time_generate(trt_model, input_tensors, osl, 1)
    # Calculate the percentage of the weight budget used
    weight_budget_pct = (
        weight_streaming_ctx.device_budget
        / weight_streaming_ctx.total_device_budget
        * 100
    )
    print(
        f"Set auto weight streaming budget as {weight_budget_pct}%. {weight_streaming_ctx.device_budget} bytes out of {weight_streaming_ctx.total_device_budget}. mean latency = {mean_latency} ms"
    )

    # Scenario 2: Manual 10% weight streaming budget
    # Set the budget to 10% of the total streamable weights
    requested_budget = int(streamable_budget * 0.1)
    weight_streaming_ctx.device_budget = requested_budget
    # Measure the mean latency with 10% budget
    mean_latency = time_generate(trt_model, input_tensors, osl, 1)
    # Calculate the percentage of the weight budget used
    weight_budget_pct = (
        weight_streaming_ctx.device_budget
        / weight_streaming_ctx.total_device_budget
        * 100
    )
    print(
        f"Set weight streaming budget as {weight_budget_pct}%. {weight_streaming_ctx.device_budget} bytes out of {weight_streaming_ctx.total_device_budget}. mean latency = {mean_latency} ms"
    )
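
As a final comparison (a sketch, not part of the original example), the budget can be raised to the full streamable size, which, as noted above, disables weight streaming entirely. This assumes the GPU has enough free memory to hold all of the weights at once.

# Scenario 3 (hypothetical sketch): disable weight streaming by granting the
# full streamable budget, assuming all weights fit in GPU memory.
with torch_tensorrt.runtime.weight_streaming(trt_model) as weight_streaming_ctx:
    weight_streaming_ctx.device_budget = weight_streaming_ctx.total_device_budget
    mean_latency = time_generate(trt_model, input_tensors, osl, 1)
    print(
        f"Weight streaming disabled (100% budget). mean latency = {mean_latency} ms"
    )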
