TensorRT-LLM 0.5.0 源码之二十四

cudaUtils

// workspace for cublas gemm : 32MB#define CUBLAS_WORKSPACE_SIZE 33554432typedef struct __align__(4){    half x, y, z, w;} half4;

这段代码定义了一个名为 half4 的数据类型，它是一个包含四个 half 类型分量且整体按4字节对齐的结构体。下面我们来详细解析它的各个部分。

结构体成员

• 结构体包含四个成员：x, y, z, w，它们都是 half 类型。
• half 类型：这是一种半精度浮点数，通常占用 16 位（2 字节） 存储空间。它的数值范围和精度低于标准的单精度浮点数（float），但能节省内存和带宽，在对精度要求不极端的场景下非常有用，常见于图形编程和移动设备上的着色器。
• 因此，这个结构体四个成员的理论总大小是 4 × 2 字节 = 8 字节。

内存对齐说明

• __align__(4)：这个关键字（或类似语法，如 __attribute__((aligned(4))) 或 _Alignas(4)，取决于编译器）指示编译器将此结构体实例在内存中的起始地址安排在 4 字节的整数倍 上。
• 对齐的重要性：现代计算机的CPU对内存的读写操作在数据位于特定地址边界（通常是数据大小的整数倍）时效率最高。如果数据没有正确对齐，在某些架构（如ARM）上可能导致性能下降甚至运行错误。

结构体大小与内存布局

虽然四个 half 成员本身只占 8 字节，但 __align__(4) 的对齐要求会影响整个结构体的大小和布局。

• 结构体的最终大小通常会是其成员中最大对齐要求的整数倍，这主要是为了确保在定义结构体数组时，每一个元素都能正确对齐。
• 在这个例子中，由于每个 half 的大小是2字节，并且指定了4字节对齐，编译器可能会在成员之间或结构体末尾添加填充字节（Padding）来满足对齐要求。最终 sizeof(half4) 很可能仍然是 8 字节，因为 8 字节已经是 4 字节的整数倍。但在更复杂的结构体中，填充字节可能会更明显。

主要用途

• 数据封装：将四个相关的半精度浮点数（例如，表示一个颜色RGBA或一个四维向量）逻辑上捆绑在一起，方便统一管理。
• 性能优化：通过指定对齐方式，可以确保在特定硬件（如某些DSP或GPU）上获得高效的内存访问速度。
• 空间与精度平衡：使用 half 类型在保证一定精度的同时，相比使用四个 float（共16字节）能显著节省存储空间和传输带宽，这在处理大量数据（如顶点数据、纹理）时尤为重要。

注意事项

• 平台差异：half 类型的精度和支持程度可能因平台和编译器而异。在一些PC平台上，它可能仍被当作 float 处理。
• 语法变体：__align__(4) 是编译器相关的扩展语法。在C11标准中，更便携的关键字是 _Alignas 或头文件 stdalign.h 中定义的 alignas。

/* **************************** type definition ***************************** */enum CublasDataType{    FLOAT_DATATYPE = 0,    HALF_DATATYPE = 1,    BFLOAT16_DATATYPE = 2,    INT8_DATATYPE = 3,    FP8_DATATYPE = 4};enum TRTLLMCudaDataType{    FP32 = 0,    FP16 = 1,    BF16 = 2,    INT8 = 3,    FP8 = 4};enum class OperationType{    FP32,    FP16,    BF16,    INT8,    FP8};

/* **************************** debug tools ********************************* */static const char* _cudaGetErrorEnum(cudaError_t error){    return cudaGetErrorString(error);}static const char* _cudaGetErrorEnum(cublasStatus_t error){    switch (error)    {    case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";    case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";    case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";    case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";    case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";    case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";    case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";    case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";    case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";    }    return "<unknown>";}template <typename T>void check(T result, char const* const func, const char* const file, int const line){    if (result)    {        throw TllmException(            file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(result)));    }}#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)#define check_cuda_error_2(val, file, line) check((val), #val, file, line)inline bool isCudaLaunchBlocking(){    static bool firstCall = true;    static bool result = false;    if (firstCall)    {        const char* env = std::getenv("CUDA_LAUNCH_BLOCKING");        result = env != nullptr && std::string(env) == "1";        firstCall = false;    }    return result;}inline void syncAndCheck(const char* const file, int const line){#ifndef NDEBUG    const bool checkError = true;#else    const bool checkError = isCudaLaunchBlocking();#endif    if (checkError)    {        cudaError_t result = cudaDeviceSynchronize();        check(result, "cudaDeviceSynchronize", file, line);    }}#define sync_check_cuda_error() tensorrt_llm::common::syncAndCheck(__FILE__, __LINE__)

/* * Macros compliant with TensorRT coding conventions */#define TLLM_CUDA_CHECK(stat)                                                                                          \    do                                                                                                                 \    {                                                                                                                  \        tensorrt_llm::common::check((stat), #stat, __FILE__, __LINE__);                                                \    } while (0)

#define PRINT_FUNC_NAME_()                                                                                             \    do                                                                                                                 \    {                                                                                                                  \        std::cout << "[TensorRT-LLM][CALL] " << __FUNCTION__ << " " << std::endl;                                      \    } while (0)

template<typename T> struct packed_type;template <>          struct packed_type<float>         { using type = float; }; // we don't need to pack float by defaulttemplate <>          struct packed_type<half>          { using type = half2; };#ifdef ENABLE_BF16template<>struct packed_type<__nv_bfloat16> {    using type = __nv_bfloat162;};#endif#ifdef ENABLE_FP8template<>struct packed_type<__nv_fp8_e4m3> {    using type = __nv_fp8x2_e4m3;};#endif

template<typename T> struct num_elems;template <>          struct num_elems<float>           { static constexpr int value = 1; };template <>          struct num_elems<float2>          { static constexpr int value = 2; };template <>          struct num_elems<float4>          { static constexpr int value = 4; };template <>          struct num_elems<half>            { static constexpr int value = 1; };template <>          struct num_elems<half2>           { static constexpr int value = 2; };#ifdef ENABLE_BF16template <>          struct num_elems<__nv_bfloat16>   { static constexpr int value = 1; };template <>          struct num_elems<__nv_bfloat162>  { static constexpr int value = 2; };#endif#ifdef ENABLE_FP8template <>          struct num_elems<__nv_fp8_e4m3>   { static constexpr int value = 1; };template <>          struct num_elems<__nv_fp8x2_e4m3>  { static constexpr int value = 2; };#endif

template<typename T, int num> struct packed_as;template<typename T>          struct packed_as<T, 1>              { using type = T; };template<>                    struct packed_as<half,  2>          { using type = half2; };template<>                    struct packed_as<float,  2>         { using type = float2; };template<>                    struct packed_as<int8_t, 2>         { using type = int16_t; };template<>                    struct packed_as<int32_t, 2>        { using type = int2; };template<>                    struct packed_as<half2, 1>          { using type = half; };template<>                    struct packed_as<float2, 1>         { using type = float; };#ifdef ENABLE_BF16template<> struct packed_as<__nv_bfloat16,  2> { using type = __nv_bfloat162; };template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16;  };#endif#ifdef ENABLE_FP8template<> struct packed_as<__nv_fp8_e4m3,  2> { using type = __nv_fp8x2_e4m3; };template<> struct packed_as<__nv_fp8x2_e4m3, 1> { using type = __nv_fp8_e4m3;  };template<> struct packed_as<__nv_fp8_e5m2,  2> { using type = __nv_fp8x2_e5m2; };template<> struct packed_as<__nv_fp8x2_e5m2, 1> { using type = __nv_fp8_e5m2;  };#endif

inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }inline __device__ float2 operator+(float2 a, float2 b) { return make_float2(a.x + b.x, a.y + b.y); }inline __device__ float2 operator-(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }inline __device__ float2 operator*(float2 a, float  b) { return make_float2(a.x * b, a.y * b); }inline __device__ float2 operator+(float2 a, float  b) { return make_float2(a.x + b, a.y + b); }inline __device__ float2 operator-(float2 a, float  b) { return make_float2(a.x - b, a.y - b); }

template <typename T>struct CudaDataType{};template <>struct CudaDataType<float>{    static constexpr cudaDataType_t value = cudaDataType::CUDA_R_32F;};template <>struct CudaDataType<half>{    static constexpr cudaDataType_t value = cudaDataType::CUDA_R_16F;};#ifdef ENABLE_BF16template <>struct CudaDataType<__nv_bfloat16>{    static constexpr cudaDataType_t value = cudaDataType::CUDA_R_16BF;};#endif

inline int getSMVersion(){    int device{-1};    check_cuda_error(cudaGetDevice(&device));    int sm_major = 0;    int sm_minor = 0;    check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));    check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));    return sm_major * 10 + sm_minor;}

• 函数签名inline int getSMVersion()：inline关键字建议编译器将函数体直接展开在调用处以避免函数调用开销，适用于短小且频繁使用的函数。返回值是一个整数，表示计算能力（Compute Capability），格式为主版本号 × 10 + 次版本号。例如，若设备计算能力为7.5，则返回75
• 用途：计算能力版本决定了GPU支持的硬件功能（如线程块大小、内存层次等），常用于条件编译或运行时优化（例如，为不同架构选择不同的内核代码）

inline int getDevice(){    int current_dev_id = 0;    check_cuda_error(cudaGetDevice(&current_dev_id));    return current_dev_id;}

inline int getDeviceCount(){    int count = 0;    check_cuda_error(cudaGetDeviceCount(&count));    return count;}

/// Get the memory info/// \return The free and total amount of memory in bytesinline std::tuple<size_t, size_t> getDeviceMemoryInfo(){    size_t free, total;    check_cuda_error(cudaMemGetInfo(&free, &total));    return {free, total};}

inline int getMultiProcessorCount(){    int device_id;    int multi_processor_count;    check_cuda_error(cudaGetDevice(&device_id));    check_cuda_error(cudaDeviceGetAttribute(&multi_processor_count, cudaDevAttrMultiProcessorCount, device_id));    return multi_processor_count;}

inline int divUp(int a, int n){    return (a + n - 1) / n;}

template <typename T, typename U, typename = std::enable_if_t<std::is_integral<T>::value>,    typename = std::enable_if_t<std::is_integral<U>::value>>auto constexpr ceilDiv(T numerator, U denominator){    return (numerator + denominator - 1) / denominator;}

这段C++代码定义了一个名为ceilDiv的函数模板，用于执行向上取整的整数除法。它通过模板元编程技术对参数类型进行了约束，并确保函数能在编译期计算。下面我们来详细解析它的各个组成部分。

模板参数与类型约束

代码中的模板参数列表使用了std::enable_if_t来施加类型约束，这是实现SFINAE（Substitution Failure Is Not An Error）的常见手法。

template <typename T, typename U,          typename = std::enable_if_t<std::is_integral<T>::value>,          typename = std::enable_if_t<std::is_integral<U>::value>>

• std::is_integral<T>::value：这是一个类型特质（type trait），在编译期检查类型T是否为整型（如int, long, char等）。如果是，其value成员为true；否则为false。
• std::enable_if_t<Condition>：它是std::enable_if<Condition>::type的别名模板（C++14引入）。当Condition为true时，std::enable_if_t代表一个有效的类型（默认为void）；当Condition为false时，它不产生任何类型，导致模板参数替换失败。
• SFINAE效果：这两个默认模板参数共同作用，确保了编译器只有在T和U都是整型时，才会选择实例化这个模板函数。如果传入浮点数等非整型参数，编译将会失败。

函数声明与实现

auto constexpr ceilDiv(T numerator, U denominator){    return (numerator + denominator - 1) / denominator;}

• auto 返回类型：使用auto让编译器自动推导返回类型。返回类型将是表达式(numerator + denominator - 1) / denominator的结果类型，通常是T和U在经过算术转换后的公共整数类型。
• constexpr 函数说明符：这表明该函数是一个常量表达式函数。当传入的参数是编译期常量时，整个计算过程可以在编译期完成，结果直接作为常量嵌入代码中，有助于提升运行时效率。例如，ceilDiv(10, 3)会在编译期直接计算出结果4。
• 向上取整的算法：公式 (numerator + denominator - 1) / denominator 是整数运算中实现向上取整的经典技巧。

• 当 numerator 能被 denominator 整除时，例如 numerator=10, denominator=5：(10 + 5 - 1) / 5 = 14 / 5 = 2，结果正确。
• 当不能整除时，例如 numerator=10, denominator=3：(10 + 3 - 1) / 3 = 12 / 3 = 4，而 10 / 3 的整数除法结果是 3，向上取整正好是 4。

应用场景与注意事项

这种向上取整的除法在计算分页数量、数组块分配等场景中非常有用。使用此函数模板时需要注意：

• 它只接受整型参数，这是通过模板约束强制实现的。
• 函数被设计为constexpr，适合用于编译期需要常量表达式的上下文，如数组大小定义、模板非类型参数等。
• 需要注意**分母为零（zero denominator）**的情况，因为这会引发未定义行为。在实际使用中，应确保denominator不为零。

总结

这个ceilDiv函数模板巧妙地结合了模板元编程（通过std::enable_if_t和std::is_integral进行类型约束）、常量表达式计算（constexpr）和高效的算术算法，提供了一个类型安全且性能优越的向上取整除法工具。

template <typename T>void printAbsMean(const T* buf, uint64_t size, cudaStream_t stream, std::string name = ""){    if (buf == nullptr)    {        TLLM_LOG_WARNING("%s is an nullptr, skip!", name.c_str());        return;    }    cudaDeviceSynchronize();    check_cuda_error(cudaGetLastError());    T* h_tmp = new T[size];    cudaMemcpyAsync(h_tmp, buf, sizeof(T) * size, cudaMemcpyDeviceToHost, stream);    cudaDeviceSynchronize();    check_cuda_error(cudaGetLastError());    double sum = 0.0f;    uint64_t zero_count = 0;    float max_val = -1e10;    bool find_inf = false;    for (uint64_t i = 0; i < size; i++)    {        if (std::isinf((float) (h_tmp[i])))        {            find_inf = true;            continue;        }        sum += abs((double) h_tmp[i]);        if ((float) h_tmp[i] == 0.0f)        {            zero_count++;        }        max_val = max_val > abs(float(h_tmp[i])) ? max_val : abs(float(h_tmp[i]));    }    TLLM_LOG_INFO("%20s size: %u, abs mean: %f, abs sum: %f, abs max: %f, find inf: %s", name.c_str(), size, sum / size,        sum, max_val, find_inf ? "true" : "false");    delete[] h_tmp;    cudaDeviceSynchronize();    check_cuda_error(cudaGetLastError());}

template <typename T>void printToStream(const T* result, const int size, FILE* strm){    const bool split_rows = (strm == stdout);    if (result == nullptr)    {        TLLM_LOG_WARNING("It is an nullptr, skip! \n");        return;    }    T* tmp = reinterpret_cast<T*>(malloc(sizeof(T) * size));    check_cuda_error(cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost));    for (int i = 0; i < size; ++i)    {        fprintf(strm, "%f, ", static_cast<float>(tmp[i]));        if (split_rows && ((i + 1) % 10) == 0)            fprintf(strm, "\n");    }    if (!split_rows || (size % 10) != 0)    {        fprintf(strm, "\n");    }    free(tmp);}template <typename T>void printToScreen(const T* result, const int size){    printToStream(result, size, stdout);}template <typename T>void print2dToStream(const T* result, const int r, const int c, const int stride, FILE* strm){    if (result == nullptr)    {        TLLM_LOG_WARNING("It is an nullptr, skip! \n");        return;    }    for (int ri = 0; ri < r; ++ri)    {        const T* ptr = result + ri * stride;        printToStream(ptr, c, strm);    }    fprintf(strm, "\n");}template <typename T>void print2dToScreen(const T* result, const int r, const int c, const int stride){    print2dToStream(result, r, c, stride, stdout);}

inline void print_float_(float x){    printf("%7.3f ", x);}inline void print_element_(float x){    print_float_(x);}inline void print_element_(half x){    print_float_((float) x);}#ifdef ENABLE_BF16inline void print_element_(__nv_bfloat16 x){    print_float_((float) x);}#endifinline void print_element_(uint32_t ul){    printf("%7" PRIu32, ul);}inline void print_element_(uint64_t ull){    printf("%7" PRIu64, ull);}inline void print_element_(int32_t il){    printf("%7" PRId32, il);}inline void print_element_(int64_t ill){    printf("%7" PRId64, ill);}template <typename T>inline void printMatrix(const T* ptr, int m, int k, int stride, bool is_device_ptr){    T* tmp;    if (is_device_ptr)    {        // k < stride ; stride = col-dimension.        tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));        check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));        cudaDeviceSynchronize();    }    else    {        tmp = const_cast<T*>(ptr);    }    for (int ii = -1; ii < m; ++ii)    {        if (ii >= 0)        {            printf("%07d ", ii);        }        else        {            printf("   ");        }        for (int jj = 0; jj < k; jj += 1)        {            if (ii >= 0)            {                print_element_(tmp[ii * stride + jj]);            }            else            {                printf("%7d ", jj);            }        }        printf("\n");    }    if (is_device_ptr)    {        free(tmp);    }}template void printMatrix(const float* ptr, int m, int k, int stride, bool is_device_ptr);template void printMatrix(const half* ptr, int m, int k, int stride, bool is_device_ptr);#ifdef ENABLE_BF16template void printMatrix(const __nv_bfloat16* ptr, int m, int k, int stride, bool is_device_ptr);#endiftemplate void printMatrix(const uint32_t* ptr, int m, int k, int stride, bool is_device_ptr);template void printMatrix(const uint64_t* ptr, int m, int k, int stride, bool is_device_ptr);template void printMatrix(const int* ptr, int m, int k, int stride, bool is_device_ptr);

代码 printf("%7" PRId64, ill) 是C/C++语言中一种用于格式化输出64位有符号整数的跨平台写法。下面我来详细解释它的各个部分。

代码含义详解

1. int64_t 类型代码中的变量 ill 应该是 int64_t 类型（从格式符推断）。这是在 stdint.h（C语言）或 cstdint（C++）头文件中定义的固定宽度整数类型，保证在所有平台上都恰好占用64位（8字节）。使用这种类型可以确保代码在不同平台上有明确的数据长度。
2. 格式说明符 %7" PRId64这是整个格式字符串的核心，由两部分拼接而成：

• %7：这是一个字段宽度说明符。数字 7 指定了输出值的最小宽度为7个字符。如果实际数值的位数少于7位，输出会在左侧（默认右对齐）用空格填充以达到指定宽度；如果数值位数超过7位，则会按实际宽度完整输出。
• PRId64：这是一个在 inttypes.h（C语言）或 cinttypes（C++）中定义的宏。它的主要作用是提供一个与 int64_t 类型匹配的格式化符号（如 d），并确保跨平台兼容性。在预编译阶段，PRId64 会根据当前编译环境被展开为特定的字符串。例如，在64位Linux系统上，它通常被定义为 "ld"；而在32位系统或Windows环境下，则可能被定义为 "lld" 或 "I64d"。

3. 字符串字面量的拼接C/C++语法允许将多个用空格分隔的字符串字面量自动连接成一个字符串。因此，"%7" 和宏展开后的 PRId64 会在编译前合并。例如，在64位Linux系统上，"%7" PRId64 最终会组合成字符串 "%7ld"。
4. 整行代码的效果所以，printf("%7" PRId64, ill) 这行代码的最终目的是：以至少7个字符的宽度，右对齐打印一个64位有符号整数 ill。

使用注意事项与最佳实践

1. 包含正确的头文件为了使用 PRId64 和 int64_t，你需要在源代码中包含相应的头文件。

• 在C语言中：#include <inttypes.h> 和 #include <stdint.h>。
• 在C++中：推荐使用 #include <cinttypes> 和 #include <cstdint>。

2. C++中的特殊处理在某些C编译标准下（如C11之前），使用 PRId64 可能需要先定义宏 __STDC_FORMAT_MACROS 以启用这些格式宏。最稳妥的做法是：

• 使用C++11或更新标准进行编译。

• 或者在包含头文件之前定义该宏：

#define __STDC_FORMAT_MACROS#include <cinttypes>// 或者 #include <inttypes.h>

3. 正确的参数类型确保传递给 printf 的变量 ill 的类型确实是 int64_t（或等价的 long long, long，这取决于平台和typedef）。类型不匹配可能导致未定义行为。

一个简单的例子

#include <stdio.h>#include <stdint.h>   // 提供 int64_t 的定义#include <inttypes.h> // 提供 PRId64 的定义int main() {    int64_t my_number = -12345;    printf("The value is: '%7" PRId64 "'\n", my_number);    return 0;}

假设在64位系统上，PRId64 展开为 "ld"，输出可能类似于：

The value is: '  -12345'

（注意数字 -12345 连负号共6个字符，因此在左侧填充了一个空格以达到总共7个字符的宽度。）

cudaAllocator

#include <cuda_runtime.h>class CudaAllocator : public IAllocator{public:explicit CudaAllocator(runtime::BufferManager bufferManager);    ~CudaAllocator() override = default;void free(void** ptr) override;protected:bool contains(void const* ptr) const override{        return mPointerMapping.find(ptr) != mPointerMapping.end();    }    ReallocType reallocType(void const* ptr, size_t size) const override;void* malloc(size_t size, bool setZero) override;void memSet(void* ptr, int val, size_t size) override;private:    runtime::BufferManager mBufferManager;    std::unordered_map<void const*, runtime::BufferManager::IBufferPtr> mPointerMapping{};};

using namespace tensorrt_llm::common;namespace tr = tensorrt_llm::runtime;CudaAllocator::CudaAllocator(tr::BufferManager bufferManager)    : mBufferManager(std::move(bufferManager)){}

ReallocType CudaAllocator::reallocType(void const* ptr, size_t size) const{    TLLM_CHECK(contains(ptr));    auto const currentSize = mPointerMapping.at(ptr)->getSize();    TLLM_LOG_DEBUG("current_buffer_size: %d, original buffer: %p, new buffer: %d", currentSize, ptr, size);    if (currentSize < size)    {        return ReallocType::INCREASE;    }    else if (currentSize == size)    {        return ReallocType::REUSE;    }    else    {        return ReallocType::DECREASE;    }}

void* CudaAllocator::malloc(std::size_t size, bool const setZero){    TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);    auto bufferPtr = mBufferManager.gpu(size);    if (setZero)    {        mBufferManager.setZero(*bufferPtr);    }    void* ptr{bufferPtr->data()};    TLLM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);    mPointerMapping.insert({ptr, std::move(bufferPtr)});    return ptr;}void CudaAllocator::free(void** ptr){    TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);    mPointerMapping.erase(*ptr);    *ptr = nullptr;}void CudaAllocator::memSet(void* ptr, int const val, size_t const size){    check_cuda_error(cudaMemsetAsync(ptr, val, size, mBufferManager.getStream().get()));}

参考文献

• https://github.com/NVIDIA/TensorRT-LLM/blob/release/0.5.0/cpp/tensorrt_llm/common/cudaAllocator.h
• https://github.com/NVIDIA/TensorRT-LLM/blob/release/0.5.0/cpp/tensorrt_llm/common/cudaUtils.h

点个「赞」+「在看」❤️

让我们知道这份文字有温暖到你，也是我们持续创作的最大动力！