```PyTorch 自定义算子```

原文

Introduction

Using PyTorch custom operations is common in PyTorch models. PyTorch custom operations can be custom classes and custom functions implemented in C++ and CUDA and used in both Python and C++ inference programs.

In this blog post, I would like to share how to implement PyTorch custom operations in C++ and CUDA, and how to use them in PyTorch models and AOTInductor compiled inference programs, using a simple identity convolution example.

PyTorch Custom Function

PyTorch custom functions can be implemented in C++ and CUDA and registered using the TORCH_LIBRARY_IMPL macro. Both the CPU and CUDA implementations can be provided, and PyTorch will dispatch to the correct implementation based on the device of the input tensors.

custom_ops.cpp




torch::Tensor identity_conv_cpu_impl(const torch::Tensor& input)
{
    TORCH_CHECK(!input.is_cuda(),
                "identity_conv_cpu_impl: input must be a CPU tensor");
    return input.clone();
}




torch::Tensor identity_conv_cuda_impl(const torch::Tensor& input)
{
    TORCH_CHECK(input.is_cuda(),
                "identity_conv_cuda_impl: input must be a CUDA tensor");

    
    auto output = torch::empty_like(input);
    const int64_t numel = input.numel();

    if (numel == 0)
        return output;

    
    const int ndim = input.dim();
    const auto opts =
        torch::TensorOptions().dtype(torch::kInt64).device(input.device());
    const auto shape_dev = torch::tensor(
        std::vector<int64_t>(input.sizes().begin(), input.sizes().end()), opts);
    const auto strides_dev = torch::tensor(
        std::vector<int64_t>(input.strides().begin(), input.strides().end()),
        opts);

    constexpr int kThreads = 256;
    const int blocks = static_cast<int>((numel + kThreads - 1) / kThreads);

    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(),
        "identity_conv_cuda_impl",
        [&]()
        {
            identity_kernel<scalar_t><<<blocks, kThreads>>>(
                input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
                shape_dev.data_ptr<int64_t>(), strides_dev.data_ptr<int64_t>(),
                ndim, numel);
        });

    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}

custom_op_registration.cpp


TORCH_LIBRARY_IMPL(my_ops, CUDA, m)
{
    m.impl("identity_conv_op", identity_conv_cuda_impl);
}


TORCH_LIBRARY_IMPL(my_ops, CPU, m)
{
    m.impl("identity_conv_op", identity_conv_cpu_impl);
}

PyTorch Custom Class

PyTorch custom functions are stateless and cannot hold any parameters. If we would like to implement a custom class that holds some parameters and has a forward() method that can be called from Python, we can use torch::CustomClassHolder to define a custom class in C++ and register it with TORCH_LIBRARY macro.

custom_class.cpp












struct IdentityConvClass : torch::CustomClassHolder
{
    int64_t channels_;

    explicit IdentityConvClass(int64_t channels) : channels_(channels) {}

    torch::Tensor forward(const torch::Tensor& x)
    {
        return x.is_cuda() ? identity_conv_cuda_impl(x)
                           : identity_conv_cpu_impl(x);
    }

    int64_t get_channels() const { return channels_; }
};

custom_class_registration.cpp








TORCH_LIBRARY(my_ops, m)
{
    
    
    m.class_<IdentityConvClass>("IdentityConvClass")
        .def(torch::init<int64_t>())
        .def("forward", &IdentityConvClass::forward)
        .def("get_channels", &IdentityConvClass::get_channels)
        
        
        
        
        
        
        .def("__obj_flatten__",
             [](const c10::intrusive_ptr<IdentityConvClass>& self)
             {
                 return std::make_tuple(
                     std::make_tuple(std::string("channels"), self->channels_));
             })
        
        .def_pickle(
            [](const c10::intrusive_ptr<IdentityConvClass>& self) -> int64_t
            { return self->channels_; },
            [](int64_t channels) -> c10::intrusive_ptr<IdentityConvClass>
            { return c10::make_intrusive<IdentityConvClass>(channels); });

    
    m.def("identity_conv_op(Tensor x) -> Tensor");
}

Using Custom Operations and Classes In PyTorch

The PyTorch custom classes, functions, and their registrations in C++ are built into a shared library (libidentity_conv_ops.so) that can be loaded and registered in PyTorch using torch.ops.load_library. For torch.compile and torch.export compatibility, we also need to register “fake” (abstract) versions of the custom classes and functions in PyTorch using @register_fake_class and @torch.library.register_fake so that the FakeTensor-based symbolic tracing can work correctly without having to execute the actual C++/CUDA code during tracing.

custom_ops.py

"""
custom_ops.py
=============
Loads the C++ / CUDA shared library and sets up all custom PyTorch operations
used by the IdentityModel:

  1. torch.classes.my_ops.IdentityConvClass  (registered by the shared library)
     - A fake/abstract version is registered here so that torch.export can
       trace through module attributes that hold an instance of this class.

  2. my_ops::identity_conv_op  (schema + CPU + CUDA registered by the shared library)
     - register_fake: abstract implementation for torch.export / FakeTensor.
"""

import os

import torch
import torch.library











_default_lib = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "..", "ext",
    "libidentity_conv_ops.so")
_lib_path = os.path.abspath(
    os.environ.get("IDENTITY_CONV_OPS_LIB", _default_lib))
torch.ops.load_library(_lib_path)











from torch._library.fake_class_registry import register_fake_class


@register_fake_class("my_ops::IdentityConvClass")
class FakeIdentityConvClass:
    """Abstract counterpart of IdentityConvClass used during torch.export."""

    def __init__(self, channels: int) -> None:
        self.channels_ = channels

    
    def __obj_flatten__(self):
        
        
        return (("channels", self.channels_), )

    @classmethod
    def __obj_unflatten__(cls, flat):
        
        
        return cls(dict(flat)["channels"])

    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        
        return torch.empty_like(x)

    def get_channels(self) -> int:
        return self.channels_











@torch.library.register_fake("my_ops::identity_conv_op")
def _identity_conv_op_fake(x: torch.Tensor) -> torch.Tensor:
    """Abstract implementation used by torch.export / FakeTensor tracing."""
    return torch.empty_like(x)



identity_conv_op = torch.ops.my_ops.identity_conv_op

PyTorch custom classes can be loaded using torch.classes and PyTorch custom functions can be loaded using torch.ops after the shared library is loaded.

model.py

"""
model.py
========
Defines the four-layer IdentityModel used in the AOTInductor demo.

Layer layout
------------
  layer1 : IdentityConv           - native PyTorch operators
  layer2 : IdentityConvCustomClass - torch.classes  C++/CUDA custom class
  layer3 : IdentityConvCustomOp   - torch.library.custom_op  C++/CUDA op
  layer4 : IdentityConv           - native PyTorch operators

Every layer is an identity transformation, so model(x) == x for any input x.
"""

import torch
import torch.nn as nn



from custom_ops import identity_conv_op  





class IdentityConv(nn.Module):
    """Identity convolution implemented with native PyTorch operators.

    Uses a depthwise Conv2d with kernel_size=1 and weight=1.0, which is
    equivalent to a no-op (output == input).  This layer is compatible with
    torch.export and AOTInductor out of the box.
    """

    def __init__(self, channels: int) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=(1, 1),
            stride=(1, 1),
            padding=(0, 0),
            dilation=(1, 1),
            groups=channels,
            bias=False,
        )
        
        self.conv.weight.data = torch.ones(channels, 1, 1, 1)
        
        self.conv.weight.requires_grad = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.conv(x)





class IdentityConvCustomClass(nn.Module):
    """Identity convolution backed by a torch.classes C++/CUDA custom class.

    At runtime the forward call is dispatched to the CUDA kernel registered
    inside IdentityConvClass (csrc/identity_conv.cpp + .cu).

    For torch.export compatibility a FakeIdentityConvClass is registered in
    custom_ops.py via @register_fake_class so that symbolic tracing works.
    """

    def __init__(self, channels: int) -> None:
        super().__init__()
        self.obj = torch.classes.my_ops.IdentityConvClass(channels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.obj.forward(x)





class IdentityConvCustomOp(nn.Module):
    """Identity convolution backed by a torch.library.custom_op C++/CUDA op.

    The op (my_ops::identity_conv_op) is defined in custom_ops.py with:
      • a register_fake  implementation for torch.export tracing
      • a register_kernel("cuda") implementation that calls the CUDA kernel
    """

    def __init__(self, channels: int) -> None:
        super().__init__()
        self.channels = channels

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return identity_conv_op(x)





class IdentityModel(nn.Module):
    """Four-layer identity model for AOTInductor demo."""

    def __init__(self, channels: int) -> None:
        super().__init__()
        self.layer1 = IdentityConv(channels)
        self.layer2 = IdentityConvCustomClass(channels)
        self.layer3 = IdentityConvCustomOp(channels)
        self.layer4 = IdentityConv(channels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x


def create_model(channels: int = 3) -> IdentityModel:
    """Return an IdentityModel in eval mode on the default CUDA device."""
    return IdentityModel(channels=channels).cuda().eval()

PyTorch Model Export and Lowering

The PyTorch model using custom classes and custom functions can be exported with torch.export if fake (abstract) versions of all custom classes and functions are registered for torch.export symbolic tracing.

export_compile.py


"""
export_compile.py
=================
Exports the IdentityModel with torch.export and compiles it with
torch._inductor.aoti_compile_and_package.

The resulting package (model.pt2) is written to the artifacts/ directory
and can be loaded by both run_inference.py (Python) and the C++ inference
binary.

Usage (run from the python/ directory):
    python export_compile.py
"""

import os
import sys


sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import torch


import custom_ops  
from model import create_model




CHANNELS = 3
BATCH_SIZE = 1
HEIGHT = 224
WIDTH = 224


PACKAGE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..",
                            "artifacts", "model.pt2")


def main() -> None:
    print("=" * 64)
    print("AOTInductor  -  Export & Compile")
    print("=" * 64)

    
    
    
    print(f"\n[1/4] Creating IdentityModel  (channels={CHANNELS}) ...")
    model = create_model(channels=CHANNELS)

    x = torch.randn(BATCH_SIZE,
                    CHANNELS,
                    HEIGHT,
                    WIDTH,
                    device="cuda",
                    dtype=torch.float32)

    with torch.no_grad():
        out = model(x)

    assert torch.equal(
        x, out), (f"Eager pre-export check FAILED  "
                  f"(max diff = {(x - out).abs().max().item():.2e})")
    print("  Eager verification  PASSED  (bitwise identical)")

    
    
    
    print("\n[2/4] Exporting model with torch.export.export() ...")
    with torch.no_grad():
        exported_program = torch.export.export(model, (x, ))
    print("  Export  DONE")
    print(f"\n  Exported graph:\n{exported_program.graph}")

    
    
    
    print(
        "\n[3/4] Compiling with torch._inductor.aoti_compile_and_package ...")
    package_path = torch._inductor.aoti_compile_and_package(
        exported_program,
        package_path=PACKAGE_PATH,
    )
    print(f"  Compilation  DONE")
    print(f"  Package saved to: {os.path.abspath(package_path)}")

    
    
    
    print(
        "\n[4/4] Quick sanity check: loading package and running inference ..."
    )
    compiled_model = torch._inductor.aoti_load_package(package_path)
    with torch.no_grad():
        out_compiled = compiled_model(x)

    
    if isinstance(out_compiled, (list, tuple)):
        out_compiled = out_compiled[0]

    assert torch.equal(x, out_compiled), (
        f"Compiled model sanity check FAILED  "
        f"(max diff = {(x - out_compiled).abs().max().item():.2e})")
    print("  Compiled model verification  PASSED  (bitwise identical)")

    print("\n" + "=" * 64)
    print(f"SUCCESS!  Package: {os.path.abspath(package_path)}")
    print("=" * 64)


if __name__ == "__main__":
    main()

From the exported graph we can see that the custom class IdentityConvClass.forward is represented as a call to torch.ops.higher_order.call_torchbind. The custom op identity_conv_op is represented as a call to torch.ops.my_ops.identity_conv_op.

graph():
    %p_layer1_conv_weight : [num_users=1] = placeholder[target=p_layer1_conv_weight]
    %p_layer4_conv_weight : [num_users=1] = placeholder[target=p_layer4_conv_weight]
    %obj_layer2_obj : [num_users=1] = placeholder[target=obj_layer2_obj]
    %x : [num_users=1] = placeholder[target=x]
    %conv2d : [num_users=1] = call_function[target=torch.ops.aten.conv2d.default](args = (%x, %p_layer1_conv_weight, None, [1, 1], [0, 0], [1, 1], 3), kwargs = {})
    %call_torchbind : [num_users=1] = call_function[target=torch.ops.higher_order.call_torchbind](args = (%obj_layer2_obj, forward, %conv2d), kwargs = {})
    %identity_conv_op : [num_users=1] = call_function[target=torch.ops.my_ops.identity_conv_op.default](args = (%call_torchbind,), kwargs = {})
    %conv2d_1 : [num_users=1] = call_function[target=torch.ops.aten.conv2d.default](args = (%identity_conv_op, %p_layer4_conv_weight, None, [1, 1], [0, 0], [1, 1], 3), kwargs = {})
    return (conv2d_1,)

The exported program can be compiled and packaged with torch._inductor.aoti_compile_and_package to produce a model.pt2 package that can be loaded by both Python and C++ inference programs. The custom class and custom op implementations will be loaded from the shared library and correctly dispatched at runtime when the compiled model is executed.

run_inference.py


"""
run_inference.py
================
Loads the AOTInductor-compiled IdentityModel package (model.pt2) and runs
inference to verify correctness.

The output of the identity model must equal the input within a tight
floating-point tolerance.

Usage (run from the python/ directory after export_compile.py):
    python run_inference.py [MODEL_PATH [OP_LIB_PATH]]

Arguments:
    MODEL_PATH   Path to the compiled model package (.pt2).
                 Defaults to ../artifacts/model.pt2 relative to this script.
    OP_LIB_PATH  Path to the custom-op shared library (.so).
                 When provided the library path is forwarded to custom_ops.py
                 via the IDENTITY_CONV_OPS_LIB environment variable so that
                 torch.ops.load_library uses that file instead of the default
                 ../ext/libidentity_conv_ops.so.
"""

import os
import sys


sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import torch
import torch._inductor.codecache  




_DEFAULT_PACKAGE_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "..", "artifacts", "model.pt2")

PACKAGE_PATH = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_PACKAGE_PATH
OP_LIB_PATH = sys.argv[2] if len(sys.argv) > 2 else None





if OP_LIB_PATH is not None:
    os.environ["IDENTITY_CONV_OPS_LIB"] = os.path.abspath(OP_LIB_PATH)



import custom_ops  




CHANNELS = 3
BATCH_SIZE = 1
HEIGHT = 224
WIDTH = 224


def main() -> None:
    print("=" * 64)
    print("AOTInductor  -  Python Inference")
    print("=" * 64)

    
    
    
    pkg = os.path.abspath(PACKAGE_PATH)
    if OP_LIB_PATH is not None:
        print(f"  Op library  : {os.path.abspath(OP_LIB_PATH)}")
    print(f"\n[1/3] Loading compiled model from:\n  {pkg}")
    compiled_model = torch._inductor.aoti_load_package(pkg)
    print("  Model loaded successfully.")

    
    
    
    x = torch.randn(BATCH_SIZE,
                    CHANNELS,
                    HEIGHT,
                    WIDTH,
                    device="cuda",
                    dtype=torch.float32)
    print(f"\n[2/3] Input  shape={list(x.shape)}  dtype={x.dtype}  "
          f"device={x.device}")

    
    
    
    print("\n[3/3] Running inference ...")
    with torch.no_grad():
        output = compiled_model(x)

    
    if isinstance(output, (list, tuple)):
        output = output[0]

    print(f"  Output shape={list(output.shape)}  dtype={output.dtype}")

    if torch.equal(x, output):
        print("\n  Verification  PASSED  (bitwise identical)")
    else:
        max_diff = (x - output).abs().max().item()
        print(f"\n  Verification  FAILED  (max diff = {max_diff})"
              f" — expected bitwise identical output")
        sys.exit(1)

    print("\n" + "=" * 64)
    print("SUCCESS!  AOTInductor Python inference verified.")
    print("=" * 64)


if __name__ == "__main__":
    main()

The custom class and custom function shared library loading and registration can be performed using dlopen in a pure C++ inference program without any pybind11 or libpython dependency.

run_inference.cpp




























#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <dlfcn.h> 

#include <torch/csrc/inductor/aoti_package/model_package_loader.h>
#include <torch/torch.h>




static constexpr int64_t kBatchSize = 1;
static constexpr int64_t kChannels = 3;
static constexpr int64_t kHeight = 224;
static constexpr int64_t kWidth = 224;

int main(int argc, char* argv[])
{
    if (argc < 3)
    {
        std::cerr << "Usage: " << argv[0] << " <path/to/model.pt2>"
                  << " <path/to/libidentity_conv_ops.so>" << std::endl;
        return EXIT_FAILURE;
    }

    const std::string model_path = argv[1];
    const std::string custom_op_lib = argv[2];

    std::cout << "================================================\n"
              << "AOTInductor  -  C++ Inference\n"
              << "================================================\n";

    try
    {
        
        
        
        
        
        
        
        std::cout << "\n[1/4] Loading custom op library:\n  " << custom_op_lib
                  << std::endl;

        void* lib_handle =
            dlopen(custom_op_lib.c_str(), RTLD_NOW | RTLD_GLOBAL);
        if (!lib_handle)
        {
            throw std::runtime_error(std::string("dlopen failed: ") +
                                     dlerror());
        }
        std::cout << "  Library loaded." << std::endl;

        
        
        
        
        
        
        std::cout << "\n[2/4] Loading model package:\n  " << model_path
                  << std::endl;

        torch::inductor::AOTIModelPackageLoader loader(model_path);
        auto runner = loader.get_runner();

        std::cout << "  Model loaded." << std::endl;

        
        
        
        auto options = torch::TensorOptions()
                           .dtype(torch::kFloat32)
                           .device(torch::kCUDA, 0);

        auto input =
            torch::randn({kBatchSize, kChannels, kHeight, kWidth}, options);

        std::cout << "\n[3/4] Input  shape=[" << kBatchSize << ", " << kChannels
                  << ", " << kHeight << ", " << kWidth << "]"
                  << "  dtype=float32  device=cuda" << std::endl;

        
        
        
        std::cout << "\n[4/4] Running inference ..." << std::endl;

        std::vector<at::Tensor> inputs = {input};
        auto outputs = runner->run(inputs);

        const auto& output = outputs[0];
        bool passed = input.equal(output);
        float max_diff = (input - output).abs().max().item<float>();

        std::cout << "  Output shape=[" << output.size(0) << ", "
                  << output.size(1) << ", " << output.size(2) << ", "
                  << output.size(3) << "]" << std::endl;
        std::cout << "  Max |input - output| = " << max_diff << std::endl;

        dlclose(lib_handle);

        if (passed)
        {
            std::cout << "\n  Verification  PASSED  (bitwise identical)"
                      << std::endl;
        }
        else
        {
            std::cerr << "\n  Verification  FAILED  (max diff = " << max_diff
                      << ")" << std::endl;
            return EXIT_FAILURE;
        }
    }
    catch (const std::exception& e)
    {
        std::cerr << "\nError: " << e.what() << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << "\n================================================\n"
              << "SUCCESS!  AOTInductor C++ inference verified.\n"
              << "================================================\n";
    return EXIT_SUCCESS;
}