This post continues from the previous installment, <一个Tensor的生命历程(Pytorch版)-上篇> (The Life of a Tensor in PyTorch, Part 1), and keeps tracing the internal machinery behind tensor creation in PyTorch.
Back to at::native::empty_cpu. To build a tensor, empty_cpu first needs the corresponding storage, i.e. the memory that actually backs the Tensor. StorageImpl is a subclass of intrusive_ptr_target, and the code constructs storage_impl through c10::make_intrusive:
// aten/src/ATen/native/TensorFactories.cpp
Tensor empty_cpu(IntArrayRef size, const TensorOptions& options) {
  ......
  int64_t nelements = prod_intlist(size);
  auto dtype = options.dtype();
  auto storage_impl = c10::make_intrusive<StorageImpl>(
      dtype,
      nelements,
      allocator->allocate(nelements * dtype.itemsize()),
      allocator,
      /*resizeable=*/true);
  ......
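To make the allocation concrete: for torch.rand(3, 4) with the default float dtype, nelements is 3 * 4 = 12 and the allocator hands back 12 * 4 = 48 bytes. A standalone sketch of the same size math (prod_intlist is reimplemented here purely for illustration):

#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative re-implementation of the size computation in empty_cpu.
int64_t prod_intlist(const std::vector<int64_t>& sizes) {
  int64_t n = 1;
  for (auto s : sizes) n *= s;
  return n;
}

int main() {
  std::vector<int64_t> size = {3, 4};
  int64_t nelements = prod_intlist(size);  // 3 * 4 = 12
  size_t itemsize = sizeof(float);         // default dtype float: 4 bytes
  std::printf("%lld elements, %lld bytes\n",
              (long long)nelements, (long long)(nelements * itemsize));
  return 0;  // prints: 12 elements, 48 bytes
}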
make_intrusive is a function template. TTarget is the StorageImpl class passed in, and Args&&... args in the parameter list corresponds to class... Args in the template header, a variadic parameter pack: the arguments of the call c10::make_intrusive<StorageImpl>(dtype, nelements, allocator->allocate(nelements * dtype.itemsize()), allocator, /*resizeable=*/true) travel through Args and arrive as args.
// c10/util/intrusive_ptr.h
template <
    class TTarget,
    class NullType = detail::intrusive_target_default_null_type<TTarget>,
    class... Args>
inline intrusive_ptr<TTarget, NullType> make_intrusive(Args&&... args) {
  return intrusive_ptr<TTarget, NullType>::make(std::forward<Args>(args)...);
}
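As a minimal sketch of the same pattern, with a made-up Widget type standing in for StorageImpl and unique_ptr in place of intrusive_ptr to keep the sketch self-contained (not PyTorch code): the factory accepts an arbitrary argument list and forwards it unchanged to the constructor.

#include <memory>
#include <string>
#include <utility>

// Hypothetical stand-in for StorageImpl.
struct Widget {
  Widget(std::string name, int n) : name_(std::move(name)), n_(n) {}
  std::string name_;
  int n_;
};

// Same shape as c10::make_intrusive: Args&&... binds to any argument list,
// and std::forward preserves each argument's value category.
template <class T, class... Args>
std::unique_ptr<T> make(Args&&... args) {
  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}

int main() {
  auto w = make<Widget>("storage", 12);  // Args deduced as {const char*, int}
  return w->n_ == 12 ? 0 : 1;
}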
The make function finally returns a TTarget object wrapped in an intrusive_ptr, where TTarget here is StorageImpl:
template <class... Args>
static intrusive_ptr make(Args&&... args) {
  auto result = intrusive_ptr(new TTarget(std::forward<Args>(args)...));
  // We can't use retain_(), because we also have to increase weakcount
  // and because we allow raising these values from 0, which retain_()
  // has an assertion against.
  ++result.target_->refcount_;
  ++result.target_->weakcount_;
  return result;
}
intrusive_ptr is a smart pointer that works in tandem with intrusive_ptr_target: only classes inheriting from intrusive_ptr_target can be held in an intrusive_ptr<T>. Unlike shared_ptr<T>, which allocates a separate control block, intrusive_ptr<T> keeps the reference count inside the pointed-to object itself, so there is never a second, conflicting count for the same object.
// c10/util/intrusive_ptr.h
template <
    class TTarget,
    class NullType = detail::intrusive_target_default_null_type<TTarget>>
class intrusive_ptr final {
 public:
  intrusive_ptr(const intrusive_ptr& rhs) : target_(rhs.target_) {
    retain_();
  }
  ~intrusive_ptr() noexcept {
    reset_();
  }
 private:
  TTarget* target_;
  void retain_() {
    size_t new_refcount = ++target_->refcount_;
  }
  void reset_() noexcept {
    if (target_ != NullType::singleton() && --target_->refcount_ == 0) {
      auto weak_count = --target_->weakcount_;
      const_cast<c10::guts::remove_const_t<TTarget>*>(target_)->release_resources();
      if (weak_count == 0) {
        delete target_;
      }
    }
  }
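Why is there no separate control block? Because the count lives in the object itself. A stripped-down sketch of the idea (illustrative only, not the real c10 implementation; RefTarget, Ptr, and MyStorage are made-up names):

#include <atomic>
#include <cassert>
#include <cstddef>

// The count lives inside the object, so a handle is a single raw pointer
// and no separately allocated control block is needed.
struct RefTarget {
  std::atomic<std::size_t> refcount_{0};
  virtual ~RefTarget() = default;
};

template <class T>
struct Ptr {
  T* p_;
  explicit Ptr(T* p) : p_(p) { ++p_->refcount_; }
  Ptr(const Ptr& rhs) : p_(rhs.p_) { ++p_->refcount_; }
  Ptr& operator=(const Ptr&) = delete;
  ~Ptr() {
    if (--p_->refcount_ == 0) delete p_;
  }
};

struct MyStorage : RefTarget {};

int main() {
  Ptr<MyStorage> a(new MyStorage());  // refcount 1
  {
    Ptr<MyStorage> b(a);              // refcount 2; copying is just a pointer copy
    assert(b.p_->refcount_ == 2);
  }                                   // b destroyed: refcount back to 1
  return 0;
}                                     // a destroyed: refcount 0, object deleted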
These are the two core counters inside intrusive_ptr_target; both support atomic operations:
class C10_API intrusive_ptr_target {
  mutable std::atomic<size_t> refcount_;
  mutable std::atomic<size_t> weakcount_;
  ...
And indeed, StorageImpl inherits from intrusive_ptr_target:
// c10/core/StorageImpl.h
struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
 public:
  StorageImpl(caffe2::TypeMeta data_type, int64_t numel, at::DataPtr data_ptr,
              at::Allocator* allocator, bool resizable);
 private:
  caffe2::TypeMeta data_type_; // data type
  DataPtr data_ptr_;           // points at the memory block holding the data
  int64_t numel_;              // total number of elements
  bool resizable_;
  bool received_cuda_;
  Allocator* allocator_;       // memory allocator
The actual data block is held as a DataPtr, which bundles the deleter together with the device the data lives on.
// c10/core/Allocator.h
class C10_API DataPtr {
 private:
  c10::detail::UniqueVoidPtr ptr_;
  Device device_;
 public:
  DataPtr() : ptr_(), device_(DeviceType::CPU) {}
  DataPtr(void* data, Device device) : ptr_(data), device_(device) {}
  DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device)
      : ptr_(data, ctx, ctx_deleter), device_(device) {}
Inside DataPtr sits a UniqueVoidPtr. It behaves much like unique_ptr, with a few differences; for one, it only deals in void pointers.
// c10/util/UniqueVoidPtr.h
class UniqueVoidPtr {
 private:
  // Lifetime tied to ctx_
  void* data_;
  std::unique_ptr<void, DeleterFnPtr> ctx_;
 public:
  UniqueVoidPtr() : data_(nullptr), ctx_(nullptr, &deleteNothing) {}
  explicit UniqueVoidPtr(void* data)
      : data_(data), ctx_(nullptr, &deleteNothing) {}
  UniqueVoidPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter)
      : data_(data), ctx_(ctx, ctx_deleter ? ctx_deleter : &deleteNothing) {}
  void* operator->() const {
    return data_;
  }
  void clear() {
    ctx_ = nullptr;
    data_ = nullptr;
  }
  ...
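The data_/ctx_ pair can be mimicked with a plain unique_ptr. A sketch of how a deleter travels with a void buffer the way DataPtr's does (illustrative; free_buffer is a made-up deleter):

#include <cstdio>
#include <cstdlib>
#include <memory>

using DeleterFnPtr = void (*)(void*);

void free_buffer(void* ctx) {
  std::puts("deleter called");
  std::free(ctx);
}

int main() {
  // Like UniqueVoidPtr: data_ is the payload view, ctx_ owns the allocation
  // and carries the deleter that knows how the memory was obtained.
  void* raw = std::malloc(48);
  std::unique_ptr<void, DeleterFnPtr> ctx(raw, &free_buffer);
  void* data = ctx.get();  // non-owning; lifetime tied to ctx
  (void)data;
  return 0;  // ctx destroyed -> free_buffer(raw)
}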
Back in empty_cpu: once storage_impl is initialized, construction of the TensorImpl begins; make_tensor receives the tensor type and the relevant constructor arguments:
// aten/src/ATen/native/TensorFactories.cpp
Tensor empty_cpu(IntArrayRef size, const TensorOptions& options) {
  ......
  auto tensor = detail::make_tensor<TensorImpl>(storage_impl, at::CPUTensorId());
make_tensor returns a Tensor object, which completes the construction:
// build/aten/src/ATen/core/TensorBody.h
template <typename T, typename... Args>
Tensor make_tensor(Args&&... args) {
  return Tensor(c10::make_intrusive<T>(std::forward<Args>(args)...));
}
This Tensor is a generic handle: it holds a pointer to a TensorImpl, while the pointer to the actually allocated memory still lives in the TensorImpl's storage_.
// build/aten/src/ATen/core/TensorBody.h
class CAFFE2_API Tensor {
 protected:
  c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
 public:
  int64_t dim() const {
    return impl_->dim();
  }
  int64_t storage_offset() const {
    return impl_->storage_offset();
  }
  Tensor abs() const;
  Tensor& abs_();
  Tensor add(const Tensor & other, Scalar alpha=1) const;
  ...
TensorImpl likewise inherits from intrusive_ptr_target, gaining the same intrusive smart-pointer behavior.
// c10/core/TensorImpl.h
struct C10_API TensorImpl : public c10::intrusive_ptr_target {
 public:
  virtual int64_t dim() const;
  virtual int64_t storage_offset() const;
 private:
  Storage storage_;
To recap, here are the classes actually involved in creating a Tensor:
class CAFFE2_API Tensor {
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
...
struct C10_API TensorImpl : public c10::intrusive_ptr_target {
Storage storage_;
...
struct C10_API Storage {
protected:
c10::intrusive_ptr<StorageImpl> storage_impl_;
...
struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
DataPtr data_ptr_;
...
class C10_API DataPtr {
c10::detail::UniqueVoidPtr ptr_;
...
class UniqueVoidPtr {
std::unique_ptr<void, DeleterFnPtr> ctx_;
...
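Reading the chain top-down: Tensor -> TensorImpl -> Storage -> StorageImpl -> DataPtr -> UniqueVoidPtr -> raw memory. From ATen C++ the end of that chain is observable through data_ptr(), which follows these exact members (a minimal sketch):

#include <ATen/ATen.h>
#include <cstdio>

int main() {
  at::Tensor t = at::rand({3, 4});
  // data_ptr() follows impl_ -> storage_ -> storage_impl_ -> data_ptr_
  // down to the buffer allocated in empty_cpu.
  void* raw = t.data_ptr();
  std::printf("raw buffer at %p\n", raw);
  return 0;
}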
Now back to rand: after at::empty builds the empty Tensor, uniform_ must initialize its contents:
// aten/src/ATen/native/TensorFactories.cpp
Tensor rand(IntArrayRef size, Generator* generator, const TensorOptions& options) {
  auto result = at::empty(size, options);
  return result.uniform_(0, 1, generator);
}
Tensor::uniform_ is a method on Tensor that fills the tensor's data in place:
// build/aten/src/ATen/core/TensorMethods.h
inline Tensor & Tensor::uniform_(double from, double to, Generator * generator) const {
  static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::uniform_", ""}).value();
  return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, Tensor &, double, double, Generator *>(
      op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), from, to, generator);
}
What Tensor::uniform_ actually calls, though, is a function looked up through the operator-registration mechanism; that function is generated at build time according to the entries in native_functions.yaml. As the entry shows, uniform_ maps to different backends (CPU and CUDA); here we follow legacy::cpu::_th_uniform_.
// aten/src/ATen/native/native_functions.yaml
- func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: legacy::cpu::_th_uniform_
    CUDA: uniform_cuda_
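From C++, the same dispatch can be exercised directly; per the entry above, on a CPU float tensor the call below should land in legacy::cpu::_th_uniform_ (a minimal sketch):

#include <ATen/ATen.h>

int main() {
  at::Tensor t = at::empty({3, 4});
  // Goes through the dispatcher as aten::uniform_; on CPU this reaches
  // legacy::cpu::_th_uniform_ as declared in native_functions.yaml.
  t.uniform_(0, 1);
  return 0;
}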
The generated code is shown below; this is what callUnboxedOnly ultimately resolves to and executes, given the template arguments and the runtime dispatch key:
// build/aten/src/ATen/CPUType.cpp
Tensor & uniform_(Tensor & self, double from, double to, Generator * generator) {
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::legacy::cpu::_th_uniform_(self, from, to, generator);
}
at::native::legacy::cpu::_th_uniform_ is itself generated code, produced from the following declaration:
// aten/src/ATen/Declarations.cwrap
name: _th_uniform_
types:
  - floating_point
backends:
  - CPU
cname: uniform
variants: function
return: self
arguments:
  - THTensor* self
  - double from
  - double to
  - THGenerator* generator
Stepping into at::native::legacy::cpu::_th_uniform_, the ScalarType::Float branch is naturally taken by default:
// build/aten/src/ATen/LegacyTHFunctionsCPU.cpp
Tensor & _th_uniform_(Tensor & self, double from, double to, Generator * generator) {
  ...
Note that PyTorch implements this polymorphism with C macros: THFloatTensor_uniform is produced when the macro-defined function below is expanded at compile time, once per scalar type (a detailed write-up is linked in the references at the end, and a reduced sketch of the macro trick follows the snippet below). The TH_TENSOR_APPLY used inside it behaves like a map, applying the operation to every element of the tensor; we won't dig into it here.
void THTensor_(uniform)(THTensor *self, double a, double b, at::Generator *_generator)
{
  auto gen = at::get_generator_or_default<at::CPUGenerator>(_generator, at::detail::getDefaultCPUGenerator());
  // See Note [Acquire lock when using random generators]
  std::lock_guard<std::mutex> lock(gen->mutex_);
  ...
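Reduced to its core, the macro trick looks like the following (a simplified sketch of the TH convention, not the exact PyTorch headers):

#include <cstdio>

// Token pasting through two levels so that Real gets macro-expanded first.
#define TH_CONCAT_4_EXPAND(a, b, c, d) a##b##c##d
#define TH_CONCAT_4(a, b, c, d) TH_CONCAT_4_EXPAND(a, b, c, d)

#define Real Float
#define THTensor_(NAME) TH_CONCAT_4(TH, Real, Tensor_, NAME)

// Expands to: void THFloatTensor_uniform(double a, double b)
void THTensor_(uniform)(double a, double b) {
  std::printf("uniform(%f, %f)\n", a, b);
}

int main() {
  THFloatTensor_uniform(0.0, 1.0);  // the expanded name is directly callable
  return 0;
}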
On to the next step: with the tensor initialized, what remains of torch.rand(3, 4)[0] is the trailing [0] index, which corresponds to:
_t1 = torch.rand(3, 4)
_t2 = _t1.__getitem__(0)
The remaining steps work on the same principles as before, so from here on we mostly show the code path without detailed commentary:
// torch/csrc/autograd/python_variable.cpp
PyTypeObject THPVariableType = {
  PyVarObject_HEAD_INIT(nullptr, 0)
  "torch._C._TensorBase",              /* tp_name */
  sizeof(THPVariable),                 /* tp_basicsize */
  (destructor)THPVariable_dealloc,     /* tp_dealloc */
  &THPVariable_as_mapping,             /* tp_as_mapping */
  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags */
  (traverseproc)THPVariable_traverse,  /* tp_traverse */
  (inquiry)THPVariable_clear,          /* tp_clear */
  THPVariable_properties,              /* tp_getset */
  THPVariable_pynew                    /* tp_new */
};

PyObject* THPVariable_getitem(PyObject* self, PyObject* index) {
  if (index == Py_None) {
    return wrap(self_.unsqueeze(0));
  } else if (index == Py_Ellipsis) {
    return wrap(at::alias(self_));
  } else if (THPUtils_checkLong(index)) {
    return wrap(applySelect(self_, 0, THPUtils_unpackLong(index)));
  } else if (PySlice_Check(index)) {
    return wrap(applySlice(self_, 0, index, true));
  }
  // wrap index in a tuple if it's not already one
  THPObjectPtr holder = wrapTuple(index);
  variable_list variableIndices;
  Variable sliced = applySlicing(self_, holder.get(), variableIndices);
  ...
static Variable applySelect(const Variable& self, int64_t dim, int64_t index,
                            int64_t real_dim=0) {
  int64_t size = self.size(dim);
  return self.select(dim, index);
}

// aten/src/ATen/core/TensorMethods.h
inline Tensor Tensor::select(int64_t dim, int64_t index) const {
  static auto table = globalATenDispatch().getOpTable("aten::select(Tensor(a) self, int dim, int index) -> Tensor(a)");
  return table->getOp<Tensor (const Tensor &, int64_t, int64_t)>(tensorTypeIdToBackend(type_id()), is_variable())(*this, dim, index);
}
// aten/src/ATen/native/native_functions.yaml
- func: select(Tensor(a) self, int dim, int index) -> Tensor(a)
  variants: function, method
  device_guard: False
  named_guard: False
// build/aten/src/ATen/TypeDefault.cpp
auto registerer = torch::RegisterOperators()
  .op(torch::RegisterOperators::options()
    .schema("aten::select.int(Tensor(a) self, int dim, int index) -> Tensor(a)")
    .impl_unboxedOnlyC10CatchAllKernel<Tensor (const Tensor &, int64_t, int64_t), &TypeDefault::select>()
    .aliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA))
  ...

Tensor TypeDefault::select(const Tensor & self, int64_t dim, int64_t index) {
  return at::native::select(self, dim, index);
}
// aten/src/ATen/native/TensorShape.cpp
Tensor select(const Tensor& self, int64_t dim, int64_t index) {
  auto sizes = self.sizes().vec();
  auto strides = self.strides().vec();
  auto storage_offset = self.storage_offset() + index * strides[dim];
  sizes.erase(sizes.begin() + dim);
  strides.erase(strides.begin() + dim);
  auto result = self.as_strided(sizes, strides, storage_offset);
  ...
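Concretely for _t1[0]: _t1 has sizes [3, 4] and strides [4, 1], so select computes storage_offset = 0 + 0 * 4 = 0, sizes [4], strides [1]; nothing is copied, the result is a view of the same storage. A small sketch (using index 1 so the offset becomes visible):

#include <ATen/ATen.h>
#include <cassert>

int main() {
  at::Tensor t = at::rand({3, 4});   // sizes [3, 4], strides [4, 1], offset 0
  at::Tensor row = t.select(0, 1);   // offset = 0 + 1 * 4 = 4, sizes [4], strides [1]
  assert(row.storage_offset() == 4);
  // The view shares storage: its data pointer is the base plus the offset.
  float* base = static_cast<float*>(t.data_ptr());
  assert(row.data_ptr() == static_cast<void*>(base + 4));
  return 0;
}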
// build/aten/src/ATen/core/TensorMethods.h
inline Tensor Tensor::as_strided(IntArrayRef size, IntArrayRef stride, c10::optional<int64_t> storage_offset) const {
  static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::as_strided", ""}).value();
  return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &, IntArrayRef, IntArrayRef, c10::optional<int64_t>>(
      op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), size, stride, storage_offset);
}
// aten/src/ATen/native/native_functions.yaml
- func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a)
  variants: function, method
  dispatch:
    CPU: as_strided_tensorimpl
    CUDA: as_strided_tensorimpl
// aten/src/ATen/native/TensorShape.cpp
Tensor as_strided_tensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional<int64_t> storage_offset_) {
  auto storage_offset = storage_offset_.value_or(self.storage_offset());
  auto result = detail::make_tensor<TensorImpl>(Storage(self.storage()), self.type_set());
  setStrided(result, size, stride, storage_offset);
  return result;
}
// c10/core/Storage.h
struct C10_API Storage {
 protected:
  c10::intrusive_ptr<StorageImpl> storage_impl_;
The next step is releasing _t1. Note that _t2 was built on _t1's Storage, so the StorageImpl's refcount is 2 at this point: deleting _t1 tears down its Python wrapper, Variable, and TensorImpl, but only drops the StorageImpl count to 1, and the raw buffer is not freed yet. The chain below lists the members whose destructors run when the last reference does go away, bottoming out in free_cpu:
_t1 = torch.rand(3, 4)
_t2 = _t1.__getitem__(0)
del _t1
# torch/tensor.py
class Tensor(torch._C._TensorBase):
// torch/csrc/autograd/python_variable.cpp
// (THPVariableType as shown above; its tp_dealloc slot is THPVariable_dealloc)
static void THPVariable_dealloc(THPVariable* self)
{
  PyObject_GC_UnTrack(self);
  THPVariable_clear(self);
  self->cdata.~Variable();
  Py_TYPE(self)->tp_free((PyObject*)self);
}
// torch/csrc/autograd/python_variable.h
struct THPVariable {
  PyObject_HEAD
  torch::autograd::Variable cdata;
  PyObject* backward_hooks = nullptr;
};
// torch/csrc/autograd/variable.h
struct TORCH_API Variable : public at::Tensor {
...
class CAFFE2_API Tensor {
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
...
struct C10_API TensorImpl : public c10::intrusive_ptr_target {
Storage storage_;
...
struct C10_API Storage {
protected:
c10::intrusive_ptr<StorageImpl> storage_impl_;
...
struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
DataPtr data_ptr_;
...
class C10_API DataPtr {
c10::detail::UniqueVoidPtr ptr_;
...
class UniqueVoidPtr {
std::unique_ptr<void, DeleterFnPtr> ctx_;
...
void free_cpu(void* data) {
The final step: the addition.
_t1 = torch.rand(3, 4)
_t2 = _t1.__getitem__(0)
del _t1
_t3 = torch.rand(3, 4)
r = _t2.__add__(_t3)
// tools/autograd/templates/python_variable_methods.cpp
PyMethodDef variable_methods[] = {
  {"__add__", (PyCFunction)THPVariable_add, METH_VARARGS | METH_KEYWORDS, NULL},
  {"__radd__", (PyCFunction)THPVariable_add, METH_VARARGS | METH_KEYWORDS, NULL},
  {"__iadd__", (PyCFunction)THPVariable_add_, METH_VARARGS | METH_KEYWORDS, NULL},
  ...
};

bool THPVariable_initModule(PyObject *module)
{
  static std::vector<PyMethodDef> methods;
  THPUtils_addPyMethodDefs(methods, torch::autograd::variable_methods);
  PyModule_AddObject(module, "_TensorBase", (PyObject *)&THPVariableType);
  ...
// aten/src/ATen/native/native_functions.yaml
- func: add(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  variants: function, method
  dispatch:
    CPU: add
    CUDA: add
    SparseCPU: add
    SparseCUDA: add
    MkldnnCPU: mkldnn_add
// torch/csrc/autograd/generated/python_variable_methods.cpp
static PyObject * THPVariable_add(PyObject* self_, PyObject* args, PyObject* kwargs)
{
  HANDLE_TH_ERRORS
  static PythonArgParser parser({
    "add(Scalar alpha, Tensor other)|deprecated",
    "add(Tensor other, *, Scalar alpha=1)",
  }, /*traceable=*/true);
  auto& self = reinterpret_cast<THPVariable*>(self_)->cdata;
  ParsedArgs<3> parsed_args;
  auto r = parser.parse(args, kwargs, parsed_args);
  if (r.idx == 0) {
    return wrap(dispatch_add(self, r.scalar(0), r.tensor(1)));
  } else if (r.idx == 1) {
    return wrap(dispatch_add(self, r.tensor(0), r.scalar(1)));
  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
// torch/csrc/autograd/generated/python_variable_methods_dispatch.h
inline Tensor dispatch_add(Tensor & self, const Tensor & other, Scalar alpha) {
  AutoNoGIL no_gil;
  return self.add(other, alpha);
}
// build/aten/src/ATen/core/TensorMethods.h
inline Tensor Tensor::add(const Tensor & other, Scalar alpha) const {
  static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::add", "Tensor"}).value();
  return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, const Tensor &, Scalar>(
      op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast<Tensor&>(*this), other, alpha);
}
// aten/src/ATen/native/BinaryOps.cpp
namespace at {
namespace native {

Tensor add(const Tensor& self, const Tensor& other, Scalar alpha) {
  Tensor result;
  auto iter = TensorIterator::binary_op(result, self, other);
  add_stub(iter.device_type(), iter, alpha);
  return iter.output();
}
// aten/src/ATen/native/TensorIterator.cpp
TensorIterator TensorIterator::binary_op(Tensor& out, const Tensor& a,
                                         const Tensor& b, bool check_mem_overlap) {
  auto iter = TensorIterator();
  iter.set_check_mem_overlap(check_mem_overlap);
  iter.add_output(out);
  iter.add_input(a);
  iter.add_input(b);
  iter.allow_cpu_scalars_ = true;
  iter.build();
  return iter;
}

void TensorIterator::build() {
  // set is_output and is_read_write flags on appropriate tensors
  mark_outputs();
  // Check that the outputs have no internal overlap
  // and do not share memory with inputs.
  check_mem_overlaps();
  // compute the broadcasted shape
  compute_shape();
  // compute each tensor's stride after broadcasting
  compute_strides();
  // re-order dimensions to improve coalescing
  reorder_dimensions();
  // compute the result dtype and device
  compute_types();
  // allocate the output tensor if it's not provided
  allocate_outputs();
  // coalesce adjacent dimensions when possible
  coalesce_dimensions();
  for (auto& op : operands_) {
    TORCH_INTERNAL_ASSERT(op.tensor.defined());
    op.data = op.tensor.data_ptr();
  }
}

void TensorIterator::allocate_outputs() {
  for (int i = 0; i < num_outputs_; i++) {
    auto& op = operands_[i];
    if (!op.tensor.defined()) {
      TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
      int element_size = elementSize(op.dtype);
      op.stride_bytes = compatible_stride(element_size);
      auto tensor_shape = invert_perm(shape_);
      auto tensor_stride = invert_perm(op.stride_bytes);
      for (int dim = 0; dim < ndim(); dim++) {
        tensor_stride[dim] /= element_size;
      }
      op.tensor = at::empty_strided(tensor_shape, tensor_stride, op.options());
    }
  }
}
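compute_shape is where broadcasting happens, and allocate_outputs creates the result because add passed in an undefined Tensor. In our running example _t2 has shape [4] and _t3 has shape [3, 4], so the broadcasted output has shape [3, 4] (a minimal sketch):

#include <ATen/ATen.h>
#include <cassert>

int main() {
  at::Tensor t2 = at::rand({4});     // _t2 = _t1[0]
  at::Tensor t3 = at::rand({3, 4});
  // TensorIterator broadcasts [4] against [3, 4]; the output tensor is
  // allocated inside build() via at::empty_strided.
  at::Tensor r = t2 + t3;
  assert(r.sizes().equals({3, 4}));
  return 0;
}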
// aten/src/ATen/native/BinaryOps.h
using binary_fn_alpha = void(*)(TensorIterator&, Scalar alpha);
DECLARE_DISPATCH(binary_fn_alpha, add_stub);
// aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
REGISTER_DISPATCH(add_stub, &add_kernel);

void add_kernel(TensorIterator& iter, Scalar alpha_scalar) {
  if (iter.dtype() == ScalarType::Bool) {
    using scalar_t = bool;
    auto alpha = alpha_scalar.to<scalar_t>();
    cpu_kernel(iter,
      [=](scalar_t a, scalar_t b) -> scalar_t { return a + alpha * b; });
  } else {
    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, iter.dtype(), "add_cpu/sub_cpu", [&]() {
      auto alpha = alpha_scalar.to<scalar_t>();
      auto alpha_vec = Vec256<scalar_t>(alpha);
      cpu_kernel_vec(iter,
        [=](scalar_t a, scalar_t b) -> scalar_t { return a + alpha * b; },
        [=](Vec256<scalar_t> a, Vec256<scalar_t> b) {
          return vec256::fmadd(b, alpha_vec, a);
        });
    });
  }
}
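Stripped of dtype dispatch and vectorization, the kernel computes out = a + alpha * b elementwise; vec256::fmadd(b, alpha_vec, a) is exactly that fused multiply-add on a SIMD lane. Conceptually (a sketch, not the real cpu_kernel_vec machinery):

#include <cstddef>

// Conceptual version of add_kernel for contiguous float tensors: the real
// cpu_kernel_vec applies the scalar lambda to leftover elements and the
// Vec256 lambda to full SIMD lanes; both compute a + alpha * b.
void add_contiguous(const float* a, const float* b, float* out,
                    std::size_t n, float alpha) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = a[i] + alpha * b[i];
  }
}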
The deletions that follow (del _t2, and eventually _t3 and r going out of scope) work exactly like the deletion described above, so we won't repeat them.
_t1 = torch.rand(3, 4)
_t2 = _t1.__getitem__(0)
del _t1
_t3 = torch.rand(3, 4)
r = _t2.__add__(_t3)
del _t2
That completes the walk through every operation and its path through the source. PyTorch's tensor machinery involves a great many techniques, and the code rewards a careful read.
References
https://www.52coding.com.cn/2019/05/05/PyTorch5/
https://github.com/Microsoft/vscode-cpptools/issues/891
https://github.com/pytorch/pytorch/wiki/Life-of-a-Tensor