Skip to content

Commit 978e99f

Browse files
committed
Another attempt at future-proofing the workaround for CUDA bool
1 parent b24abbb commit 978e99f

File tree

1 file changed

+20
-6
lines changed

1 file changed

+20
-6
lines changed

src/sgl/device/cursor_access_wrappers.h

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,17 @@ class SGL_API CursorWriteWrappers {
4848
// CPU is assumed tightly packed, i.e., stride and size are the same value.
4949
size_t cpu_element_size = cursor_utils::get_scalar_type_cpu_size(cpu_scalar_type);
5050
size_t element_stride = _get_slang_type_layout()->getElementStride(SLANG_PARAMETER_CATEGORY_UNIFORM);
51-
5251
// CUDA misreports the actual element stride, see https://github.com/shader-slang/slang/issues/7441
53-
// We will pretend that element stride is equal to the total stride divided by number of elements.
54-
// size_t element_stride = _get_slang_type_layout()->getStride() / element_count;
52+
// In the old implementation, bool2-4 are implemented as int2-4, and even though bool1 is implemented as int1,
53+
// the actual emitted code is bool. So for bool2-4, the element stride is 4, for bool1 it remains at 1.
54+
// The check that the total size == sizeof(int) * element_count disables this on newer Slang implementations,
55+
// where bool1-4 is implemented as an actual struct of 1-4 bools.
56+
if (cpu_scalar_type == TypeReflection::ScalarType::bool_ && _get_device_type_internal() == DeviceType::cuda
57+
&& _get_slang_type_layout()->getKind() == slang::TypeReflection::Kind::Vector
58+
&& _get_slang_type_layout()->getSize() == sizeof(int) * element_count) {
59+
if (element_count > 1)
60+
element_stride = 4;
61+
}
5562
size_t element_size = _get_slang_type_layout()->getElementTypeLayout()->getSize();
5663

5764
SGL_CHECK(
@@ -237,10 +244,17 @@ class SGL_API CursorReadWrappers {
237244
// CPU is assumed tightly packed, i.e., stride and size are the same value.
238245
size_t cpu_element_size = cursor_utils::get_scalar_type_cpu_size(cpu_scalar_type);
239246
size_t element_stride = _get_slang_type_layout()->getElementStride(SLANG_PARAMETER_CATEGORY_UNIFORM);
240-
241247
// CUDA misreports the actual element stride, see https://github.com/shader-slang/slang/issues/7441
242-
// We will pretend that element stride is equal to the total stride divided by number of elements.
243-
// size_t element_stride = _get_slang_type_layout()->getStride() / element_count;
248+
// In the old implementation, bool2-4 are implemented as int2-4, and even though bool1 is implemented as int1,
249+
// the actual emitted code is bool. So for bool2-4, the element stride is 4, for bool1 it remains at 1.
250+
// The check that the total size == sizeof(int) * element_count disables this on newer Slang implementations,
251+
// where bool1-4 is implemented as an actual struct of 1-4 bools.
252+
if (cpu_scalar_type == TypeReflection::ScalarType::bool_ && _get_device_type_internal() == DeviceType::cuda
253+
&& _get_slang_type_layout()->getKind() == slang::TypeReflection::Kind::Vector
254+
&& _get_slang_type_layout()->getSize() == sizeof(int) * element_count) {
255+
if (element_count > 1)
256+
element_stride = 4;
257+
}
244258
size_t element_size = _get_slang_type_layout()->getElementTypeLayout()->getSize();
245259

246260
SGL_CHECK(

0 commit comments

Comments
 (0)