@@ -48,10 +48,17 @@ class SGL_API CursorWriteWrappers {
48
48
// CPU is assumed tightly packed, i.e., stride and size are the same value.
49
49
size_t cpu_element_size = cursor_utils::get_scalar_type_cpu_size (cpu_scalar_type);
50
50
size_t element_stride = _get_slang_type_layout ()->getElementStride (SLANG_PARAMETER_CATEGORY_UNIFORM);
51
-
52
51
// CUDA misreports the actual element stride, see https://github.com/shader-slang/slang/issues/7441
53
- // We will pretend that element stride is equal to the total stride divided by number of elements.
54
- // size_t element_stride = _get_slang_type_layout()->getStride() / element_count;
52
+ // In the old implementation, bool2-4 are implemented as int2-4, and even though bool1 is implemented as int1,
53
+ // the actual emitted code is bool. So for bool2-4, the element stride is 4, for bool1 it remains at 1.
54
+ // The check for the total size == sizeof(int) * element_count is to disable this on newer Slang implementations,
55
+ // where bool1-4 is implemented as an actual struct of 1-4 bools.
56
+ if (cpu_scalar_type == TypeReflection::ScalarType::bool_ && _get_device_type_internal () == DeviceType::cuda
57
+ && _get_slang_type_layout ()->getKind () == slang::TypeReflection::Kind::Vector
58
+ && _get_slang_type_layout ()->getSize () == sizeof (int ) * element_count) {
59
+ if (element_count > 1 )
60
+ element_stride = 4 ;
61
+ }
55
62
size_t element_size = _get_slang_type_layout ()->getElementTypeLayout ()->getSize ();
56
63
57
64
SGL_CHECK (
@@ -237,10 +244,17 @@ class SGL_API CursorReadWrappers {
237
244
// CPU is assumed tightly packed, i.e., stride and size are the same value.
238
245
size_t cpu_element_size = cursor_utils::get_scalar_type_cpu_size (cpu_scalar_type);
239
246
size_t element_stride = _get_slang_type_layout ()->getElementStride (SLANG_PARAMETER_CATEGORY_UNIFORM);
240
-
241
247
// CUDA misreports the actual element stride, see https://github.com/shader-slang/slang/issues/7441
242
- // We will pretend that element stride is equal to the total stride divided by number of elements.
243
- // size_t element_stride = _get_slang_type_layout()->getStride() / element_count;
248
+ // In the old implementation, bool2-4 are implemented as int2-4, and even though bool1 is implemented as int1,
249
+ // the actual emitted code is bool. So for bool2-4, the element stride is 4, for bool1 it remains at 1.
250
+ // The check for the total size == sizeof(int) * element_count is to disable this on newer Slang implementations,
251
+ // where bool1-4 is implemented as an actual struct of 1-4 bools.
252
+ if (cpu_scalar_type == TypeReflection::ScalarType::bool_ && _get_device_type_internal () == DeviceType::cuda
253
+ && _get_slang_type_layout ()->getKind () == slang::TypeReflection::Kind::Vector
254
+ && _get_slang_type_layout ()->getSize () == sizeof (int ) * element_count) {
255
+ if (element_count > 1 )
256
+ element_stride = 4 ;
257
+ }
244
258
size_t element_size = _get_slang_type_layout ()->getElementTypeLayout ()->getSize ();
245
259
246
260
SGL_CHECK (
0 commit comments