NVIDIA · JanuszL · Aug 9, 2024 · Aug 6, 2024 · Aug 7, 2024
diff --git a/dali/benchmark/dali_bench.h b/dali/benchmark/dali_bench.h
@@ -60,7 +60,7 @@ class DALIBenchmark : public benchmark::Fixture {
     tl->Resize(shape, DALI_UINT8);
 
     for (int i = 0; i < n; ++i) {
-      std::memcpy(tl->template mutable_tensor<uint8>(i),
+      std::memcpy(tl->template mutable_tensor<uint8_t>(i),
           jpegs_.data_[i % nImgs], jpegs_.sizes_[i % nImgs]);
       tl->SetSourceInfo(i, jpeg_names_[i % nImgs] + "_" + std::to_string(i));
     }

diff --git a/dali/fuzzing/dali_harness.h b/dali/fuzzing/dali_harness.h
@@ -52,7 +52,7 @@ class FileListHarness {
 
     for (int i = 0; i < batch_size_; ++i) {
       std::memcpy(
-        input_data_.template mutable_tensor<uint8>(i),
+        input_data_.template mutable_tensor<uint8_t>(i),
         images_.data_[i],
         images_.sizes_[i]);
       input_data_.SetSourceInfo(i, image_names_[i] + "_" + std::to_string(i));

diff --git a/dali/kernels/transpose/transpose_gpu_impl_test.cu b/dali/kernels/transpose/transpose_gpu_impl_test.cu
@@ -192,10 +192,10 @@ TEST(TransposeTiled, BuildDescVectorized) {
 TEST(TransposeTiled, BuildDescAndForceMisalignment) {
   TensorShape<> shape = { 57, 37, 52, 4 };  // a bunch of primes, just to make it harder
   int size = volume(shape);
-  vector<uint8> in_cpu(size + 4), out_cpu(size + 4);
-  vector<uint8> ref(size + 4);
+  vector<uint8_t> in_cpu(size + 4), out_cpu(size + 4);
+  vector<uint8_t> ref(size + 4);
 
-  DeviceBuffer<uint8> in_gpu, out_gpu;
+  DeviceBuffer<uint8_t> in_gpu, out_gpu;
   in_gpu.resize(size + 4);
   out_gpu.resize(size + 4);
 
@@ -208,7 +208,7 @@ TEST(TransposeTiled, BuildDescAndForceMisalignment) {
     SmallVector<int, 6> perm = { 1, 2, 0, 3 };
 
     int grid_size = 1024;
-    TiledTransposeDesc<uint8> desc;
+    TiledTransposeDesc<uint8_t> desc;
     memset(&desc, 0xCC, sizeof(desc));
     InitTiledTranspose(desc, shape, make_span(perm), out_gpu.data() + offset,
                        in_gpu.data() + offset, grid_size);
@@ -258,10 +258,10 @@ TEST(TransposeTiled, BuildDescVectorized16BitOpt) {
 TEST(TransposeTiled, HighDimensionTest) {
   TensorShape<> shape = {3, 3, 5, 7, 23, 3, 37, 4 };  // a bunch of primes, just to make it harder
   int size = volume(shape);
-  vector<uint8> in_cpu(size), out_cpu(size);
-  vector<uint8> ref(size);
+  vector<uint8_t> in_cpu(size), out_cpu(size);
+  vector<uint8_t> ref(size);
 
-  DeviceBuffer<uint8> in_gpu, out_gpu;
+  DeviceBuffer<uint8_t> in_gpu, out_gpu;
   in_gpu.resize(size);
   out_gpu.resize(size);
 
@@ -276,7 +276,7 @@ TEST(TransposeTiled, HighDimensionTest) {
     SmallVector<int, 8> perm = { 1, 0, 4, 2, 6, 3, 5, 7 };
 
     int grid_size = 1024;
-    TiledTransposeDesc<uint8> desc;
+    TiledTransposeDesc<uint8_t> desc;
     memset(&desc, 0xCC, sizeof(desc));
     InitTiledTranspose(desc, shape, make_span(perm), out_gpu.data(), in_gpu.data(), grid_size);
 

diff --git a/dali/kernels/transpose/transpose_gpu_test.cc b/dali/kernels/transpose/transpose_gpu_test.cc
@@ -203,7 +203,7 @@ TEST(TransposeGPU, PerfDeinterleave) {
 
   std::cerr << "Permuting 1-byte data; permutation 2 0 1\ninput shape = \n" << shape << "\n";
 
-  RunPerfTest<uint8>(rng, shape, make_span(perm));
+  RunPerfTest<uint8_t>(rng, shape, make_span(perm));
 }
 
 
@@ -226,7 +226,7 @@ TEST(TransposeGPU, PerfInterleave) {
 
   std::cerr << "Permuting 1-byte data; permutation 1 2 0\ninput shape = \n" << shape << "\n";
 
-  RunPerfTest<uint8>(rng, shape, make_span(perm));
+  RunPerfTest<uint8_t>(rng, shape, make_span(perm));
 }
 
 

diff --git a/dali/operators/debug/dump_image.cc b/dali/operators/debug/dump_image.cc
@@ -35,7 +35,7 @@ void DumpImage<CPUBackend>::RunImpl(SampleWorkspace &ws) {
                make_string("Only 3-channel and gray images are supported, got input with `", c,
                            "` channels."));
 
-  WriteHWCImage(input.template data<uint8>(),
+  WriteHWCImage(input.template data<uint8_t>(),
       h, w, c, std::to_string(ws.data_idx()) + "-" + suffix_ + "-" + std::to_string(0));
 
   // Forward the input

diff --git a/dali/operators/decoder/host/host_decoder.cc b/dali/operators/decoder/host/host_decoder.cc
@@ -28,13 +28,13 @@ void HostDecoder::RunImpl(SampleWorkspace &ws) {
 
   // Verify input
   DALI_ENFORCE(input.ndim() == 1, "Input must be 1D encoded jpeg string.");
-  DALI_ENFORCE(IsType<uint8>(input.type()), "Input must be stored as uint8 data.");
+  DALI_ENFORCE(IsType<uint8_t>(input.type()), "Input must be stored as uint8 data.");
 
   std::unique_ptr<Image> img;
   try {
     DomainTimeRange tr(make_string("Decode #", ws.data_idx(), " fast_idct=", use_fast_idct_),
                        DomainTimeRange::kBlue1);
-    img = ImageFactory::CreateImage(input.data<uint8>(), input.size(), output_type_);
+    img = ImageFactory::CreateImage(input.data<uint8_t>(), input.size(), output_type_);
     img->SetCropWindowGenerator(GetCropWindowGenerator(ws.data_idx()));
     img->SetUseFastIdct(use_fast_idct_);
     img->Decode();

diff --git a/dali/operators/decoder/image/image_factory.cc b/dali/operators/decoder/image/image_factory.cc
@@ -30,7 +30,7 @@ namespace dali {
 
 namespace {
 
-bool CheckIsJPEG(const uint8 *jpeg, int) {
+bool CheckIsJPEG(const uint8_t *jpeg, int) {
   DALI_ENFORCE(jpeg);
   return (jpeg[0] == 255) && (jpeg[1] == 216);
 }

diff --git a/dali/operators/decoder/image/jpeg.cc b/dali/operators/decoder/image/jpeg.cc
@@ -28,7 +28,7 @@ JpegImage::JpegImage(const uint8_t *encoded_buffer,
 }
 
 #ifndef DALI_USE_JPEG_TURBO
-bool get_jpeg_size(const uint8 *data, size_t data_size, int *height, int *width, int *nchannels) {
+bool get_jpeg_size(const uint8_t *data, size_t data_size, int *height, int *width, int *nchannels) {
   unsigned int i = 0;
   if (!(data[i] == 0xFF && data[i + 1] == 0xD8))
     return false;  // Not a valid SOI header
@@ -59,7 +59,7 @@ bool get_jpeg_size(const uint8 *data, size_t data_size, int *height, int *width,
 #endif
 
 std::pair<std::shared_ptr<uint8_t>, Image::Shape>
-JpegImage::DecodeImpl(DALIImageType type, const uint8 *jpeg, size_t length) const {
+JpegImage::DecodeImpl(DALIImageType type, const uint8_t *jpeg, size_t length) const {
   const auto shape = PeekShapeImpl(jpeg, length);
   const auto h = shape[0];
   const auto w = shape[1];

diff --git a/dali/operators/decoder/image/tiff_libtiff.cc b/dali/operators/decoder/image/tiff_libtiff.cc
@@ -278,7 +278,7 @@ Image::Shape TiffImage_Libtiff::PeekShapeImpl(const uint8_t *encoded_buffer,
 
 std::pair<std::shared_ptr<uint8_t>, Image::Shape>
 TiffImage_Libtiff::DecodeImpl(DALIImageType image_type,
-                              const uint8 *encoded_buffer,
+                              const uint8_t *encoded_buffer,
                               size_t length) const {
   // This decoder only handles bitdepth=8, non-tiled and top-left orientation
   // Other cases go to OpenCV's based decoder

diff --git a/dali/operators/decoder/jpeg/jpeg_handle.cc b/dali/operators/decoder/jpeg/jpeg_handle.cc
@@ -186,12 +186,12 @@ boolean MemFillInputBuffer(j_decompress_ptr cinfo) {
 void MemTermSource(j_decompress_ptr cinfo) {}
 
 // -----------------------------------------------------------------------------
-void MemSkipInputData(j_decompress_ptr cinfo, int64 jump) {
+void MemSkipInputData(j_decompress_ptr cinfo, int64_t jump) {
   MemSourceMgr *src = reinterpret_cast<MemSourceMgr *>(cinfo->src);
   if (jump < 0) {
     return;
   }
-  if (jump > static_cast<int64>(src->pub.bytes_in_buffer)) {
+  if (jump > static_cast<int64_t>(src->pub.bytes_in_buffer)) {
     src->pub.bytes_in_buffer = 0;
     (void)MemFillInputBuffer(cinfo);  // warn with a fake EOI or error
   } else {
@@ -202,7 +202,7 @@ void MemSkipInputData(j_decompress_ptr cinfo, int64 jump) {
 
 // -----------------------------------------------------------------------------
 void SetSrc(j_decompress_ptr cinfo, const void *data,
-            uint64 datasize, bool try_recover_truncated_jpeg) {
+            uint64_t datasize, bool try_recover_truncated_jpeg) {
   MemSourceMgr *src;
 
   cinfo->src = reinterpret_cast<struct jpeg_source_mgr *>(

diff --git a/dali/operators/decoder/jpeg/jpeg_handle.h b/dali/operators/decoder/jpeg/jpeg_handle.h
@@ -55,12 +55,12 @@ typedef struct {
 typedef struct {
   struct jpeg_source_mgr pub;
   const unsigned char *data;
-  uint64 datasize;
+  uint64_t datasize;
   bool try_recover_truncated_jpeg;
 } MemSourceMgr;
 
 void SetSrc(j_decompress_ptr cinfo, const void *data,
-            uint64 datasize, bool try_recover_truncated_jpeg);
+            uint64_t datasize, bool try_recover_truncated_jpeg);
 
 // JPEG destination: we will store all the data in a buffer "buffer" of total
 // size "bufsize", if the buffer overflows, we will be in trouble.

diff --git a/dali/operators/decoder/jpeg/jpeg_mem.cc b/dali/operators/decoder/jpeg/jpeg_mem.cc
@@ -73,7 +73,7 @@ bool IsCropWindowValid(const UncompressFlags& flags, int input_image_width,
          flags.crop_x + flags.crop_width <= input_image_width;
 }
 
-std::unique_ptr<uint8[]> UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
+std::unique_ptr<uint8_t[]> UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
   // unpack the argball
   const int datasize = argball->datasize_;
   const auto& flags = argball->flags_;
@@ -97,7 +97,7 @@ std::unique_ptr<uint8[]> UncompressLow(const void* srcdata, FewerArgsForCompiler
 
   // Declare buffers here so that we can free on error paths
   std::unique_ptr<JSAMPLE[]> temp;
-  std::unique_ptr<uint8[]> dstdata;
+  std::unique_ptr<uint8_t[]> dstdata;
   JSAMPLE *tempdata = nullptr;
 
   // Initialize libjpeg structures to have a memory source
@@ -170,9 +170,9 @@ std::unique_ptr<uint8[]> UncompressLow(const void* srcdata, FewerArgsForCompiler
   // OOM'ing doing the decompress
   jpeg_calc_output_dimensions(&cinfo);
 
-  int64 total_size = static_cast<int64>(cinfo.output_height) *
-                     static_cast<int64>(cinfo.output_width) *
-                     static_cast<int64>(cinfo.num_components);
+  int64_t total_size = static_cast<int64_t>(cinfo.output_height) *
+                     static_cast<int64_t>(cinfo.output_width) *
+                     static_cast<int64_t>(cinfo.num_components);
   // Some of the internal routines do not gracefully handle ridiculously
   // large images, so fail fast.
   if (cinfo.output_width <= 0 || cinfo.output_height <= 0) {
@@ -391,7 +391,7 @@ std::unique_ptr<uint8[]> UncompressLow(const void* srcdata, FewerArgsForCompiler
   if (components == 4) {
     // Start on the last line.
     JSAMPLE* scanlineptr = static_cast<JSAMPLE*>(
-        dstdata.get() + static_cast<int64>(target_output_height - 1) * stride);
+        dstdata.get() + static_cast<int64_t>(target_output_height - 1) * stride);
     const JSAMPLE kOpaque = -1;  // All ones appropriate for JSAMPLE.
     const int right_rgb = (target_output_width - 1) * 3;
     const int right_rgba = (target_output_width - 1) * 4;
@@ -471,7 +471,7 @@ std::unique_ptr<uint8[]> UncompressLow(const void* srcdata, FewerArgsForCompiler
     }
 
     const auto full_image = std::move(dstdata);
-    dstdata = std::unique_ptr<uint8[]>(
-        new JSAMPLE[target_output_width, target_output_height, components]);
+    dstdata = std::unique_ptr<uint8_t[]>(
+        new JSAMPLE[static_cast<size_t>(target_output_width) * target_output_height * components]);
     if (dstdata == nullptr) {
       return nullptr;
@@ -490,8 +490,8 @@ std::unique_ptr<uint8[]> UncompressLow(const void* srcdata, FewerArgsForCompiler
       argball->height_read_ = target_output_height;
     }
     const int crop_offset = flags.crop_x * components * sizeof(JSAMPLE);
-    const uint8* full_image_ptr = full_image.get() + flags.crop_y * full_image_stride;
-    uint8* crop_image_ptr = dstdata.get();
+    const uint8_t* full_image_ptr = full_image.get() + flags.crop_y * full_image_stride;
+    uint8_t* crop_image_ptr = dstdata.get();
     for (int i = 0; i < argball->height_read_; i++) {
       memcpy(crop_image_ptr, full_image_ptr + crop_offset, min_stride);
       crop_image_ptr += stride;
@@ -513,7 +513,7 @@ std::unique_ptr<uint8[]> UncompressLow(const void* srcdata, FewerArgsForCompiler
 //  associated libraries aren't good enough to guarantee that 7
 //  parameters won't get clobbered by the longjmp.  So we help
 //  it out a little.
-std::unique_ptr<uint8[]> Uncompress(const void* srcdata, int datasize,
+std::unique_ptr<uint8_t[]> Uncompress(const void* srcdata, int datasize,
                                     const UncompressFlags& flags) {
   FewerArgsForCompiler argball(datasize, flags);
   auto dstdata = UncompressLow(srcdata, &argball);
@@ -532,7 +532,7 @@ std::unique_ptr<uint8[]> Uncompress(const void* srcdata, int datasize,
   // set the unread pixels to black
   if (argball.height_read_ != argball.height_) {
     const int first_bad_line = argball.height_read_;
-    uint8* start = dstdata.get() + first_bad_line * argball.stride_;
+    uint8_t* start = dstdata.get() + first_bad_line * argball.stride_;
     const int nbytes = (argball.height_ - first_bad_line) * argball.stride_;
     memset(static_cast<void*>(start), 0, nbytes);
   }

diff --git a/dali/operators/decoder/jpeg/jpeg_mem.h b/dali/operators/decoder/jpeg/jpeg_mem.h
@@ -82,7 +82,7 @@ struct UncompressFlags {
 // datasize.
 // The function returns a shared pointer to the uncompressed data or a null pointer if
 // there was an error.
-std::unique_ptr<uint8[]> Uncompress(const void* srcdata, int datasize,
+std::unique_ptr<uint8_t[]> Uncompress(const void* srcdata, int datasize,
                                     const UncompressFlags& flags);
 
 // Read jpeg header and get image information.  Returns true on success.

diff --git a/dali/operators/decoder/peek_shape/peek_image_shape.h b/dali/operators/decoder/peek_shape/peek_image_shape.h
@@ -84,7 +84,7 @@ class PeekImageShape : public StatelessOperator<CPUBackend> {
       thread_pool.AddWork([sample_id, &input, &output, this] (int tid) {
         const auto& image = input[sample_id];
         auto img =
-            ImageFactory::CreateImage(image.data<uint8>(), image.shape().num_elements(), {});
+            ImageFactory::CreateImage(image.data<uint8_t>(), image.shape().num_elements(), {});
         auto shape = img->PeekShape();
         TYPE_SWITCH(output_type_, type2id, type,
                 (int32_t, uint32_t, int64_t, uint64_t, float, double),

diff --git a/dali/operators/generic/flip.cc b/dali/operators/generic/flip.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include <vector>
+#include "dali/core/static_switch.h"
+#include "dali/core/error_handling.h"
 #include "dali/operators/generic/flip.h"
 #include "dali/kernels/imgproc/flip_cpu.h"
 #include "dali/kernels/kernel_params.h"
@@ -42,7 +44,7 @@ Flip<CPUBackend>::Flip(const OpSpec &spec)
 void RunFlip(Tensor<CPUBackend> &output, const Tensor<CPUBackend> &input,
              const TensorLayout &layout,
              bool horizontal, bool vertical, bool depthwise) {
-  DALI_TYPE_SWITCH(input.type(), DType,
+  TYPE_SWITCH(input.type(), type2id, DType, (DALI_NUMERIC_TYPES), (
       auto output_ptr = output.mutable_data<DType>();
       auto input_ptr = input.data<DType>();
       auto kernel = kernels::FlipCPU<DType>();
@@ -54,7 +56,7 @@ void RunFlip(Tensor<CPUBackend> &output, const Tensor<CPUBackend> &input,
       auto out_shape = reqs.output_shapes[0][0].to_static<flip_ndim>();
       auto out_view = kernels::OutTensorCPU<DType, flip_ndim>(output_ptr, out_shape);
       kernel.Run(ctx, out_view, in_view, depthwise, vertical, horizontal);
-  )
+  ), (DALI_FAIL(make_string("The element type ", input.type(), " is not supported."))));  // NOLINT
 }
 
 template <>

diff --git a/dali/operators/generic/flip.cu b/dali/operators/generic/flip.cu
@@ -14,6 +14,8 @@
 
 #include <cuda_runtime_api.h>
 #include <vector>
+#include "dali/core/static_switch.h"
+#include "dali/core/error_handling.h"
 #include "dali/kernels/imgproc/flip_gpu.cuh"
 #include "dali/operators/generic/flip.h"
 #include "dali/operators/generic/flip_util.h"
@@ -25,10 +27,9 @@ template <>
 Flip<GPUBackend>::Flip(const OpSpec &spec) : StatelessOperator<GPUBackend>(spec) {}
 
 void RunKernel(TensorList<GPUBackend> &output, const TensorList<GPUBackend> &input,
-               const std::vector<int32> &depthwise, const std::vector<int32> &horizontal,
-               const std::vector<int32> &vertical, cudaStream_t stream) {
-  DALI_TYPE_SWITCH(
-      input.type(), DType,
+               const std::vector<int32_t> &depthwise, const std::vector<int32_t> &horizontal,
+               const std::vector<int32_t> &vertical, cudaStream_t stream) {
+  TYPE_SWITCH(input.type(), type2id, DType, (DALI_NUMERIC_TYPES), (
       auto in_shape = TransformShapes(input.shape(), input.GetLayout());
       auto in_view = reshape<flip_ndim>(view<const DType>(input), in_shape);
       kernels::KernelContext ctx;
@@ -38,7 +39,7 @@ void RunKernel(TensorList<GPUBackend> &output, const TensorList<GPUBackend> &inp
       auto out_shape = reqs.output_shapes[0].to_static<flip_ndim>();
       auto out_view = reshape<flip_ndim>(view<DType>(output), out_shape);
       kernel.Run(ctx, out_view, in_view, depthwise, vertical, horizontal);
-  )
+  ), (DALI_FAIL(make_string("The element type ", input.type(), " is not supported."))));  // NOLINT
 }
 
 template <>

diff --git a/dali/operators/image/paste/paste.cu b/dali/operators/image/paste/paste.cu
@@ -29,9 +29,9 @@ __launch_bounds__(PASTE_BLOCKSIZE, 1)
 void BatchedPaste(
     const int N,
     const int C,
-    const uint8* const __restrict__ fill_value,
-    const uint8* const * const __restrict__ in_batch,
-    uint8* const* const __restrict__ out_batch,
+    const uint8_t* const __restrict__ fill_value,
+    const uint8_t* const * const __restrict__ in_batch,
+    uint8_t* const* const __restrict__ out_batch,
     const int* const __restrict__ in_out_dims_paste_yx) {
   const int n = blockIdx.x;
 
@@ -40,7 +40,7 @@ void BatchedPaste(
   constexpr int nWaves = blockSize / nThreadsPerWave;
   constexpr int MAX_C = 1024;
 
-  __shared__ uint8 rgb[MAX_C];
+  __shared__ uint8_t rgb[MAX_C];
   __shared__ int jump[MAX_C];
   for (int i = threadIdx.x; i < C; i += blockDim.x) {
     rgb[i] = fill_value[i % C];
@@ -55,8 +55,8 @@ void BatchedPaste(
   const int paste_y = in_out_dims_paste_yx[offset + 4];
   const int paste_x = in_out_dims_paste_yx[offset + 5];
 
-  const uint8* const input_ptr = in_batch[n];
-  uint8 * const output_ptr = out_batch[n];
+  const uint8_t* const input_ptr = in_batch[n];
+  uint8_t * const output_ptr = out_batch[n];
 
   __syncthreads();
 
@@ -112,9 +112,9 @@ void Paste<GPUBackend>::RunHelper(Workspace &ws) {
   BatchedPaste<<<curr_batch_size, PASTE_BLOCKSIZE, 0, ws.stream()>>>(
       curr_batch_size,
       C_,
-      fill_value_.template data<uint8>(),
-      input_ptrs_gpu_.template data<const uint8*>(),
-      output_ptrs_gpu_.template data<uint8*>(),
+      fill_value_.template data<uint8_t>(),
+      input_ptrs_gpu_.template data<const uint8_t*>(),
+      output_ptrs_gpu_.template data<uint8_t*>(),
       in_out_dims_paste_yx_gpu_.template data<int>());
 }
 
@@ -173,10 +173,10 @@ void Paste<GPUBackend>::SetupSampleParams(Workspace &ws) {
   output.SetLayout("HWC");
 
   for (int i = 0; i < curr_batch_size; ++i) {
-      input_ptrs_.template mutable_data<const uint8*>()[i] =
-            input.template tensor<uint8>(i);
-      output_ptrs_.template mutable_data<uint8*>()[i] =
-            output.template mutable_tensor<uint8>(i);
+      input_ptrs_.template mutable_data<const uint8_t*>()[i] =
+            input.template tensor<uint8_t>(i);
+      output_ptrs_.template mutable_data<uint8_t*>()[i] =
+            output.template mutable_tensor<uint8_t>(i);
   }
 
   // Copy pointers on the GPU for fast access