Skip to content

Commit b30cbd9

Browse files
committed
refactor: removing the strict_types and max_batch_size apis
BREAKING CHANGE: This commit removes the strict types and max_batch_size apis. We are doing this because the functionality of these APIs in TRT is convoluted and likely to be ignored during building. A replacement for strict types with actual guarantees will be added at a later date. Signed-off-by: Dheeraj Peri <[email protected]>
1 parent 733a4b1 commit b30cbd9

21 files changed

+3
-91
lines changed

WORKSPACE

+3-3
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,10 @@ http_archive(
8686
http_archive(
8787
name = "tensorrt",
8888
build_file = "@//third_party/tensorrt/archive:BUILD",
89-
sha256 = "3177435024ff4aa5a6dba8c1ed06ab11cc0e1bf3bb712dfa63a43422f41313f3",
90-
strip_prefix = "TensorRT-8.0.3.4",
89+
sha256 = "da130296ac6636437ff8465812eb55dbab0621747d82dc4fe9b9376f00d214af",
90+
strip_prefix = "TensorRT-8.2.2.1",
9191
urls = [
92-
"https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.0.3/tars/tensorrt-8.0.3.4.linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz",
92+
"https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.2.1/tars/tensorrt-8.2.2.1.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz",
9393
],
9494
)
9595

core/conversion/conversionctx/ConversionCtx.cpp

-15
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,12 @@ std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) {
1818
<< "\n Truncate Long and Double: " << s.truncate_long_and_double \
1919
<< "\n Make Refittable Engine: " << s.refit \
2020
<< "\n Debuggable Engine: " << s.debug \
21-
<< "\n Strict Types: " << s.strict_types \
2221
<< "\n GPU ID: " << s.device.gpu_id \
2322
<< "\n Allow GPU Fallback (if running on DLA): " << s.device.allow_gpu_fallback \
2423
<< "\n Min Timing Iterations: " << s.num_min_timing_iters \
2524
<< "\n Avg Timing Iterations: " << s.num_avg_timing_iters \
2625
<< "\n Max Workspace Size: " << s.workspace_size;
2726

28-
if (s.max_batch_size != 0) {
29-
os << "\n Max Batch Size: " << s.max_batch_size;
30-
} else {
31-
os << "\n Max Batch Size: Not set";
32-
}
33-
3427
os << "\n Device Type: " << s.device.device_type \
3528
<< "\n GPU ID: " << s.device.gpu_id;
3629
if (s.device.device_type == nvinfer1::DeviceType::kDLA) {
@@ -107,18 +100,10 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
107100
cfg->setFlag(nvinfer1::BuilderFlag::kDEBUG);
108101
}
109102

110-
if (settings.strict_types) {
111-
cfg->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES);
112-
}
113-
114103
if (settings.device.allow_gpu_fallback) {
115104
cfg->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
116105
}
117106

118-
if (settings.max_batch_size != 0) {
119-
builder->setMaxBatchSize(settings.max_batch_size);
120-
}
121-
122107
cfg->setMinTimingIterations(settings.num_min_timing_iters);
123108
cfg->setAvgTimingIterations(settings.num_avg_timing_iters);
124109
cfg->setMaxWorkspaceSize(settings.workspace_size);

core/conversion/conversionctx/ConversionCtx.h

-2
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,13 @@ struct BuilderSettings {
2929
bool disable_tf32 = false;
3030
bool refit = false;
3131
bool debug = false;
32-
bool strict_types = false;
3332
bool truncate_long_and_double = false;
3433
Device device;
3534
nvinfer1::EngineCapability capability = TRT_ENGINE_CAPABILITY_STANDARD;
3635
nvinfer1::IInt8Calibrator* calibrator = nullptr;
3736
uint64_t num_min_timing_iters = 2;
3837
uint64_t num_avg_timing_iters = 1;
3938
uint64_t workspace_size = 0;
40-
uint64_t max_batch_size = 0;
4139

4240
BuilderSettings() = default;
4341
BuilderSettings(const BuilderSettings& other) = default;

cpp/bin/torchtrtc/README.md

-4
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@ OPTIONS:
3131
--i, --info Dumps info messages generated during
3232
compilation onto the console
3333
--build-debuggable-engine Creates a debuggable engine
34-
--use-strict-types Restrict operating type to only use set
35-
operation precision
3634
--allow-gpu-fallback (Only used when targeting DLA
3735
(device-type)) Lets engine run layers on
3836
GPU if they are not supported on DLA
@@ -90,8 +88,6 @@ OPTIONS:
9088
used to select kernels
9189
--workspace-size=[workspace_size] Maximum size of workspace given to
9290
TensorRT
93-
--max-batch-size=[max_batch_size] Maximum batch size (must be >= 1 to be
94-
set, 0 means not set)
9591
-t[threshold],
9692
--threshold=[threshold] Maximum acceptable numerical deviation
9793
from standard torchscript output

cpp/include/torch_tensorrt/torch_tensorrt.h

-11
Original file line numberDiff line numberDiff line change
@@ -626,12 +626,6 @@ struct TORCHTRT_API CompileSpec {
626626
*/
627627
bool truncate_long_and_double = false;
628628

629-
/**
630-
* Restrict operating type to only the lowest enabled operation precision
631-
* (enabled_precisions)
632-
*/
633-
bool strict_types = false;
634-
635629
/**
636630
* Target Device
637631
*/
@@ -656,11 +650,6 @@ struct TORCHTRT_API CompileSpec {
656650
*/
657651
uint64_t workspace_size = 0;
658652

659-
/**
660-
* Maximum batch size (must be >= 1 to be set, 0 means not set)
661-
*/
662-
uint64_t max_batch_size = 0;
663-
664653
/**
665654
* Calibration dataloaders for each input for post training quantization
666655
*/

cpp/src/compile_spec.cpp

-2
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,7 @@ torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) {
4040
internal.convert_info.engine_settings.refit = external.refit;
4141
internal.convert_info.engine_settings.debug = external.debug;
4242
internal.convert_info.engine_settings.truncate_long_and_double = external.truncate_long_and_double;
43-
internal.convert_info.engine_settings.strict_types = external.strict_types;
4443
internal.convert_info.engine_settings.device.allow_gpu_fallback = external.device.allow_gpu_fallback;
45-
internal.convert_info.engine_settings.max_batch_size = external.max_batch_size;
4644

4745
TORCHTRT_CHECK(
4846
!(external.require_full_compilation && (external.torch_executed_ops.size() > 0)),

docsrc/tutorials/ptq.rst

-1
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,6 @@ to use ``CacheCalibrator`` to use in INT8 mode.
194194
"inputs": [torch_tensorrt.Input([1, 3, 32, 32])],
195195
"enabled_precisions": {torch.float, torch.half, torch.int8},
196196
"calibrator": calibrator,
197-
"max_batch_size": 32,
198197
}
199198
200199
trt_mod = torch_tensorrt.compile(model, compile_settings)

docsrc/tutorials/torchtrtc.rst

-4
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ to standard TorchScript. Load with ``torch.jit.load()`` and run like you would r
3434
--i, --info Dumps info messages generated during
3535
compilation onto the console
3636
--build-debuggable-engine Creates a debuggable engine
37-
--use-strict-types Restrict operating type to only use set
38-
operation precision
3937
--allow-gpu-fallback (Only used when targeting DLA
4038
(device-type)) Lets engine run layers on
4139
GPU if they are not supported on DLA
@@ -93,8 +91,6 @@ to standard TorchScript. Load with ``torch.jit.load()`` and run like you would r
9391
used to select kernels
9492
--workspace-size=[workspace_size] Maximum size of workspace given to
9593
TensorRT
96-
--max-batch-size=[max_batch_size] Maximum batch size (must be >= 1 to be
97-
set, 0 means not set)
9894
-t[threshold],
9995
--threshold=[threshold] Maximum acceptable numerical deviation
10096
from standard torchscript output

docsrc/tutorials/use_from_pytorch.rst

-2
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ at the documentation for the Torch-TensorRT ``TensorRTCompileSpec`` API.
3838
"enabled_precisions": {torch.float, torch.half},
3939
"refit": False,
4040
"debug": False,
41-
"strict_types": False,
4241
"device": {
4342
"device_type": torch_tensorrt.DeviceType.GPU,
4443
"gpu_id": 0,
@@ -48,7 +47,6 @@ at the documentation for the Torch-TensorRT ``TensorRTCompileSpec`` API.
4847
"capability": torch_tensorrt.EngineCapability.default,
4948
"num_min_timing_iters": 2,
5049
"num_avg_timing_iters": 1,
51-
"max_batch_size": 0,
5250
})
5351
}
5452

examples/int8/ptq/main.cpp

-2
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,6 @@ torch::jit::Module compile_int8_model(const std::string& data_dir, torch::jit::M
4949
compile_spec.enabled_precisions.insert(torch::kI8);
5050
/// Use the TensorRT Entropy Calibrator
5151
compile_spec.ptq_calibrator = calibrator;
52-
/// Set max batch size for the engine
53-
compile_spec.max_batch_size = 32;
5452
/// Set a larger workspace
5553
compile_spec.workspace_size = 1 << 28;
5654

examples/int8/qat/main.cpp

-3
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ torch::jit::Module compile_int8_qat_model(const std::string& data_dir, torch::ji
3333
auto compile_spec = torch_tensorrt::ts::CompileSpec(inputs);
3434
/// Set operating precision to INT8
3535
compile_spec.enabled_precisions.insert(torch::kI8);
36-
/// Set max batch size for the engine
37-
compile_spec.max_batch_size = 32;
3836
/// Set a larger workspace
3937
compile_spec.workspace_size = 1 << 28;
4038

@@ -126,4 +124,3 @@ int main(int argc, const char* argv[]) {
126124
print_avg_std_dev("TRT quantized model", trt_runtimes, dims[0][0]);
127125
trt_mod.save("/tmp/qat_vgg16.trt.ts");
128126
}
129-

py/torch_tensorrt/csrc/register_tensorrt_classes.cpp

-2
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,12 @@ void RegisterTRTCompileSpec() {
5959
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, disable_tf32);
6060
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, refit);
6161
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, debug);
62-
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, strict_types);
6362
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, capability);
6463
ADD_FIELD_GET_SET_REGISTRATION(
6564
TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, num_min_timing_iters);
6665
ADD_FIELD_GET_SET_REGISTRATION(
6766
TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, num_avg_timing_iters);
6867
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, workspace_size);
69-
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, max_batch_size);
7068
ADD_FIELD_GET_SET_REGISTRATION(
7169
TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, truncate_long_and_double);
7270
}

py/torch_tensorrt/csrc/tensorrt_classes.cpp

-5
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,6 @@ core::CompileSpec CompileSpec::toInternalCompileSpec() {
209209
info.convert_info.engine_settings.disable_tf32 = disable_tf32;
210210
info.convert_info.engine_settings.refit = refit;
211211
info.convert_info.engine_settings.debug = debug;
212-
info.convert_info.engine_settings.strict_types = strict_types;
213212
info.convert_info.engine_settings.device.device_type = toTRTDeviceType(device.device_type);
214213
info.convert_info.engine_settings.device.gpu_id = device.gpu_id;
215214
info.convert_info.engine_settings.device.dla_core = device.dla_core;
@@ -227,8 +226,6 @@ core::CompileSpec CompileSpec::toInternalCompileSpec() {
227226
info.convert_info.engine_settings.num_avg_timing_iters = num_avg_timing_iters;
228227
TORCHTRT_CHECK(workspace_size >= 0, "workspace_size must be 0 or greater");
229228
info.convert_info.engine_settings.workspace_size = workspace_size;
230-
TORCHTRT_CHECK(max_batch_size >= 0, "max_batch_size must be 0 or greater");
231-
info.convert_info.engine_settings.max_batch_size = max_batch_size;
232229
return info;
233230
}
234231

@@ -249,13 +246,11 @@ std::string CompileSpec::stringify() {
249246
ss << " \"Sparsity\": " << sparse_weights << std::endl;
250247
ss << " \"Refit\": " << refit << std::endl;
251248
ss << " \"Debug\": " << debug << std::endl;
252-
ss << " \"Strict Types\": " << strict_types << std::endl;
253249
ss << " \"Device\": " << device.to_str() << std::endl;
254250
ss << " \"Engine Capability\": " << to_str(capability) << std::endl;
255251
ss << " \"Num Min Timing Iters\": " << num_min_timing_iters << std::endl;
256252
ss << " \"Num Avg Timing Iters\": " << num_avg_timing_iters << std::endl;
257253
ss << " \"Workspace Size\": " << workspace_size << std::endl;
258-
ss << " \"Max Batch Size\": " << max_batch_size << std::endl;
259254
ss << " \"Truncate long and double\": " << truncate_long_and_double << std::endl;
260255
ss << " \"Torch Fallback\": " << torch_fallback.to_str();
261256
ss << "}";

py/torch_tensorrt/csrc/tensorrt_classes.h

-4
Original file line numberDiff line numberDiff line change
@@ -146,13 +146,11 @@ struct CompileSpec : torch::CustomClassHolder {
146146
ADD_FIELD_GET_SET(sparse_weights, bool);
147147
ADD_FIELD_GET_SET(refit, bool);
148148
ADD_FIELD_GET_SET(debug, bool);
149-
ADD_FIELD_GET_SET(strict_types, bool);
150149
ADD_ENUM_GET_SET(capability, EngineCapability, static_cast<int64_t>(EngineCapability::kSAFE_DLA));
151150
ADD_FIELD_GET_SET(num_min_timing_iters, int64_t);
152151
ADD_FIELD_GET_SET(num_avg_timing_iters, int64_t);
153152
ADD_FIELD_GET_SET(workspace_size, int64_t);
154153
ADD_FIELD_GET_SET(truncate_long_and_double, bool);
155-
ADD_FIELD_GET_SET(max_batch_size, int64_t);
156154
ADD_FIELD_GET_SET(device, Device);
157155
ADD_FIELD_GET_SET(torch_fallback, TorchFallback);
158156
ADD_FIELD_GET_SET(ptq_calibrator, nvinfer1::IInt8Calibrator*);
@@ -164,15 +162,13 @@ struct CompileSpec : torch::CustomClassHolder {
164162
bool disable_tf32 = false;
165163
bool refit = false;
166164
bool debug = false;
167-
bool strict_types = false;
168165
bool truncate_long_and_double = false;
169166
Device device;
170167
TorchFallback torch_fallback;
171168
EngineCapability capability = EngineCapability::kDEFAULT;
172169
int64_t num_min_timing_iters = 2;
173170
int64_t num_avg_timing_iters = 1;
174171
int64_t workspace_size = 0;
175-
int64_t max_batch_size = 0;
176172
};
177173

178174
} // namespace pyapi

py/torch_tensorrt/csrc/torch_tensorrt_py.cpp

-2
Original file line numberDiff line numberDiff line change
@@ -298,13 +298,11 @@ PYBIND11_MODULE(_C, m) {
298298
.def_readwrite("sparse_weights", &CompileSpec::sparse_weights)
299299
.def_readwrite("disable_tf32", &CompileSpec::disable_tf32)
300300
.def_readwrite("debug", &CompileSpec::debug)
301-
.def_readwrite("strict_types", &CompileSpec::strict_types)
302301
.def_readwrite("device", &CompileSpec::device)
303302
.def_readwrite("capability", &CompileSpec::capability)
304303
.def_readwrite("num_min_timing_iters", &CompileSpec::num_min_timing_iters)
305304
.def_readwrite("num_avg_timing_iters", &CompileSpec::num_avg_timing_iters)
306305
.def_readwrite("workspace_size", &CompileSpec::workspace_size)
307-
.def_readwrite("max_batch_size", &CompileSpec::max_batch_size)
308306
.def_readwrite("torch_fallback", &CompileSpec::torch_fallback)
309307
.def_readwrite("truncate_long_and_double", &CompileSpec::truncate_long_and_double);
310308

py/torch_tensorrt/ts/_compile_spec.py

-16
Original file line numberDiff line numberDiff line change
@@ -196,10 +196,6 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec:
196196
assert isinstance(compile_spec["debug"], bool)
197197
info.debug = compile_spec["debug"]
198198

199-
if "strict_types" in compile_spec:
200-
assert isinstance(compile_spec["strict_types"], bool)
201-
info.strict_types = compile_spec["strict_types"]
202-
203199
if "device" in compile_spec:
204200
info.device = _parse_device(compile_spec["device"])
205201

@@ -219,10 +215,6 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec:
219215
assert type(compile_spec["workspace_size"]) is int
220216
info.workspace_size = compile_spec["workspace_size"]
221217

222-
if "max_batch_size" in compile_spec:
223-
assert type(compile_spec["max_batch_size"]) is int
224-
info.max_batch_size = compile_spec["max_batch_size"]
225-
226218
if "truncate_long_and_double" in compile_spec:
227219
assert type(compile_spec["truncate_long_and_double"]) is bool
228220
info.truncate_long_and_double = compile_spec["truncate_long_and_double"]
@@ -240,12 +232,10 @@ def TensorRTCompileSpec(inputs=[],
240232
enabled_precisions=set(),
241233
refit=False,
242234
debug=False,
243-
strict_types=False,
244235
capability=_enums.EngineCapability.default,
245236
num_min_timing_iters=2,
246237
num_avg_timing_iters=1,
247238
workspace_size=0,
248-
max_batch_size=0,
249239
truncate_long_and_double=False,
250240
calibrator=None) -> torch.classes.tensorrt.CompileSpec:
251241
"""Utility to create a formatted spec dictionary for using the PyTorch TensorRT backend
@@ -276,12 +266,10 @@ def TensorRTCompileSpec(inputs=[],
276266
enabled_precision (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
277267
refit (bool): Enable refitting
278268
debug (bool): Enable debuggable engine
279-
strict_types (bool): Kernels should strictly run in a particular operating precision. Enabled precision should only have one type in the set
280269
capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
281270
num_min_timing_iters (int): Number of minimization timing iterations used to select kernels
282271
num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
283272
workspace_size (int): Maximum size of workspace given to TensorRT
284-
max_batch_size (int): Maximum batch size (must be >= 1 to be set, 0 means not set)
285273
truncate_long_and_double (bool): Truncate weights provided in int64 or double (float64) to int32 and float32
286274
calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration
287275
@@ -298,12 +286,10 @@ def TensorRTCompileSpec(inputs=[],
298286
"enabled_precisions": enabled_precisions, # Enabling FP16 kernels
299287
"refit": refit, # enable refit
300288
"debug": debug, # enable debuggable engine
301-
"strict_types": strict_types, # kernels should strictly run in operating precision
302289
"capability": capability, # Restrict kernel selection to safe gpu kernels or safe dla kernels
303290
"num_min_timing_iters": num_min_timing_iters, # Number of minimization timing iterations used to select kernels
304291
"num_avg_timing_iters": num_avg_timing_iters, # Number of averaging timing iterations used to select kernels
305292
"workspace_size": workspace_size, # Maximum size of workspace given to TensorRT
306-
"max_batch_size": max_batch_size, # Maximum batch size (must be >= 1 to be set, 0 means not set)
307293
"calibrator": calibrator,
308294
"truncate_long_and_double": truncate_long_and_double
309295
}
@@ -348,12 +334,10 @@ def TensorRTCompileSpec(inputs=[],
348334
backend_spec._set_refit(parsed_spec.refit)
349335
backend_spec._set_debug(parsed_spec.debug)
350336
backend_spec._set_refit(parsed_spec.refit)
351-
backend_spec._set_strict_types(parsed_spec.strict_types)
352337
backend_spec._set_capability(int(parsed_spec.capability))
353338
backend_spec._set_num_min_timing_iters(parsed_spec.num_min_timing_iters)
354339
backend_spec._set_num_avg_timing_iters(parsed_spec.num_avg_timing_iters)
355340
backend_spec._set_workspace_size(parsed_spec.workspace_size)
356-
backend_spec._set_max_batch_size(parsed_spec.max_batch_size)
357341
backend_spec._set_truncate_long_and_double(parsed_spec.truncate_long_and_double)
358342
backend_spec._set_ptq_calibrator(parsed_spec._get_calibrator_handle())
359343

0 commit comments

Comments
 (0)