apache · ConvolutedDog · May 14, 2026
diff --git a/python/tvm/target/detect_target.py b/python/tvm/target/detect_target.py
@@ -41,6 +41,7 @@ def _detect_cuda(dev: Device) -> Target:
             "max_threads_per_block": dev.max_threads_per_block,
             "thread_warp_size": dev.warp_size,
             "arch": "sm_" + dev.compute_version.replace(".", ""),
+            "enable_fast_math": False,
         }
     )
 

diff --git a/python/tvm/target/tag_registry/cuda.py b/python/tvm/target/tag_registry/cuda.py
@@ -28,12 +28,14 @@ def _register_cuda_tag(name, arch, shared_mem=49152, regs=65536, **extra):
         "max_threads_per_block": 1024,
         "thread_warp_size": 32,
         "registers_per_block": regs,
+        # Fast math is disabled by default; entries in `extra` may override this.
+        "enable_fast_math": False,
     }
     config.update(extra)
     register_tag(name, config)
 
 
-def _register_jetson_tag(name, arch, mcpu, num_cores, regs=65536):
+def _register_jetson_tag(name, arch, mcpu, num_cores, regs=65536, enable_fast_math=False):
     register_tag(
         name,
         {
@@ -49,6 +51,7 @@ def _register_jetson_tag(name, arch, mcpu, num_cores, regs=65536):
                 "mcpu": mcpu,
                 "num-cores": num_cores,
             },
+            "enable_fast_math": enable_fast_math,
         },
     )
 

diff --git a/src/target/cuda/intrin_rule_cuda.cc b/src/target/cuda/intrin_rule_cuda.cc
@@ -174,37 +174,46 @@ TVM_REGISTER_OP("tirx.nearbyint")
     .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.exp")
-    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>);
+    .set_attr<FLowerIntrinsic>("cuda.fastmath.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>)
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.exp2")
     .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.exp10")
-    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>);
+    .set_attr<FLowerIntrinsic>("cuda.fastmath.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>)
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.erf")
     .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.log")
-    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>);
+    .set_attr<FLowerIntrinsic>("cuda.fastmath.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>)
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.log2")
-    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>);
+    .set_attr<FLowerIntrinsic>("cuda.fastmath.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>)
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.log10")
-    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>);
+    .set_attr<FLowerIntrinsic>("cuda.fastmath.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>)
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.tan")
-    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAFastMathTan>);
+    // Currently, the fast-math and the default lowering of tan are the same.
+    .set_attr<FLowerIntrinsic>("cuda.fastmath.FLowerIntrinsic", DispatchPureExtern<CUDAFastMathTan>)
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.cos")
-    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>);
+    .set_attr<FLowerIntrinsic>("cuda.fastmath.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>)
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.cosh")
     .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.sin")
-    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>);
+    .set_attr<FLowerIntrinsic>("cuda.fastmath.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>)
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.sinh")
     .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
@@ -213,12 +222,17 @@ TVM_REGISTER_OP("tirx.atan")
     .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.tanh")
+    .set_attr<FLowerIntrinsic>("cuda.fastmath.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>)
     .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.sqrt")
     .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
+TVM_REGISTER_OP("tirx.rsqrt")
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
+
 TVM_REGISTER_OP("tirx.pow")
+    .set_attr<FLowerIntrinsic>("cuda.fastmath.FLowerIntrinsic", DispatchPureExtern<CUDAFastMath>)
     .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);
 
 TVM_REGISTER_OP("tirx.popcount")

diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
@@ -183,6 +183,14 @@ ffi::Map<ffi::String, ffi::Any> UpdateCUDAAttrs(ffi::Map<ffi::String, ffi::Any>
     }
     target.Set("arch", ffi::String("sm_") + std::to_string(archInt));
   }
+  // Validate enable_fast_math when it is given; otherwise default it to false.
+  if (target.count("enable_fast_math")) {
+    // If enable_fast_math has been specified, validate that enable_fast_math is a bool
+    Downcast<bool>(target.at("enable_fast_math"));
+  } else {
+    // If enable_fast_math has not been specified, default to false
+    target.Set("enable_fast_math", false);
+  }
   return target;
 }
 
@@ -367,6 +375,7 @@ TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA)
     .add_attr_option<int64_t>("l2_cache_size_bytes")
     .add_attr_option<int64_t>("max_num_threads",
                               refl::DefaultValue(1024))  // TODO(@zxybazh): deprecate it
+    .add_attr_option<bool>("enable_fast_math")
     .set_default_keys({"cuda", "gpu"})
     .set_target_canonicalizer(UpdateCUDAAttrs);
 

diff --git a/src/tirx/transform/lower_intrin.cc b/src/tirx/transform/lower_intrin.cc
@@ -46,11 +46,21 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
   using IRMutatorWithAnalyzer::VisitStmt_;
   using FLowerGeneral = ffi::TypedFunction<PrimExpr(PrimExpr)>;
 
-  IntrinInjecter(arith::Analyzer* analyzer, std::string target, std::string mtriple = "")
-      : IRMutatorWithAnalyzer(analyzer) {
+  IntrinInjecter(arith::Analyzer* analyzer, const Target& tgt) : IRMutatorWithAnalyzer(analyzer) {
+    std::string target = tgt->kind->name;
+    ffi::String mtriple = tgt->GetAttr<ffi::String>("mtriple").value_or("");
+
     std::vector<std::string> patterns;
+    // Add the fast-math patterns when the target sets enable_fast_math (a CUDA target option).
+    // They are added first so that they take priority over the normal patterns.
+    bool is_fast_math = tgt->GetAttr<bool>("enable_fast_math").value_or(false);
+    if (is_fast_math) {
+      patterns.push_back(target + ".fastmath.FLowerIntrinsic");
+      patterns.push_back(target + ".fastmath.FLegalize");
+    }
     patterns.push_back(target + ".FLowerIntrinsic");
     patterns.push_back(target + ".FLegalize");
+
     bool is_llvm_aarch64 = (mtriple.find("aarch64") != std::string::npos);
     if (is_llvm_aarch64) {
       patterns.push_back(target + ".aarch64.FLowerIntrinsic");
@@ -354,7 +364,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
 
 Stmt LowerIntrinStmt(Stmt stmt, const std::string& target) {
   arith::Analyzer analyzer;
-  return IntrinInjecter(&analyzer, target)(std::move(stmt));
+  return IntrinInjecter(&analyzer, Target(ffi::String(target)))(std::move(stmt));
 }
 
 namespace transform {
@@ -365,9 +375,7 @@ Pass LowerIntrin() {
     auto target = f->GetAttr<Target>(tvm::attr::kTarget);
     TVM_FFI_ICHECK(target.defined()) << "LowerIntrin: Require the target attribute";
     arith::Analyzer analyzer;
-    auto mtriple = target.value()->GetAttr<ffi::String>("mtriple", "");
-    n->body =
-        IntrinInjecter(&analyzer, target.value()->kind->name, mtriple.value())(std::move(n->body));
+    n->body = IntrinInjecter(&analyzer, target.value())(std::move(n->body));
     return f;
   };
   return CreatePrimFuncPass(pass_func, 0, "tirx.LowerIntrin", {});