diff --git a/xla/lit.bzl b/xla/lit.bzl index 102ea787ac119..936db23d5c8f3 100644 --- a/xla/lit.bzl +++ b/xla/lit.bzl @@ -246,6 +246,8 @@ def lit_test( ) test_file = output_file + env["LD_LIBRARY_PATH"] = lib_dir + native_test( name = name, src = lit_name, diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD index a35ab7cab957a..c166653cd122c 100644 --- a/xla/service/gpu/BUILD +++ b/xla/service/gpu/BUILD @@ -146,7 +146,7 @@ cc_library( xla_test( name = "custom_call_test", - srcs = if_gpu_is_configured(["custom_call_test.cc"]), + srcs = ["custom_call_test.cc"], backends = ["gpu"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), deps = [ @@ -278,6 +278,7 @@ cc_library( name = "ir_emitter_context", srcs = ["ir_emitter_context.cc"], hdrs = ["ir_emitter_context.h"], + tags = ["gpu"], deps = [ ":execution_stream_assignment", ":gpu_constants", @@ -308,6 +309,7 @@ cc_library( ]) + if_rocm_hipblaslt([ "TF_HIPBLASLT=1", ]), + tags = ["gpu"], deps = [ ":backend_configs_cc", ":cublas_cudnn", @@ -350,12 +352,14 @@ cc_library( "//xla/service/gpu/kernels:custom_kernel", "//xla/service/gpu/kernels:topk_custom_kernel", "//xla/service/gpu/model:tiled_hlo_instruction_or_computation", + "//xla/service/gpu/runtime:cholesky_thunk", "//xla/service/gpu/runtime:command_buffer_cmd", "//xla/service/gpu/runtime:command_buffer_cmd_emitter", "//xla/service/gpu/runtime:command_buffer_thunk", "//xla/service/gpu/runtime:conditional_thunk", "//xla/service/gpu/runtime:convolution_thunk", "//xla/service/gpu/runtime:copy_thunk", + "//xla/service/gpu/runtime:cub_sort_thunk", "//xla/service/gpu/runtime:cudnn_thunk", "//xla/service/gpu/runtime:custom_call_thunk", "//xla/service/gpu/runtime:fft_thunk", @@ -367,6 +371,7 @@ cc_library( "//xla/service/gpu/runtime:nccl_all_reduce_thunk", "//xla/service/gpu/runtime:nccl_all_to_all_thunk", "//xla/service/gpu/runtime:nccl_api", + "//xla/service/gpu/runtime:nccl_clique_key", "//xla/service/gpu/runtime:nccl_collective_broadcast_thunk", "//xla/service/gpu/runtime:nccl_collective_permute_thunk", "//xla/service/gpu/runtime:nccl_collective_thunk", @@ -379,6 +384,7 @@ cc_library( "//xla/service/gpu/runtime:send_recv_thunk", "//xla/service/gpu/runtime:sequential_thunk", "//xla/service/gpu/runtime:thunk", + "//xla/service/gpu/runtime:triangular_solve_thunk", "//xla/service/gpu/runtime:wait_for_streams_thunk", "//xla/service/gpu/runtime:while_thunk", "//xla/service/llvm_ir:buffer_assignment_util", @@ -390,6 +396,7 @@ cc_library( "//xla/service/llvm_ir:sort_util", "//xla/stream_executor:device_description", "//xla/stream_executor:launch_dim", + "//xla/stream_executor:stream_executor_h", "//xla/stream_executor/gpu:gpu_blas_lt", "//xla/stream_executor/integrations:device_mem_allocator", "//xla/tsl/protobuf:dnn_proto_cc", @@ -421,11 +428,7 @@ cc_library( "@tsl//tsl/platform:errors", "@tsl//tsl/platform:human_readable_json", "@tsl//tsl/platform:statusor", - ] + if_gpu_is_configured([ - "//xla/service/gpu/runtime:cholesky_thunk", - "//xla/service/gpu/runtime:cub_sort_thunk", - "//xla/service/gpu/runtime:triangular_solve_thunk", - ]) + if_rocm_is_configured([ + ] + if_rocm_is_configured([ "@local_config_rocm//rocm:rocm_headers", ]), ) @@ -442,7 +445,7 @@ cc_library( "ir_emitter.h", "ir_emitter_nested.h", ], - copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":backend_configs_cc", ":hlo_to_ir_bindings", @@ -486,11 +489,9 @@ cc_library( cc_library( name = "triton_call", - srcs = if_gpu_is_configured(["triton_call.cc"]), + srcs = ["triton_call.cc"], hdrs = 
["triton_call.h"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + tags = ["gpu"], deps = [ "@llvm-project//mlir:AsmParser", "@llvm-project//mlir:IR", @@ -553,15 +554,14 @@ cc_library( hdrs = [ "gpu_executable.h", ], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + tags = ["gpu"], deps = [ ":backend_configs_cc", ":buffer_allocations", ":gpu_constants", ":gpu_executable_run_options", ":ir_emission_utils", + ":make_batch_pointers", ":stream_executor_util", "//xla:executable_run_options", "//xla:shape_tree", @@ -621,13 +621,8 @@ cc_library( "@tsl//tsl/profiler/lib:scoped_annotation", "@tsl//tsl/profiler/lib:traceme", ] + if_cuda_is_configured([ - "//xla/stream_executor/cuda:cublas_plugin", - "//xla/stream_executor/cuda:cudnn_plugin", - "//xla/stream_executor/cuda:cufft_plugin", - "//xla/stream_executor/cuda:stream_executor_cuda", "@local_config_cuda//cuda:cuda_headers", ]) + if_rocm_is_configured([ - "//xla/stream_executor/rocm:stream_executor_rocm", "@local_config_rocm//rocm:rocm_headers", ]), ) @@ -766,16 +761,17 @@ cc_library( build_cub_sort_kernels( name = "cub_sort_kernel", - srcs = if_gpu_is_configured(["cub_sort_kernel.cu.cc"]), - hdrs = if_gpu_is_configured(["cub_sort_kernel.h"]), + srcs = ["cub_sort_kernel.cu.cc"], + hdrs = ["cub_sort_kernel.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ "TENSORFLOW_USE_ROCM=1", ]), + tags = ["gpu"], types = get_cub_sort_kernel_types(), - deps = if_gpu_is_configured([ + deps = [ ":gpu_prim", "//xla/stream_executor/gpu:gpu_types_header", - ]), + ], ) cc_library( @@ -921,9 +917,7 @@ cc_library( srcs = ["matmul_utils.cc"], hdrs = ["matmul_utils.h"], compatible_with = get_compatible_with_portable(), - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), deps = [ ":backend_configs_cc", ":ir_emission_utils", @@ -938,6 +932,7 @@ cc_library( "//xla/service:algorithm_util", "//xla/stream_executor", "//xla/stream_executor:blas", + "//xla/stream_executor:host_or_device_scalar", "//xla/stream_executor:numeric_options", "//xla/stream_executor/gpu:gpu_blas_lt", "@com_google_absl//absl/algorithm:container", @@ -950,10 +945,7 @@ cc_library( "@tsl//tsl/platform:errors", "@tsl//tsl/platform:status", "@tsl//tsl/platform:statusor", - ] + if_gpu_is_configured([ - #keep sorted - "//xla/stream_executor:host_or_device_scalar", - ]) + if_cuda_is_configured([ + ] + if_cuda_is_configured([ #keep sorted "//xla/stream_executor/cuda:cublas_lt_header", "//xla/stream_executor/cuda:cublas_plugin", @@ -1034,9 +1026,7 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@tsl//tsl/platform:statusor", - ] + if_cuda_is_configured([ - "@local_config_cuda//cuda:cuda_headers", - ]), + ], ) cc_library( @@ -1253,7 +1243,7 @@ cc_library( hdrs = [ "compile_module_to_llvm_ir.h", ], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":executable_proto_cc", ":execution_stream_assignment", @@ -1305,6 +1295,7 @@ cc_library( name = "fusion_pipeline", srcs = ["fusion_pipeline.cc"], hdrs = ["fusion_pipeline.h"], + tags = ["gpu"], deps = [ "//xla:xla_proto_cc", "//xla/hlo/pass:hlo_pass", @@ -1353,14 +1344,14 @@ cc_library( cc_library( name = "gpu_compiler", - srcs = if_gpu_is_configured([ + srcs = [ "gpu_compiler.cc", - ]), - 
hdrs = if_gpu_is_configured([ + ], + hdrs = [ "gpu_compiler.h", - ]), - deps = if_gpu_is_configured([ - # go/keep-sorted start prefix_order=":,, + ], + tags = ["gpu"], + deps = [ ":buffer_sharing", ":compile_module_to_llvm_ir", ":conv_layout_normalization", @@ -1387,27 +1378,14 @@ cc_library( ":reduction_utils", ":runtime_intrinsics", ":stream_executor_util", - "@com_google_absl//absl/base", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@com_google_absl//absl/types:variant", - "@llvm-project//llvm:AsmParser", - "@llvm-project//llvm:BitReader", - "@llvm-project//llvm:BitWriter", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:TransformUtils", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", + "//xla:autotune_results_proto_cc", + "//xla:debug_options_flags", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla:xla_proto_cc", "//xla/hlo/ir:hlo", "//xla/hlo/ir:hlo_module_group", "//xla/hlo/pass:hlo_pass", @@ -1415,58 +1393,6 @@ cc_library( "//xla/hlo/translate/hlo_to_mhlo:hlo_utils", "//xla/hlo/translate/mhlo_to_hlo:location_exporter", "//xla/pjrt/distributed:key_value_store_interface", - "//xla/service/gpu/autotuning:autotuner_util", - "//xla/service/gpu/autotuning:custom_kernel_fusion_autotuner", - "//xla/service/gpu/fusions/triton:triton_support", - "//xla/service/gpu/model:gpu_cost_model_stats_collection", - "//xla/service/gpu/model:gpu_hlo_cost_analysis", - "//xla/service/gpu/runtime:thunk", - "//xla/service/gpu/transforms:algebraic_simplifier", - "//xla/service/gpu/transforms:algorithm_checker", - "//xla/service/gpu/transforms:all_gather_optimizer", - "//xla/service/gpu/transforms:all_reduce_blueconnect", - "//xla/service/gpu/transforms:all_reduce_splitter", - "//xla/service/gpu/transforms:async_collective_annotator", - "//xla/service/gpu/transforms:async_wrapper", - "//xla/service/gpu/transforms:collective_permute_cycle_decomposer", - "//xla/service/gpu/transforms:collective_permute_valid_iteration_annotator", - "//xla/service/gpu/transforms:command_buffer_scheduling", - "//xla/service/gpu/transforms:conv_rewriter", - "//xla/service/gpu/transforms:convert_async_collectives_to_sync", - "//xla/service/gpu/transforms:cudnn_custom_call_converter", - "//xla/service/gpu/transforms:custom_kernel_fusion_rewriter", - "//xla/service/gpu/transforms:dot_dimension_sorter", - "//xla/service/gpu/transforms:dot_operand_converter", - "//xla/service/gpu/transforms:double_buffer_loop_unrolling", - "//xla/service/gpu/transforms:dynamic_slice_fusion_rewriter", - "//xla/service/gpu/transforms:fusion_wrapper", - "//xla/service/gpu/transforms:gemm_broadcast_folding_rewriter", - "//xla/service/gpu/transforms:gemm_fusion", - "//xla/service/gpu/transforms:gemm_rewriter", - "//xla/service/gpu/transforms:gemv_rewriter", - "//xla/service/gpu/transforms:layout_assignment", - "//xla/service/gpu/transforms:move_copy_to_users", - "//xla/service/gpu/transforms:pipelined_p2p_rewriter", - "//xla/service/gpu/transforms:reduce_scatter_creator", - "//xla/service/gpu/transforms:reduction_degenerate_dim_remover", - 
"//xla/service/gpu/transforms:reduction_dimension_grouper", - "//xla/service/gpu/transforms:reduction_layout_normalizer", - "//xla/service/gpu/transforms:reduction_splitter", - "//xla/service/gpu/transforms:rename_fusions", - "//xla/service/gpu/transforms:sanitize_constant_names", - "//xla/service/gpu/transforms:scatter_expander", - "//xla/service/gpu/transforms:scatter_slice_simplifier", - "//xla/service/gpu/transforms:softmax_rewriter_triton", - "//xla/service/gpu/transforms:stream_attribute_annotator", - "//xla/service/gpu/transforms:stream_attribute_async_wrapper", - "//xla/service/gpu/transforms:topk_specializer", - "//xla/service/gpu/transforms:topk_splitter", - "//xla/service/gpu/transforms:transpose_dimension_grouper", - "//xla/service/gpu/transforms:tree_reduction_rewriter", - "//xla/service/gpu/transforms:triton_fusion_numerics_verifier", - "//xla/service/gpu/transforms:windowed_einsum_handler", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/spmd:collective_permute_motion", "//xla/service:algebraic_simplifier", "//xla/service:all_gather_broadcast_reorder", "//xla/service:all_gather_combiner", @@ -1565,23 +1491,86 @@ cc_library( "//xla/service:while_loop_simplifier", "//xla/service:while_loop_trip_count_annotator", "//xla/service:zero_sized_hlo_elimination", + "//xla/service/gpu/autotuning:autotuner_util", + "//xla/service/gpu/autotuning:custom_kernel_fusion_autotuner", + "//xla/service/gpu/fusions/triton:triton_support", + "//xla/service/gpu/model:gpu_cost_model_stats_collection", + "//xla/service/gpu/model:gpu_hlo_cost_analysis", + "//xla/service/gpu/runtime:thunk", + "//xla/service/gpu/transforms:algebraic_simplifier", + "//xla/service/gpu/transforms:algorithm_checker", + "//xla/service/gpu/transforms:all_gather_optimizer", + "//xla/service/gpu/transforms:all_reduce_blueconnect", + "//xla/service/gpu/transforms:all_reduce_splitter", + "//xla/service/gpu/transforms:async_collective_annotator", + "//xla/service/gpu/transforms:async_wrapper", + "//xla/service/gpu/transforms:collective_permute_cycle_decomposer", + "//xla/service/gpu/transforms:collective_permute_valid_iteration_annotator", + "//xla/service/gpu/transforms:command_buffer_scheduling", + "//xla/service/gpu/transforms:conv_rewriter", + "//xla/service/gpu/transforms:convert_async_collectives_to_sync", + "//xla/service/gpu/transforms:cudnn_custom_call_converter", + "//xla/service/gpu/transforms:custom_kernel_fusion_rewriter", + "//xla/service/gpu/transforms:dot_dimension_sorter", + "//xla/service/gpu/transforms:dot_operand_converter", + "//xla/service/gpu/transforms:double_buffer_loop_unrolling", + "//xla/service/gpu/transforms:dynamic_slice_fusion_rewriter", + "//xla/service/gpu/transforms:fusion_wrapper", + "//xla/service/gpu/transforms:gemm_broadcast_folding_rewriter", + "//xla/service/gpu/transforms:gemm_fusion", + "//xla/service/gpu/transforms:gemm_rewriter", + "//xla/service/gpu/transforms:gemv_rewriter", + "//xla/service/gpu/transforms:layout_assignment", + "//xla/service/gpu/transforms:move_copy_to_users", + "//xla/service/gpu/transforms:pipelined_p2p_rewriter", + "//xla/service/gpu/transforms:reduce_scatter_creator", + "//xla/service/gpu/transforms:reduction_degenerate_dim_remover", + "//xla/service/gpu/transforms:reduction_dimension_grouper", + "//xla/service/gpu/transforms:reduction_layout_normalizer", + "//xla/service/gpu/transforms:reduction_splitter", + "//xla/service/gpu/transforms:rename_fusions", + "//xla/service/gpu/transforms:sanitize_constant_names", + 
"//xla/service/gpu/transforms:scatter_expander", + "//xla/service/gpu/transforms:scatter_slice_simplifier", + "//xla/service/gpu/transforms:softmax_rewriter_triton", + "//xla/service/gpu/transforms:stream_attribute_annotator", + "//xla/service/gpu/transforms:stream_attribute_async_wrapper", + "//xla/service/gpu/transforms:topk_specializer", + "//xla/service/gpu/transforms:topk_splitter", + "//xla/service/gpu/transforms:transpose_dimension_grouper", + "//xla/service/gpu/transforms:tree_reduction_rewriter", + "//xla/service/gpu/transforms:triton_fusion_numerics_verifier", + "//xla/service/gpu/transforms:windowed_einsum_handler", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/spmd:collective_permute_motion", "//xla/stream_executor", - "//xla/stream_executor/gpu:gpu_driver_header", - "//xla/stream_executor/integrations:device_mem_allocator", "//xla/stream_executor:device_description", "//xla/stream_executor:device_description_proto_cc", "//xla/stream_executor:dnn", "//xla/stream_executor:platform_manager", "//xla/stream_executor:semantic_version", "//xla/tsl/lib/monitoring:counter", - "//xla:autotune_results_proto_cc", - "//xla:debug_options_flags", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla:xla_proto_cc", + "@com_google_absl//absl/base", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@com_google_absl//absl/types:variant", + "@llvm-project//llvm:AsmParser", + "@llvm-project//llvm:BitReader", + "@llvm-project//llvm:BitWriter", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:TransformUtils", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "@tsl//tsl/platform:blocking_counter", "@tsl//tsl/platform:casts", "@tsl//tsl/platform:env", @@ -1594,8 +1583,7 @@ cc_library( "@tsl//tsl/platform:statusor", "@tsl//tsl/profiler/lib:scoped_annotation", "@tsl//tsl/profiler/lib:traceme", - # go/keep-sorted end - ]) + xla_internal(["service:export_hlo"]) + if_google([ + ] + xla_internal(["service:export_hlo"]) + if_google([ "//xla/hlo/experimental/auto_sharding", ]), ) @@ -1909,12 +1897,9 @@ xla_test( xla_cc_test( name = "gpu_aot_compilation_test", - srcs = if_gpu_is_configured([ + srcs = [ "gpu_aot_compilation_test.cc", - ]), - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + ], tags = [ "gpu", "no_oss", @@ -1957,6 +1942,7 @@ cc_library( tags = [ "gpu", "manual", + "rocm-only", ], deps = [ ":amdgpu_compiler_impl", @@ -1977,6 +1963,7 @@ cc_library( tags = [ "gpu", "manual", + "rocm-only", ], deps = [ ":cublas_padding_requirements", @@ -2021,13 +2008,11 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@llvm-project//llvm:ir_headers", + "@local_config_rocm//rocm:rocm_headers", "@tsl//tsl/platform:env", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:statusor", - ] + if_rocm_is_configured([ - # keep sorted - "@local_config_rocm//rocm:rocm_headers", - ]), + ], ) cc_library( @@ -2076,6 +2061,7 @@ cc_library( name = "gpu_hlo_schedule", srcs = ["gpu_hlo_schedule.cc"], hdrs = 
["gpu_hlo_schedule.h"], + tags = ["gpu"], deps = [ ":backend_configs_cc", ":gpu_latency_hiding_scheduler", @@ -2386,13 +2372,13 @@ xla_cc_test( cc_library( name = "buffer_comparator", - srcs = if_gpu_is_configured(["buffer_comparator.cc"]), - hdrs = if_gpu_is_configured(["buffer_comparator.h"]), + srcs = ["buffer_comparator.cc"], + hdrs = ["buffer_comparator.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ "TENSORFLOW_USE_ROCM=1", ]), - deps = if_gpu_is_configured([ - # keep sorted + tags = ["gpu"], + deps = [ ":buffer_comparator_kernel", ":gpu_asm_opts_util", ":launch_dimensions", @@ -2412,7 +2398,7 @@ cc_library( "@tsl//tsl/platform:logging", "@tsl//tsl/platform:ml_dtypes", "@tsl//tsl/platform:statusor", - ]) + if_rocm_is_configured([ + ] + if_rocm_is_configured([ # keep sorted "@local_config_rocm//rocm:rocm_headers", ]), @@ -2420,7 +2406,7 @@ cc_library( gpu_kernel_library( name = "buffer_comparator_kernel", - srcs = if_gpu_is_configured(["buffer_comparator.cu.cc"]), + srcs = ["buffer_comparator.cu.cc"], copts = rocm_copts(), local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ "TENSORFLOW_USE_ROCM=1", @@ -2434,28 +2420,24 @@ gpu_kernel_library( xla_test( name = "buffer_comparator_test", - srcs = if_gpu_is_configured(["buffer_comparator_test.cc"]), + srcs = ["buffer_comparator_test.cc"], backends = ["gpu"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), deps = [ + ":buffer_comparator", ":stream_executor_util", "//xla:shape_util", "//xla:types", - "//xla/service:hlo_module_config", + "//xla/service:platform_util", "//xla/stream_executor", - "//xla/stream_executor:device_memory_allocator", + "//xla/stream_executor:device_memory", "//xla/stream_executor:device_memory_handle", "//xla/stream_executor:platform_manager", + "@com_google_absl//absl/strings", "@tsl//tsl/platform:ml_dtypes", "@tsl//tsl/platform:status", "@tsl//tsl/platform:test", "@tsl//tsl/platform:test_main", - ] + if_gpu_is_configured([ - ":buffer_comparator", - "//xla/stream_executor:device_memory", - ]), + ], ) cc_library( @@ -2630,8 +2612,9 @@ cc_library( cc_library( name = "make_batch_pointers", - srcs = if_gpu_is_configured(["make_batch_pointers.cc"]), - hdrs = if_gpu_is_configured(["make_batch_pointers.h"]), + srcs = ["make_batch_pointers.cc"], + hdrs = ["make_batch_pointers.h"], + tags = ["gpu"], deps = [ "//xla:types", "//xla:util", @@ -2652,7 +2635,8 @@ cc_library( cuda_library( name = "make_batch_pointers_kernel", - srcs = if_cuda_is_configured(["make_batch_pointers.cu.cc"]), + srcs = ["make_batch_pointers.cu.cc"], + tags = ["cuda-only"], deps = [ "@local_config_cuda//cuda:cuda_headers", # build_cleaner: keep ], @@ -2948,6 +2932,7 @@ cc_library( xla_cc_test( name = "gpu_latency_hiding_scheduler_test", srcs = ["gpu_latency_hiding_scheduler_test.cc"], + tags = ["gpu"], deps = [ ":gpu_hlo_schedule", ":gpu_latency_hiding_scheduler", diff --git a/xla/service/gpu/buffer_comparator_test.cc b/xla/service/gpu/buffer_comparator_test.cc index cbef669abab7b..99a280af12659 100644 --- a/xla/service/gpu/buffer_comparator_test.cc +++ b/xla/service/gpu/buffer_comparator_test.cc @@ -21,9 +21,10 @@ limitations under the License. 
#include #include +#include "absl/strings/ascii.h" #include "xla/primitive_util.h" #include "xla/service/gpu/stream_executor_util.h" -#include "xla/service/hlo_module_config.h" +#include "xla/service/platform_util.h" #include "xla/shape_util.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/device_memory_handle.h" @@ -44,13 +45,11 @@ constexpr double kDefaultTolerance = 0.1; class BufferComparatorTest : public testing::Test { protected: BufferComparatorTest() -#if GOOGLE_CUDA - : platform_(se::PlatformManager::PlatformWithName("CUDA").value()), -#elif TENSORFLOW_USE_ROCM - : platform_(se::PlatformManager::PlatformWithName("ROCM").value()), -#endif - stream_exec_(platform_->ExecutorForDevice(0).value()) { - } + : platform_(stream_executor::PlatformManager::PlatformWithName( + absl::AsciiStrToUpper( + PlatformUtil::CanonicalPlatformName("gpu").value())) + .value()), + stream_exec_(platform_->ExecutorForDevice(0).value()) {} // Take floats only for convenience. Still uses ElementType internally. template diff --git a/xla/service/gpu/fusions/BUILD b/xla/service/gpu/fusions/BUILD index f75f04c0f58c4..15d96fa51193c 100644 --- a/xla/service/gpu/fusions/BUILD +++ b/xla/service/gpu/fusions/BUILD @@ -12,6 +12,7 @@ cc_library( name = "in_place_dynamic_update_slice_mlir", srcs = ["in_place_dynamic_update_slice_mlir.cc"], hdrs = ["in_place_dynamic_update_slice_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:xla_data_proto_cc", @@ -39,6 +40,7 @@ cc_library( name = "copy", srcs = ["copy.cc"], hdrs = ["copy.h"], + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla:shape_util", @@ -61,6 +63,7 @@ cc_library( srcs = ["custom.cc"], hdrs = ["custom.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla:literal", @@ -168,6 +171,7 @@ cc_library( name = "fusion_emitter", srcs = ["fusion_emitter.cc"], hdrs = ["fusion_emitter.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps = [ "//xla:shape_util", @@ -204,6 +208,7 @@ cc_library( name = "fusions", srcs = ["fusions.cc"], hdrs = ["fusions.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps = [ ":concatenate_mlir", @@ -245,6 +250,7 @@ cc_library( name = "loop_mlir", srcs = ["loop_mlir.cc"], hdrs = ["loop_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:status_macros", @@ -274,6 +280,7 @@ cc_library( name = "scatter_mlir", srcs = ["scatter_mlir.cc"], hdrs = ["scatter_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:xla_data_proto_cc", @@ -306,6 +313,7 @@ cc_library( name = "transpose_mlir", srcs = ["transpose_mlir.cc"], hdrs = ["transpose_mlir.h"], + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla:permutation_util", @@ -340,6 +348,7 @@ cc_library( name = "triton", srcs = ["triton.cc"], hdrs = ["triton.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps = [ ":fusion_emitter", @@ -379,7 +388,7 @@ cc_library( xla_cc_test( name = "triton_test", srcs = ["triton_test.cc"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":fusion_emitter", ":fusions", @@ -403,6 +412,7 @@ cc_library( srcs = ["cudnn.cc"], hdrs = ["cudnn.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla/hlo/ir:hlo", @@ -463,6 +473,7 @@ cc_library( name = "thunk_util", srcs = ["thunk_util.cc"], hdrs = ["thunk_util.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], 
deps = [ "//xla:literal", @@ -482,6 +493,7 @@ cc_library( name = "reduction_base", srcs = ["reduction_base.cc"], hdrs = ["reduction_base.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ ":fusion_emitter", @@ -517,6 +529,7 @@ cc_library( name = "reduction_mlir", srcs = ["reduction_mlir.cc"], hdrs = ["reduction_mlir.h"], + tags = ["gpu"], deps = [ ":fusion_emitter", ":reduction_base", @@ -558,6 +571,7 @@ cc_library( xla_cc_test( name = "reduction_base_test", srcs = ["reduction_base_test.cc"], + tags = ["gpu"], deps = [ ":reduction_base", "//xla/service/gpu:gpu_device_info_for_tests", @@ -572,6 +586,7 @@ cc_library( name = "concatenate_mlir", srcs = ["concatenate_mlir.cc"], hdrs = ["concatenate_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla/hlo/ir:hlo", @@ -598,6 +613,7 @@ cc_library( name = "input_slices_mlir", srcs = ["input_slices_mlir.cc"], hdrs = ["input_slices_mlir.h"], + tags = ["gpu"], deps = [ "//xla:util", "//xla:xla_data_proto_cc", diff --git a/xla/service/gpu/fusions/ir/tests/BUILD b/xla/service/gpu/fusions/ir/tests/BUILD index 381d5a3220b1d..6a1e332a2e9aa 100644 --- a/xla/service/gpu/fusions/ir/tests/BUILD +++ b/xla/service/gpu/fusions/ir/tests/BUILD @@ -9,6 +9,7 @@ lit_test_suite( name = "tests", srcs = glob(["*.mlir"]), cfg = "//xla:lit.cfg.py", + tags = ["gpu"], tools = [ "//xla/service/gpu/fusions/tools:mlir_fusions_opt", "@llvm-project//llvm:FileCheck", diff --git a/xla/service/gpu/fusions/ir/tests/inlining.mlir b/xla/service/gpu/fusions/ir/tests/inlining.mlir index f15b37b040b84..bc721d4b7b4bf 100644 --- a/xla/service/gpu/fusions/ir/tests/inlining.mlir +++ b/xla/service/gpu/fusions/ir/tests/inlining.mlir @@ -1,4 +1,4 @@ -// RUN: mlir_fusions_opt %s -split-input-file -xla-erase-dead-functions -inline | FileCheck %s +// RUN: env LD_DEBUG=files,libs mlir_fusions_opt %s -split-input-file -xla-erase-dead-functions -inline | FileCheck %s module { func.func private @mul(%a: f32, %b: f32) -> f32 { diff --git a/xla/service/gpu/fusions/legacy/BUILD b/xla/service/gpu/fusions/legacy/BUILD index 5c1705714483c..63bed338ca919 100644 --- a/xla/service/gpu/fusions/legacy/BUILD +++ b/xla/service/gpu/fusions/legacy/BUILD @@ -10,6 +10,7 @@ cc_library( name = "in_place_dynamic_update_slice", srcs = ["in_place_dynamic_update_slice.cc"], hdrs = ["in_place_dynamic_update_slice.h"], + tags = ["gpu"], deps = [ "//xla/hlo/ir:hlo", "//xla/service/gpu:hlo_fusion_analysis", @@ -33,6 +34,7 @@ cc_library( xla_cc_test( name = "in_place_dynamic_update_slice_test", srcs = ["in_place_dynamic_update_slice_test.cc"], + tags = ["gpu"], deps = [ ":in_place_dynamic_update_slice", "//xla/service/gpu:gpu_device_info_for_tests", @@ -53,6 +55,7 @@ cc_library( name = "loop", srcs = ["loop.cc"], hdrs = ["loop.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:util", @@ -83,6 +86,7 @@ cc_library( xla_cc_test( name = "loop_test", srcs = ["loop_test.cc"], + tags = ["gpu"], deps = [ "//xla:status_macros", "//xla/service/gpu:gpu_device_info_for_tests", @@ -105,6 +109,7 @@ cc_library( name = "scatter", srcs = ["scatter.cc"], hdrs = ["scatter.h"], + tags = ["gpu"], deps = [ ":loop", "//xla:shape_util", @@ -135,6 +140,7 @@ cc_library( xla_cc_test( name = "scatter_test", srcs = ["scatter_test.cc"], + tags = ["gpu"], deps = [ ":scatter", "//xla/service/gpu:gpu_device_info_for_tests", @@ -155,6 +161,7 @@ cc_library( name = "tiling_util", srcs = ["tiling_util.cc"], hdrs = ["tiling_util.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps 
= [ "//xla:shape_util", @@ -183,6 +190,7 @@ cc_library( name = "reduction", srcs = ["reduction.cc"], hdrs = ["reduction.h"], + tags = ["gpu"], deps = [ ":tiling_util", "//xla:shape_util", @@ -236,6 +244,7 @@ cc_library( xla_cc_test( name = "reduction_test", srcs = ["reduction_test.cc"], + tags = ["gpu"], deps = [ ":reduction", "//xla/service/gpu:gpu_device_info_for_tests", @@ -255,6 +264,7 @@ cc_library( name = "concatenate", srcs = ["concatenate.cc"], hdrs = ["concatenate.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla/hlo/ir:hlo", @@ -282,6 +292,7 @@ cc_library( xla_cc_test( name = "concatenate_test", srcs = ["concatenate_test.cc"], + tags = ["gpu"], deps = [ ":concatenate", "//xla/service/gpu:gpu_device_info_for_tests", @@ -301,6 +312,7 @@ cc_library( name = "transpose", srcs = ["transpose.cc"], hdrs = ["transpose.h"], + tags = ["gpu"], deps = [ ":tiling_util", "//xla:permutation_util", @@ -335,6 +347,7 @@ cc_library( xla_cc_test( name = "transpose_test", srcs = ["transpose_test.cc"], + tags = ["gpu"], deps = [ ":transpose", "//xla:status_macros", @@ -357,6 +370,7 @@ cc_library( name = "input_slices", srcs = ["input_slices.cc"], hdrs = ["input_slices.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:util", @@ -388,6 +402,7 @@ cc_library( xla_cc_test( name = "input_slices_test", srcs = ["input_slices_test.cc"], + tags = ["gpu"], deps = [ ":input_slices", "//xla/service/gpu:gpu_device_info_for_tests", diff --git a/xla/service/gpu/fusions/mlir/BUILD b/xla/service/gpu/fusions/mlir/BUILD index 29764a403dbd9..6b02be5bc37c5 100644 --- a/xla/service/gpu/fusions/mlir/BUILD +++ b/xla/service/gpu/fusions/mlir/BUILD @@ -17,6 +17,7 @@ cc_library( name = "computation_partitioner", srcs = ["computation_partitioner.cc"], hdrs = ["computation_partitioner.h"], + tags = ["gpu"], deps = [ ":type_util", "//xla:shape_util", @@ -47,6 +48,7 @@ cc_library( xla_cc_test( name = "computation_partitioner_test", srcs = ["computation_partitioner_test.cc"], + tags = ["gpu"], deps = [ ":computation_partitioner", "//xla/hlo/ir:hlo", @@ -64,6 +66,7 @@ cc_library( name = "elemental_hlo_to_mlir", srcs = ["elemental_hlo_to_mlir.cc"], hdrs = ["elemental_hlo_to_mlir.h"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":type_util", @@ -113,6 +116,7 @@ cc_library( xla_cc_test( name = "elemental_hlo_to_mlir_test", srcs = ["elemental_hlo_to_mlir_test.cc"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":elemental_hlo_to_mlir", @@ -153,6 +157,7 @@ cc_library( name = "mlir_fusion_emitter", srcs = ["mlir_fusion_emitter.cc"], hdrs = ["mlir_fusion_emitter.h"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":elemental_hlo_to_mlir", @@ -231,6 +236,7 @@ cc_library( xla_cc_test( name = "mlir_fusion_emitter_test", srcs = ["mlir_fusion_emitter_test.cc"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":mlir_fusion_emitter", diff --git a/xla/service/gpu/fusions/tests/BUILD b/xla/service/gpu/fusions/tests/BUILD index d3e3b665e75d3..71bec2f5d4472 100644 --- a/xla/service/gpu/fusions/tests/BUILD +++ b/xla/service/gpu/fusions/tests/BUILD @@ -10,6 +10,7 @@ lit_test_suite( srcs = glob(["**/*.hlo"]), cfg = "//xla:lit.cfg.py", default_tags = ["requires-gpu-sm80-only"], + tags = ["gpu"], tools = [ "//xla/service/gpu/fusions/tools:fusion_to_mlir", "//xla/service/gpu/fusions/tools:mlir_fusions_opt", diff --git a/xla/service/gpu/fusions/tools/BUILD b/xla/service/gpu/fusions/tools/BUILD index 5b6c18f409287..7d1abdcb0456b 100644 --- a/xla/service/gpu/fusions/tools/BUILD +++ 
b/xla/service/gpu/fusions/tools/BUILD @@ -8,6 +8,7 @@ package( xla_cc_binary( name = "mlir_fusions_opt", srcs = ["mlir_fusions_opt.cc"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ "//xla/mlir_hlo", @@ -41,6 +42,7 @@ cc_library( testonly = 1, srcs = ["test_lib.cc"], hdrs = ["test_lib.h"], + tags = ["gpu"], deps = [ "//xla:status_macros", "//xla/hlo/ir:hlo", @@ -75,6 +77,7 @@ xla_cc_binary( name = "fusion_to_mlir", testonly = 1, srcs = ["fusion_to_mlir.cc"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ ":test_lib", @@ -90,6 +93,7 @@ xla_cc_binary( name = "test_correctness", testonly = 1, srcs = ["test_correctness.cc"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ ":test_lib", diff --git a/xla/service/gpu/fusions/transforms/BUILD b/xla/service/gpu/fusions/transforms/BUILD index b26e0ec1ab9cb..18ad748f86743 100644 --- a/xla/service/gpu/fusions/transforms/BUILD +++ b/xla/service/gpu/fusions/transforms/BUILD @@ -53,6 +53,7 @@ cc_library( "vectorize_loads_stores.cc", ], hdrs = ["passes.h"], + tags = ["gpu"], deps = [ ":passes_inc_gen", "//xla:shape_util", diff --git a/xla/service/gpu/fusions/transforms/tests/BUILD b/xla/service/gpu/fusions/transforms/tests/BUILD index 381d5a3220b1d..6a1e332a2e9aa 100644 --- a/xla/service/gpu/fusions/transforms/tests/BUILD +++ b/xla/service/gpu/fusions/transforms/tests/BUILD @@ -9,6 +9,7 @@ lit_test_suite( name = "tests", srcs = glob(["*.mlir"]), cfg = "//xla:lit.cfg.py", + tags = ["gpu"], tools = [ "//xla/service/gpu/fusions/tools:mlir_fusions_opt", "@llvm-project//llvm:FileCheck", diff --git a/xla/service/gpu/fusions/triton/BUILD b/xla/service/gpu/fusions/triton/BUILD index 65acf4e075e8f..74fb1decf7659 100644 --- a/xla/service/gpu/fusions/triton/BUILD +++ b/xla/service/gpu/fusions/triton/BUILD @@ -34,6 +34,7 @@ cc_library( "compilation_pipeline_rocm.cc", ]), hdrs = ["triton_fusion_emitter.h"], + tags = ["gpu"], deps = [ ":passes", "//xla:autotuning_proto_cc", @@ -320,6 +321,7 @@ cc_library( testonly = True, srcs = ["triton_test_utils.cc"], hdrs = ["triton_test_utils.h"], + tags = ["gpu"], deps = [ ":triton_fusion_emitter", "//xla:shape_util", @@ -357,6 +359,7 @@ cc_library( xla_cc_test( name = "triton_fusion_emitter_mem_utils_test", srcs = if_cuda_is_configured(["triton_fusion_emitter_mem_utils_test.cc"]), + tags = ["gpu"], deps = [ ":triton_fusion_emitter", "//xla/hlo/ir:hlo", diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc index e43835397d095..932dc31cc5f9b 100644 --- a/xla/service/gpu/gpu_executable.cc +++ b/xla/service/gpu/gpu_executable.cc @@ -72,9 +72,7 @@ limitations under the License. 
#include "xla/stream_executor/device_memory_allocator.h" #include "xla/stream_executor/event_based_timer.h" #include "xla/stream_executor/gpu/scoped_activate_context.h" -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "xla/stream_executor/gpu/gpu_executor.h" -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "xla/stream_executor/module_spec.h" #include "xla/stream_executor/platform.h" #include "xla/stream_executor/rocm/rocm_platform_id.h" @@ -808,12 +806,10 @@ absl::StatusOr GpuExecutable::ExecuteAsyncOnStreamImpl( se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator(); se::StreamExecutor* executor = run_options->stream()->parent(); -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // GpuExecutable always bound to a single GpuContext during its execution, so // we activate it once to skip expensive context activations later. se::gpu::GpuExecutor* gpu_executor = se::gpu::ExtractGpuExecutor(executor); se::gpu::ScopedActivateContext activation(gpu_executor); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Force synchronous execution if the allocator requires it. const bool block_host_until_done = diff --git a/xla/service/gpu/kernels/BUILD b/xla/service/gpu/kernels/BUILD index c155276748894..4bde4d38cebe3 100644 --- a/xla/service/gpu/kernels/BUILD +++ b/xla/service/gpu/kernels/BUILD @@ -1,12 +1,10 @@ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library") -load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") load( "@tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) load("//xla:xla.bzl", "xla_cc_binary") load("//xla/service/gpu:build_defs.bzl", "gpu_kernel_library") -load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured") load("//xla/tests:build_defs.bzl", "DEFAULT_DISABLED_BACKENDS", "xla_test") load("//xla/tsl:tsl.bzl", "if_windows") @@ -210,12 +208,11 @@ cc_library( name = "topk_custom_kernel", srcs = ["topk_custom_kernel.cc"], hdrs = ["topk_custom_kernel.h"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + tags = ["gpu"], visibility = [":friends"], deps = [ ":custom_kernel", + ":topk_kernel_gpu", "//xla:types", "//xla:xla_data_proto_cc", "//xla/stream_executor", @@ -225,9 +222,7 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@tsl//tsl/platform:statusor", - ] + if_gpu_is_configured([ - ":topk_kernel_gpu", - ]), + ], ) xla_test( diff --git a/xla/service/gpu/kernels/topk_custom_kernel.cc b/xla/service/gpu/kernels/topk_custom_kernel.cc index 2be74bf301ee1..a2611258acc10 100644 --- a/xla/service/gpu/kernels/topk_custom_kernel.cc +++ b/xla/service/gpu/kernels/topk_custom_kernel.cc @@ -27,6 +27,7 @@ limitations under the License. #include "absl/status/statusor.h" #include "absl/strings/str_cat.h" #include "xla/service/gpu/kernels/custom_kernel.h" +#include "xla/service/gpu/kernels/topk_kernel_common.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/kernel.h" #include "xla/stream_executor/kernel_spec.h" @@ -35,14 +36,8 @@ limitations under the License. 
#include "xla/xla_data.pb.h" #include "tsl/platform/statusor.h" -#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) -#include "xla/service/gpu/kernels/topk_kernel_common.h" -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM - namespace xla::gpu::kernel::topk { -#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - namespace { using KernelArgsPacking = se::MultiKernelLoaderSpec::KernelArgsPacking; @@ -135,16 +130,4 @@ absl::StatusOr GetTopKKernel(std::string name, } } -#else - -// Fallback implementation of creating a CustomKernel for TopK operation. -absl::StatusOr GetTopKKernel(std::string name, - PrimitiveType dtype, - size_t num_elements, size_t k, - size_t batch_size) { - return absl::InternalError("XLA compiled without CUDA support"); -} - -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM - } // namespace xla::gpu::kernel::topk diff --git a/xla/service/gpu/matmul_utils.h b/xla/service/gpu/matmul_utils.h index 5f128e418af58..02605ea0c9a6a 100644 --- a/xla/service/gpu/matmul_utils.h +++ b/xla/service/gpu/matmul_utils.h @@ -36,10 +36,6 @@ limitations under the License. #include "xla/stream_executor/gpu/gpu_blas_lt.h" #include "xla/xla_data.pb.h" -#if TENSORFLOW_USE_ROCM -#include "rocm/rocm_config.h" -#endif - namespace xla { namespace gpu { diff --git a/xla/service/gpu/model/BUILD b/xla/service/gpu/model/BUILD index 756a7a3330440..ebb795cd6433b 100644 --- a/xla/service/gpu/model/BUILD +++ b/xla/service/gpu/model/BUILD @@ -25,6 +25,7 @@ cc_library( name = "analytical_latency_estimator", srcs = ["analytical_latency_estimator.cc"], hdrs = ["analytical_latency_estimator.h"], + tags = ["gpu"], deps = [ ":gpu_collective_performance_model", ":gpu_hlo_cost_analysis", @@ -102,6 +103,7 @@ cc_library( name = "gpu_cost_model_stats_collection", srcs = ["gpu_cost_model_stats_collection.cc"], hdrs = ["gpu_cost_model_stats_collection.h"], + tags = ["gpu"], deps = [ ":gpu_hlo_cost_analysis", ":gpu_performance_model", @@ -121,6 +123,7 @@ cc_library( xla_cc_test( name = "gpu_cost_model_stats_collection_test", srcs = ["gpu_cost_model_stats_collection_test.cc"], + tags = ["gpu"], deps = [ ":gpu_cost_model_stats_collection", ":gpu_hlo_cost_analysis", @@ -191,6 +194,7 @@ cc_library( name = "gpu_performance_model_base", srcs = ["gpu_performance_model_base.cc"], hdrs = ["gpu_performance_model_base.h"], + tags = ["gpu"], deps = [ ":fusion_analysis_cache", ":gpu_hlo_cost_analysis", @@ -219,6 +223,7 @@ cc_library( xla_cc_test( name = "gpu_performance_model_base_test", srcs = ["gpu_performance_model_base_test.cc"], + tags = ["gpu"], deps = [ ":gpu_hlo_cost_analysis", ":gpu_performance_model_base", @@ -241,6 +246,7 @@ cc_library( name = "gpu_performance_model", srcs = ["gpu_performance_model.cc"], hdrs = ["gpu_performance_model.h"], + tags = ["gpu"], deps = [ ":coalescing_analysis", ":gpu_hlo_cost_analysis", @@ -263,6 +269,7 @@ cc_library( xla_cc_test( name = "gpu_performance_model_test", srcs = ["gpu_performance_model_test.cc"], + tags = ["gpu"], deps = [ ":fusion_analysis_cache", ":gpu_hlo_cost_analysis", @@ -294,6 +301,7 @@ cc_library( srcs = ["gpu_collective_performance_model.cc"], hdrs = ["gpu_collective_performance_model.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":coalescing_analysis", ":fusion_analysis_cache", @@ -332,6 +340,7 @@ cc_library( xla_cc_test( name = "gpu_collective_performance_model_test", srcs = ["gpu_collective_performance_model_test.cc"], + tags = ["gpu"], deps = [ "//xla/service/gpu:backend_configs_cc", "//xla/tests:hlo_test_base", @@ -344,6 
+353,7 @@ cc_library( name = "gpu_indexing_performance_model", srcs = ["gpu_indexing_performance_model.cc"], hdrs = ["gpu_indexing_performance_model.h"], + tags = ["gpu"], deps = [ ":coalescing_analysis", ":fusion_analysis_cache", @@ -384,6 +394,7 @@ cc_library( xla_cc_test( name = "gpu_indexing_performance_model_test", srcs = ["gpu_indexing_performance_model_test.cc"], + tags = ["gpu"], deps = [ ":fusion_analysis_cache", ":gpu_hlo_cost_analysis", @@ -791,6 +802,7 @@ cc_library( name = "coalescing_analysis", srcs = ["coalescing_analysis.cc"], hdrs = ["coalescing_analysis.h"], + tags = ["gpu"], deps = [ ":affine_map_evaluator", ":indexing_analysis", @@ -814,6 +826,7 @@ cc_library( xla_cc_test( name = "coalescing_analysis_test", srcs = ["coalescing_analysis_test.cc"], + tags = ["gpu"], deps = [ ":coalescing_analysis", ":symbolic_tile", diff --git a/xla/service/gpu/tests/BUILD b/xla/service/gpu/tests/BUILD index f0eb7f9481a14..4cfe8ed5c4adf 100644 --- a/xla/service/gpu/tests/BUILD +++ b/xla/service/gpu/tests/BUILD @@ -674,6 +674,7 @@ lit_test_suite( ], default_tags = tf_cuda_tests_tags(), hermetic_cuda_data_dir = "%S/../../../../../cuda_nvcc", + tags = ["gpu"], tags_override = { "element_wise_row_vectorization.hlo": ["cuda-only"], "scatter_bf16.hlo": ["cuda-only"], @@ -702,6 +703,7 @@ lit_test_suite( # cc_binary( # name = "xla-opt", # srcs = ["xla-opt.cc"], +# tags = ["gpu"], # deps = [ # "//xla/service/gpu/fusions/transforms:passes", # "//xla/service/gpu/fusions/triton:passes", diff --git a/xla/service/gpu/transforms/BUILD b/xla/service/gpu/transforms/BUILD index ed27641245360..0a2255048b8ba 100644 --- a/xla/service/gpu/transforms/BUILD +++ b/xla/service/gpu/transforms/BUILD @@ -3,10 +3,6 @@ load( "if_cuda_is_configured", ) load("//xla:xla.bzl", "xla_cc_test") -load( - "//xla/stream_executor:build_defs.bzl", - "if_gpu_is_configured", -) load("//xla/tests:build_defs.bzl", "xla_test") load("//xla/tsl:tsl.bzl", "if_google", "if_oss") @@ -300,7 +296,6 @@ xla_cc_test( "//xla:literal_util", "//xla/hlo/ir:hlo", "//xla/hlo/pass:hlo_pass", - "//xla/service:hlo_proto_cc", "//xla/tests:hlo_test_base", "//xla/tests:literal_test_util", "//xla/tests:verified_hlo_module", @@ -339,6 +334,7 @@ cc_library( name = "fusion_block_level_rewriter", srcs = ["fusion_block_level_rewriter.cc"], hdrs = ["fusion_block_level_rewriter.h"], + tags = ["gpu"], deps = [ "//xla/hlo/ir:hlo", "//xla/hlo/pass:hlo_pass", @@ -366,6 +362,7 @@ cc_library( xla_cc_test( name = "fusion_block_level_rewriter_test", srcs = ["fusion_block_level_rewriter_test.cc"], + tags = ["gpu"], deps = [ ":fusion_block_level_rewriter", "//xla:shape_util", @@ -532,10 +529,7 @@ cc_library( xla_test( name = "command_buffer_scheduling_test", srcs = ["command_buffer_scheduling_test.cc"], - backends = [ - "cpu", - "gpu", - ], + backends = ["gpu"], deps = [ ":command_buffer_scheduling", "//xla/hlo/ir:hlo", @@ -785,9 +779,6 @@ xla_cc_test( srcs = ["cudnn_custom_call_converter_test.cc"], deps = [ ":cudnn_custom_call_converter", - "//xla/hlo/ir:hlo", - "//xla/service:pattern_matcher", - "//xla/service:pattern_matcher_gmock", "//xla/tests:hlo_test_base", "//xla/tests:xla_internal_test_main", "@com_google_googletest//:gtest", @@ -930,6 +921,7 @@ xla_test( ":cudnn_fused_mha_rewriter", ":cudnn_fused_mha_transpose_fusion", "//xla:error_spec", + "//xla:shape_util", "//xla:test_helpers", "//xla:util", "//xla:xla_data_proto_cc", @@ -991,9 +983,20 @@ cc_library( # Tested via //third_party/tensorflow/compiler/xla/service/gpu/fusions:cudnn_test cc_library( name = 
"cudnn_fusion_compiler", - srcs = if_cuda_is_configured(["cudnn_fusion_compiler.cc"]), - hdrs = if_cuda_is_configured(["cudnn_fusion_compiler.h"]), - deps = if_cuda_is_configured([ + srcs = ["cudnn_fusion_compiler.cc"], + hdrs = ["cudnn_fusion_compiler.h"], + tags = [ + "cuda-only", + "gpu", + ], + deps = [ + "//xla:comparison_util", + "//xla:shape_util", + "//xla:util", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + "//xla/hlo/utils:hlo_query", + "//xla/service:dump", "//xla/service/gpu:backend_configs_cc", "//xla/service/gpu:cudnn_support_utils", "//xla/service/gpu:ir_emission_utils", @@ -1001,6 +1004,10 @@ cc_library( "//xla/service/gpu:matmul_utils", "//xla/service/gpu:stream_executor_util", "//xla/service/gpu:triton_fusion_analysis", + "//xla/stream_executor:dnn", + "//xla/stream_executor:stream_executor_h", + "//xla/stream_executor/cuda:cudnn_frontend_helpers", + "//xla/stream_executor/cuda:cudnn_plugin", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -1010,27 +1017,19 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:string_view", "@local_config_cuda//cuda:cudnn_header", - "//xla:shape_util", - "//xla:comparison_util", - "//xla:util", - "//xla/hlo/ir:hlo", - "//xla/hlo/utils:hlo_query", - "//xla/hlo/pass:hlo_pass", - "//xla/stream_executor:dnn", - "//xla/stream_executor:stream_executor_h", - "//xla/service:dump", - "//xla/stream_executor/cuda:cudnn_frontend_helpers", - "//xla/stream_executor/cuda:cudnn_plugin", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:statusor", - ]), + ], ) cc_library( name = "cudnn_norm_rewriter", srcs = ["cudnn_norm_rewriter.cc"], hdrs = ["cudnn_norm_rewriter.h"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = [ + "cuda-only", + "gpu", + ], deps = [ "//xla:shape_util", "//xla:types", @@ -1051,13 +1050,11 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", + "@local_config_cuda//cuda:cudnn_header", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:logging", "@tsl//tsl/platform:statusor", - ] + if_cuda_is_configured([ - "@local_config_cuda//cuda:cuda_headers", - "@local_config_cuda//cuda:cudnn_header", - ]) + if_google([ + ] + if_google([ "@com_google_protobuf//:wrappers_cc_proto", ]), ) @@ -1066,7 +1063,7 @@ xla_test( name = "cudnn_norm_rewriter_test", srcs = ["cudnn_norm_rewriter_test.cc"], backends = ["gpu"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["cuda-only"], deps = [ ":cudnn_norm_rewriter", "//xla:error_spec", @@ -1076,10 +1073,9 @@ xla_test( "//xla/tests:filecheck", "//xla/tsl/lib/core:status_test_util", "@com_google_googletest//:gtest_main", - ] + if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cudnn_header", - ]), + ], ) cc_library( @@ -1238,34 +1234,37 @@ xla_cc_test( # TODO(b/358278858): Currently lacking test coverage. 
cc_library( name = "cudnn_custom_call_compiler", - srcs = if_cuda_is_configured(["cudnn_custom_call_compiler.cc"]), - hdrs = if_cuda_is_configured(["cudnn_custom_call_compiler.h"]), - deps = if_cuda_is_configured([ - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@local_config_cuda//cuda:cudnn_header", + srcs = ["cudnn_custom_call_compiler.cc"], + hdrs = ["cudnn_custom_call_compiler.h"], + tags = [ + "cuda-only", + "gpu", + ], + deps = [ "//xla:shape_util", "//xla:status_macros", "//xla:util", "//xla/hlo/ir:hlo", "//xla/hlo/pass:hlo_pass", - "//xla/service/gpu/runtime:cudnn_thunk", "//xla/service/gpu:backend_configs_cc", "//xla/service/gpu:cublas_cudnn", "//xla/service/gpu:ir_emission_utils", "//xla/service/gpu:stream_executor_util", + "//xla/service/gpu/runtime:cudnn_thunk", "//xla/stream_executor:dnn", "//xla/stream_executor:stream_executor_h", "//xla/stream_executor/cuda:cudnn_frontend_helpers", "//xla/stream_executor/cuda:cudnn_plugin", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:statusor", - ]) + ["@com_google_absl//absl/container:flat_hash_map"], + ], ) cc_library( @@ -1363,31 +1362,25 @@ cc_library( xla_test( name = "dot_operand_converter_test", - srcs = if_gpu_is_configured(["dot_operand_converter_test.cc"]), + srcs = ["dot_operand_converter_test.cc"], backends = [ "gpu_a100", "gpu_p100", "gpu_v100", "gpu_amd_any", ], - deps = if_gpu_is_configured( - [ - ":dot_operand_converter", - "@com_google_googletest//:gtest", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:string_view", - "//xla:shape_util", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/hlo/utils:hlo_matchers", - "//xla/service:pattern_matcher", - "//xla/tests:hlo_test_base", - "//xla/tests:xla_internal_test_main", - "@tsl//tsl/platform:statusor", - ], - ) + [ - # b/317293391 - "@tsl//tsl/platform:test_main", + deps = [ + ":dot_operand_converter", + "//xla:shape_util", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/hlo/utils:hlo_matchers", + "//xla/tests:hlo_test_base", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest", + "@tsl//tsl/platform:statusor", ], ) @@ -1527,19 +1520,14 @@ xla_cc_test( "//xla/ffi", "//xla/ffi:ffi_api", "//xla/hlo/ir:hlo", - "//xla/service:buffer_value", "//xla/service:custom_call_target_registry", - "//xla/service:executable", - "//xla/service:hlo_memory_scheduler", "//xla/service:hlo_module_config", "//xla/service/gpu:gpu_device_info_for_tests", "//xla/stream_executor", "//xla/stream_executor/gpu:gpu_types_header", "//xla/tests:filecheck", "//xla/tests:hlo_test_base", - "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/status", - "@tsl//tsl/platform:status", "@tsl//tsl/platform:statusor", "@tsl//tsl/platform:test", "@tsl//tsl/platform:test_main", @@ -1550,6 +1538,7 @@ cc_library( name = "fusion_merger", srcs = ["fusion_merger.cc"], hdrs = ["fusion_merger.h"], + tags = 
["gpu"], deps = [ "//xla:shape_util", "//xla:util", @@ -1578,6 +1567,7 @@ xla_cc_test( name = "fusion_merger_test", srcs = ["fusion_merger_test.cc"], tags = [ + "gpu", "nomsan", ], deps = [ @@ -1835,13 +1825,10 @@ xla_cc_test( # TODO(b/358278858): Currently lacking test coverage. cc_library( name = "gpusolver_rewriter", - srcs = if_gpu_is_configured(["gpusolver_rewriter.cc"]), - hdrs = if_gpu_is_configured(["gpusolver_rewriter.h"]), - deps = if_gpu_is_configured([ - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", + srcs = ["gpusolver_rewriter.cc"], + hdrs = ["gpusolver_rewriter.h"], + tags = ["gpu"], + deps = [ "//xla:comparison_util", "//xla:literal", "//xla:literal_util", @@ -1852,14 +1839,15 @@ cc_library( "//xla/hlo/pass:hlo_pass", "//xla/service/gpu:cusolver_context", "//xla/service/gpu:ir_emission_utils", - "//xla/stream_executor", "//xla/stream_executor:blas", - "//xla/stream_executor:device_memory_allocator", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:logging", - "@tsl//tsl/platform:status", "@tsl//tsl/platform:statusor", - ]), + ], ) cc_library( @@ -2103,6 +2091,7 @@ cc_library( name = "multi_output_fusion", srcs = ["multi_output_fusion.cc"], hdrs = ["multi_output_fusion.h"], + tags = ["gpu"], deps = [ "//xla:debug_options_flags", "//xla:shape_util", @@ -2134,6 +2123,7 @@ xla_cc_test( name = "multi_output_fusion_test", srcs = ["multi_output_fusion_test.cc"], tags = [ + "gpu", "nomsan", ], deps = [ @@ -2252,6 +2242,7 @@ cc_library( name = "priority_fusion", srcs = ["priority_fusion.cc"], hdrs = ["priority_fusion.h"], + tags = ["gpu"], deps = [ "//xla:debug_options_flags", "//xla:shape_util", @@ -2302,7 +2293,7 @@ cc_library( xla_cc_test( name = "priority_fusion_test", srcs = ["priority_fusion_test.cc"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":priority_fusion", "//xla:shape_util", @@ -2688,6 +2679,7 @@ cc_library( name = "softmax_rewriter_triton", srcs = ["softmax_rewriter_triton.cc"], hdrs = ["softmax_rewriter_triton.h"], + tags = ["gpu"], deps = [ ":reduction_dimension_grouper", ":reduction_splitter", @@ -2734,6 +2726,7 @@ cc_library( xla_cc_test( name = "softmax_rewriter_triton_test", srcs = ["softmax_rewriter_triton_test.cc"], + tags = ["gpu"], deps = [ ":softmax_rewriter_triton", "//xla:shape_util", @@ -2759,11 +2752,9 @@ xla_cc_test( cc_library( name = "sort_rewriter", - srcs = if_gpu_is_configured( - ["sort_rewriter.cc"], - ["sort_rewriter_stub.cc"], - ), + srcs = ["sort_rewriter.cc"], hdrs = ["sort_rewriter.h"], + tags = ["gpu"], deps = [ "//xla:comparison_util", "//xla:shape_util", @@ -2786,7 +2777,7 @@ cc_library( xla_test( name = "sort_rewriter_test", - srcs = if_cuda_is_configured(["sort_rewriter_test.cc"]), + srcs = ["sort_rewriter_test.cc"], backends = ["gpu"], tags = ["no_oss"], deps = [ diff --git a/xla/service/gpu/transforms/cudnn_fused_mha_rewriter_test.cc b/xla/service/gpu/transforms/cudnn_fused_mha_rewriter_test.cc index a64fd0624bea6..d87490fcfec47 100644 --- a/xla/service/gpu/transforms/cudnn_fused_mha_rewriter_test.cc +++ b/xla/service/gpu/transforms/cudnn_fused_mha_rewriter_test.cc @@ -18,13 +18,13 @@ limitations under the License. 
 #include
 #include
 #include
-#include
 #include

 #include "absl/algorithm/container.h"
 #include "absl/strings/string_view.h"

-#include "xla/error_spec.h"
+#include "third_party/gpus/cuda/include/cuda.h"
+#include "third_party/gpus/cudnn/cudnn.h"  // IWYU pragma: keep
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/service/algebraic_simplifier.h"
 #include "xla/service/computation_layout.h"
@@ -40,6 +40,7 @@ limitations under the License.
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/pattern_matcher_gmock.h"
 #include "xla/service/reshape_decomposer.h"
+#include "xla/shape_util.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/dnn.h"
 #include "xla/test_helpers.h"
@@ -49,11 +50,6 @@ limitations under the License.
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/statusor.h"

-#if GOOGLE_CUDA
-#include "third_party/gpus/cuda/include/cuda.h"
-#include "third_party/gpus/cudnn/cudnn.h"  // IWYU pragma: keep
-#endif
-
 namespace xla {
 namespace gpu {
 namespace {
@@ -87,7 +83,7 @@ class CudnnFusedMhaRewriterTestHloTest : public HloTestBase {
       : HloTestBase(/*verifier_layout_sensitive=*/false,
                     /*allow_mixed_precision_in_hlo_verifier=*/false,
                     /*instruction_can_change_layout_func=*/{}) {
-#if !defined(GOOGLE_CUDA) || CUDA_VERSION < 12000
+#if CUDA_VERSION < 12000
     skip_reason_ = "cuDNN fused MHA requires CUDA 12 or later.";
     return;
 #endif
diff --git a/xla/service/gpu/transforms/cudnn_norm_rewriter.cc b/xla/service/gpu/transforms/cudnn_norm_rewriter.cc
index 752549dc7ec50..8bbdfde9118c7 100644
--- a/xla/service/gpu/transforms/cudnn_norm_rewriter.cc
+++ b/xla/service/gpu/transforms/cudnn_norm_rewriter.cc
@@ -49,12 +49,7 @@ limitations under the License.
 #include "tsl/platform/errors.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/statusor.h"
-
-#if GOOGLE_CUDA
-#include "third_party/gpus/cuda/include/cuda.h"  // IWYU pragma: keep
-#include "third_party/gpus/cudnn/cudnn.h"  // IWYU pragma: keep
 #include "third_party/gpus/cudnn/cudnn_version.h"
-#endif

 namespace xla {
 namespace gpu {
diff --git a/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc b/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
index a3dbc71132949..255799b84c174 100644
--- a/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
+++ b/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
@@ -16,16 +16,12 @@ limitations under the License.
 #include
 #include

-#include "xla/error_spec.h"
-#include "xla/stream_executor/device_description.h"
-
-#if GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cudnn/cudnn.h"  // IWYU pragma: keep
 #include "third_party/gpus/cudnn/cudnn_version.h"
-#endif
-
+#include "xla/error_spec.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
+#include "xla/stream_executor/device_description.h"

 namespace xla {
 namespace gpu {
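The cuDNN-specific passes above follow the same scheme with one refinement: targets that only make sense with CUDA are tagged "cuda-only" (in addition to "gpu") earlier in xla/service/gpu/transforms/BUILD and now list their cuDNN dependencies unconditionally, instead of compiling to an empty shell under local_defines = if_cuda_is_configured(...). A hedged sketch with a hypothetical pass name; xla_test, backends, and the tag names are taken from this change, while the filter flag in the comment is an assumption about how CI consumes them:

    load("//xla/tests:build_defs.bzl", "xla_test")

    cc_library(
        name = "example_cudnn_pass",  # hypothetical
        srcs = ["example_cudnn_pass.cc"],
        tags = [
            "cuda-only",
            "gpu",
        ],
        deps = ["@local_config_cuda//cuda:cudnn_header"],
    )

    xla_test(
        name = "example_cudnn_pass_test",  # hypothetical
        srcs = ["example_cudnn_pass_test.cc"],
        backends = ["gpu"],
        # ROCm/CPU configurations drop the test via tag filtering,
        # e.g. --test_tag_filters=-cuda-only.
        tags = ["cuda-only"],
        deps = [":example_cudnn_pass"],
    )
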
diff --git a/xla/service/gpu/transforms/sort_rewriter_stub.cc b/xla/service/gpu/transforms/sort_rewriter_stub.cc
deleted file mode 100644
index e9bf60cdb4c9b..0000000000000
--- a/xla/service/gpu/transforms/sort_rewriter_stub.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "absl/container/flat_hash_set.h"
-#include "absl/strings/string_view.h"
-#include "xla/hlo/ir/hlo_computation.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_instructions.h"
-#include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/transforms/sort_rewriter.h"
-#include "tsl/platform/statusor.h"
-
-namespace xla {
-namespace gpu {
-
-absl::StatusOr<bool> SortRewriter::RunOnInstruction(
-    HloSortInstruction* sort_op) {
-  return false;
-}
-
-absl::StatusOr<bool> SortRewriter::RunOnComputation(
-    HloComputation* computation) {
-  return false;
-}
-
-absl::StatusOr<bool> SortRewriter::Run(
-    HloModule* module,
-    const absl::flat_hash_set<absl::string_view>& execution_threads) {
-  return false;
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/xla/tools/hlo_opt/BUILD b/xla/tools/hlo_opt/BUILD
index 0a86b663a69df..f4b1c028b393b 100644
--- a/xla/tools/hlo_opt/BUILD
+++ b/xla/tools/hlo_opt/BUILD
@@ -55,7 +55,7 @@ cc_library(
 cc_library(
     name = "gpu_opt",
     testonly = True,
-    srcs = if_gpu_is_configured(["gpu_opt.cc"]),
+    srcs = ["gpu_opt.cc"],
     tags = ["gpu"],
     deps = [
         ":opt_lib",
@@ -66,12 +66,14 @@ cc_library(
         "//xla/service:compiler",
         "//xla/service:dump",
         "//xla/service:executable",
+        "//xla/service:gpu_plugin",
         "//xla/service:hlo_graph_dumper",
         "//xla/service:platform_util",
         "//xla/service/gpu:buffer_sharing",
         "//xla/service/gpu:compile_module_to_llvm_ir",
         "//xla/service/gpu:executable_proto_cc",
         "//xla/service/gpu:gpu_compiler",
+        "//xla/service/gpu:gpu_executable",
         "//xla/service/gpu:gpu_hlo_schedule",
         "//xla/service/llvm_ir:llvm_util",
         "//xla/stream_executor",
@@ -82,10 +84,7 @@ cc_library(
         "@llvm-project//llvm:ir_headers",
         "@tsl//tsl/platform:errors",
         "@tsl//tsl/platform:statusor",
-    ] + if_gpu_is_configured([
-        "//xla/service:gpu_plugin",
-        "//xla/service/gpu:gpu_executable",
-    ]) + if_cuda_is_configured([
+    ] + if_cuda_is_configured([
         "//xla/stream_executor:cuda_platform",
     ]) + if_rocm_is_configured([
         "//xla/stream_executor:rocm_platform",
     ]),
 )
@@ -176,6 +175,7 @@ lit_test_suite(
     data = [":test_utilities"],
     default_tags = tf_cuda_tests_tags(),
     hermetic_cuda_data_dir = "%S/../../../../cuda_nvcc",
+    tags = ["gpu"],
     tags_override = {
         "gpu_hlo_ptx.hlo": ["cuda-only"],
     },
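Finally, the xla/lit.bzl hunk at the very top of this change makes the generated lit tests export LD_LIBRARY_PATH, which presumably lets GPU-tagged lit suites resolve shared runtime libraries (compare the hermetic_cuda_data_dir attributes above) at run time rather than depending on a GPU-configured build. A simplified, hypothetical macro illustrating the same env propagation; the real change only assigns env["LD_LIBRARY_PATH"] = lib_dir before calling native_test:

    # Hypothetical stand-in for the lit_test/native_test wrapper; `lib_dir` is
    # assumed to point at the directory holding the GPU runtime libraries.
    def gpu_lit_test(name, src, lib_dir):
        env = {"LD_LIBRARY_PATH": lib_dir}
        native.sh_test(
            name = name,
            srcs = [src],
            env = env,  # forwarded into the test's runtime environment
        )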