diff --git a/xla/lit.bzl b/xla/lit.bzl index 102ea787ac119..936db23d5c8f3 100644 --- a/xla/lit.bzl +++ b/xla/lit.bzl @@ -246,6 +246,8 @@ def lit_test( ) test_file = output_file + env["LD_LIBRARY_PATH"] = lib_dir + native_test( name = name, src = lit_name, diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD index a35ab7cab957a..c166653cd122c 100644 --- a/xla/service/gpu/BUILD +++ b/xla/service/gpu/BUILD @@ -146,7 +146,7 @@ cc_library( xla_test( name = "custom_call_test", - srcs = if_gpu_is_configured(["custom_call_test.cc"]), + srcs = ["custom_call_test.cc"], backends = ["gpu"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), deps = [ @@ -278,6 +278,7 @@ cc_library( name = "ir_emitter_context", srcs = ["ir_emitter_context.cc"], hdrs = ["ir_emitter_context.h"], + tags = ["gpu"], deps = [ ":execution_stream_assignment", ":gpu_constants", @@ -308,6 +309,7 @@ cc_library( ]) + if_rocm_hipblaslt([ "TF_HIPBLASLT=1", ]), + tags = ["gpu"], deps = [ ":backend_configs_cc", ":cublas_cudnn", @@ -350,12 +352,14 @@ cc_library( "//xla/service/gpu/kernels:custom_kernel", "//xla/service/gpu/kernels:topk_custom_kernel", "//xla/service/gpu/model:tiled_hlo_instruction_or_computation", + "//xla/service/gpu/runtime:cholesky_thunk", "//xla/service/gpu/runtime:command_buffer_cmd", "//xla/service/gpu/runtime:command_buffer_cmd_emitter", "//xla/service/gpu/runtime:command_buffer_thunk", "//xla/service/gpu/runtime:conditional_thunk", "//xla/service/gpu/runtime:convolution_thunk", "//xla/service/gpu/runtime:copy_thunk", + "//xla/service/gpu/runtime:cub_sort_thunk", "//xla/service/gpu/runtime:cudnn_thunk", "//xla/service/gpu/runtime:custom_call_thunk", "//xla/service/gpu/runtime:fft_thunk", @@ -367,6 +371,7 @@ cc_library( "//xla/service/gpu/runtime:nccl_all_reduce_thunk", "//xla/service/gpu/runtime:nccl_all_to_all_thunk", "//xla/service/gpu/runtime:nccl_api", + "//xla/service/gpu/runtime:nccl_clique_key", "//xla/service/gpu/runtime:nccl_collective_broadcast_thunk", "//xla/service/gpu/runtime:nccl_collective_permute_thunk", "//xla/service/gpu/runtime:nccl_collective_thunk", @@ -379,6 +384,7 @@ cc_library( "//xla/service/gpu/runtime:send_recv_thunk", "//xla/service/gpu/runtime:sequential_thunk", "//xla/service/gpu/runtime:thunk", + "//xla/service/gpu/runtime:triangular_solve_thunk", "//xla/service/gpu/runtime:wait_for_streams_thunk", "//xla/service/gpu/runtime:while_thunk", "//xla/service/llvm_ir:buffer_assignment_util", @@ -390,6 +396,7 @@ cc_library( "//xla/service/llvm_ir:sort_util", "//xla/stream_executor:device_description", "//xla/stream_executor:launch_dim", + "//xla/stream_executor:stream_executor_h", "//xla/stream_executor/gpu:gpu_blas_lt", "//xla/stream_executor/integrations:device_mem_allocator", "//xla/tsl/protobuf:dnn_proto_cc", @@ -421,11 +428,7 @@ cc_library( "@tsl//tsl/platform:errors", "@tsl//tsl/platform:human_readable_json", "@tsl//tsl/platform:statusor", - ] + if_gpu_is_configured([ - "//xla/service/gpu/runtime:cholesky_thunk", - "//xla/service/gpu/runtime:cub_sort_thunk", - "//xla/service/gpu/runtime:triangular_solve_thunk", - ]) + if_rocm_is_configured([ + ] + if_rocm_is_configured([ "@local_config_rocm//rocm:rocm_headers", ]), ) @@ -442,7 +445,7 @@ cc_library( "ir_emitter.h", "ir_emitter_nested.h", ], - copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":backend_configs_cc", ":hlo_to_ir_bindings", @@ -486,11 +489,9 @@ cc_library( cc_library( name = "triton_call", - srcs = if_gpu_is_configured(["triton_call.cc"]), + srcs = ["triton_call.cc"], hdrs = 
["triton_call.h"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + tags = ["gpu"], deps = [ "@llvm-project//mlir:AsmParser", "@llvm-project//mlir:IR", @@ -553,15 +554,14 @@ cc_library( hdrs = [ "gpu_executable.h", ], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + tags = ["gpu"], deps = [ ":backend_configs_cc", ":buffer_allocations", ":gpu_constants", ":gpu_executable_run_options", ":ir_emission_utils", + ":make_batch_pointers", ":stream_executor_util", "//xla:executable_run_options", "//xla:shape_tree", @@ -621,13 +621,8 @@ cc_library( "@tsl//tsl/profiler/lib:scoped_annotation", "@tsl//tsl/profiler/lib:traceme", ] + if_cuda_is_configured([ - "//xla/stream_executor/cuda:cublas_plugin", - "//xla/stream_executor/cuda:cudnn_plugin", - "//xla/stream_executor/cuda:cufft_plugin", - "//xla/stream_executor/cuda:stream_executor_cuda", "@local_config_cuda//cuda:cuda_headers", ]) + if_rocm_is_configured([ - "//xla/stream_executor/rocm:stream_executor_rocm", "@local_config_rocm//rocm:rocm_headers", ]), ) @@ -766,16 +761,17 @@ cc_library( build_cub_sort_kernels( name = "cub_sort_kernel", - srcs = if_gpu_is_configured(["cub_sort_kernel.cu.cc"]), - hdrs = if_gpu_is_configured(["cub_sort_kernel.h"]), + srcs = ["cub_sort_kernel.cu.cc"], + hdrs = ["cub_sort_kernel.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ "TENSORFLOW_USE_ROCM=1", ]), + tags = ["gpu"], types = get_cub_sort_kernel_types(), - deps = if_gpu_is_configured([ + deps = [ ":gpu_prim", "//xla/stream_executor/gpu:gpu_types_header", - ]), + ], ) cc_library( @@ -921,9 +917,7 @@ cc_library( srcs = ["matmul_utils.cc"], hdrs = ["matmul_utils.h"], compatible_with = get_compatible_with_portable(), - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), deps = [ ":backend_configs_cc", ":ir_emission_utils", @@ -938,6 +932,7 @@ cc_library( "//xla/service:algorithm_util", "//xla/stream_executor", "//xla/stream_executor:blas", + "//xla/stream_executor:host_or_device_scalar", "//xla/stream_executor:numeric_options", "//xla/stream_executor/gpu:gpu_blas_lt", "@com_google_absl//absl/algorithm:container", @@ -950,10 +945,7 @@ cc_library( "@tsl//tsl/platform:errors", "@tsl//tsl/platform:status", "@tsl//tsl/platform:statusor", - ] + if_gpu_is_configured([ - #keep sorted - "//xla/stream_executor:host_or_device_scalar", - ]) + if_cuda_is_configured([ + ] + if_cuda_is_configured([ #keep sorted "//xla/stream_executor/cuda:cublas_lt_header", "//xla/stream_executor/cuda:cublas_plugin", @@ -1034,9 +1026,7 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@tsl//tsl/platform:statusor", - ] + if_cuda_is_configured([ - "@local_config_cuda//cuda:cuda_headers", - ]), + ], ) cc_library( @@ -1253,7 +1243,7 @@ cc_library( hdrs = [ "compile_module_to_llvm_ir.h", ], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":executable_proto_cc", ":execution_stream_assignment", @@ -1305,6 +1295,7 @@ cc_library( name = "fusion_pipeline", srcs = ["fusion_pipeline.cc"], hdrs = ["fusion_pipeline.h"], + tags = ["gpu"], deps = [ "//xla:xla_proto_cc", "//xla/hlo/pass:hlo_pass", @@ -1353,14 +1344,14 @@ cc_library( cc_library( name = "gpu_compiler", - srcs = if_gpu_is_configured([ + srcs = [ "gpu_compiler.cc", - ]), - 
hdrs = if_gpu_is_configured([ + ], + hdrs = [ "gpu_compiler.h", - ]), - deps = if_gpu_is_configured([ - # go/keep-sorted start prefix_order=":,, + ], + tags = ["gpu"], + deps = [ ":buffer_sharing", ":compile_module_to_llvm_ir", ":conv_layout_normalization", @@ -1387,27 +1378,14 @@ cc_library( ":reduction_utils", ":runtime_intrinsics", ":stream_executor_util", - "@com_google_absl//absl/base", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@com_google_absl//absl/types:variant", - "@llvm-project//llvm:AsmParser", - "@llvm-project//llvm:BitReader", - "@llvm-project//llvm:BitWriter", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:TransformUtils", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", + "//xla:autotune_results_proto_cc", + "//xla:debug_options_flags", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla:xla_proto_cc", "//xla/hlo/ir:hlo", "//xla/hlo/ir:hlo_module_group", "//xla/hlo/pass:hlo_pass", @@ -1415,58 +1393,6 @@ cc_library( "//xla/hlo/translate/hlo_to_mhlo:hlo_utils", "//xla/hlo/translate/mhlo_to_hlo:location_exporter", "//xla/pjrt/distributed:key_value_store_interface", - "//xla/service/gpu/autotuning:autotuner_util", - "//xla/service/gpu/autotuning:custom_kernel_fusion_autotuner", - "//xla/service/gpu/fusions/triton:triton_support", - "//xla/service/gpu/model:gpu_cost_model_stats_collection", - "//xla/service/gpu/model:gpu_hlo_cost_analysis", - "//xla/service/gpu/runtime:thunk", - "//xla/service/gpu/transforms:algebraic_simplifier", - "//xla/service/gpu/transforms:algorithm_checker", - "//xla/service/gpu/transforms:all_gather_optimizer", - "//xla/service/gpu/transforms:all_reduce_blueconnect", - "//xla/service/gpu/transforms:all_reduce_splitter", - "//xla/service/gpu/transforms:async_collective_annotator", - "//xla/service/gpu/transforms:async_wrapper", - "//xla/service/gpu/transforms:collective_permute_cycle_decomposer", - "//xla/service/gpu/transforms:collective_permute_valid_iteration_annotator", - "//xla/service/gpu/transforms:command_buffer_scheduling", - "//xla/service/gpu/transforms:conv_rewriter", - "//xla/service/gpu/transforms:convert_async_collectives_to_sync", - "//xla/service/gpu/transforms:cudnn_custom_call_converter", - "//xla/service/gpu/transforms:custom_kernel_fusion_rewriter", - "//xla/service/gpu/transforms:dot_dimension_sorter", - "//xla/service/gpu/transforms:dot_operand_converter", - "//xla/service/gpu/transforms:double_buffer_loop_unrolling", - "//xla/service/gpu/transforms:dynamic_slice_fusion_rewriter", - "//xla/service/gpu/transforms:fusion_wrapper", - "//xla/service/gpu/transforms:gemm_broadcast_folding_rewriter", - "//xla/service/gpu/transforms:gemm_fusion", - "//xla/service/gpu/transforms:gemm_rewriter", - "//xla/service/gpu/transforms:gemv_rewriter", - "//xla/service/gpu/transforms:layout_assignment", - "//xla/service/gpu/transforms:move_copy_to_users", - "//xla/service/gpu/transforms:pipelined_p2p_rewriter", - "//xla/service/gpu/transforms:reduce_scatter_creator", - "//xla/service/gpu/transforms:reduction_degenerate_dim_remover", - 
"//xla/service/gpu/transforms:reduction_dimension_grouper", - "//xla/service/gpu/transforms:reduction_layout_normalizer", - "//xla/service/gpu/transforms:reduction_splitter", - "//xla/service/gpu/transforms:rename_fusions", - "//xla/service/gpu/transforms:sanitize_constant_names", - "//xla/service/gpu/transforms:scatter_expander", - "//xla/service/gpu/transforms:scatter_slice_simplifier", - "//xla/service/gpu/transforms:softmax_rewriter_triton", - "//xla/service/gpu/transforms:stream_attribute_annotator", - "//xla/service/gpu/transforms:stream_attribute_async_wrapper", - "//xla/service/gpu/transforms:topk_specializer", - "//xla/service/gpu/transforms:topk_splitter", - "//xla/service/gpu/transforms:transpose_dimension_grouper", - "//xla/service/gpu/transforms:tree_reduction_rewriter", - "//xla/service/gpu/transforms:triton_fusion_numerics_verifier", - "//xla/service/gpu/transforms:windowed_einsum_handler", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/spmd:collective_permute_motion", "//xla/service:algebraic_simplifier", "//xla/service:all_gather_broadcast_reorder", "//xla/service:all_gather_combiner", @@ -1565,23 +1491,86 @@ cc_library( "//xla/service:while_loop_simplifier", "//xla/service:while_loop_trip_count_annotator", "//xla/service:zero_sized_hlo_elimination", + "//xla/service/gpu/autotuning:autotuner_util", + "//xla/service/gpu/autotuning:custom_kernel_fusion_autotuner", + "//xla/service/gpu/fusions/triton:triton_support", + "//xla/service/gpu/model:gpu_cost_model_stats_collection", + "//xla/service/gpu/model:gpu_hlo_cost_analysis", + "//xla/service/gpu/runtime:thunk", + "//xla/service/gpu/transforms:algebraic_simplifier", + "//xla/service/gpu/transforms:algorithm_checker", + "//xla/service/gpu/transforms:all_gather_optimizer", + "//xla/service/gpu/transforms:all_reduce_blueconnect", + "//xla/service/gpu/transforms:all_reduce_splitter", + "//xla/service/gpu/transforms:async_collective_annotator", + "//xla/service/gpu/transforms:async_wrapper", + "//xla/service/gpu/transforms:collective_permute_cycle_decomposer", + "//xla/service/gpu/transforms:collective_permute_valid_iteration_annotator", + "//xla/service/gpu/transforms:command_buffer_scheduling", + "//xla/service/gpu/transforms:conv_rewriter", + "//xla/service/gpu/transforms:convert_async_collectives_to_sync", + "//xla/service/gpu/transforms:cudnn_custom_call_converter", + "//xla/service/gpu/transforms:custom_kernel_fusion_rewriter", + "//xla/service/gpu/transforms:dot_dimension_sorter", + "//xla/service/gpu/transforms:dot_operand_converter", + "//xla/service/gpu/transforms:double_buffer_loop_unrolling", + "//xla/service/gpu/transforms:dynamic_slice_fusion_rewriter", + "//xla/service/gpu/transforms:fusion_wrapper", + "//xla/service/gpu/transforms:gemm_broadcast_folding_rewriter", + "//xla/service/gpu/transforms:gemm_fusion", + "//xla/service/gpu/transforms:gemm_rewriter", + "//xla/service/gpu/transforms:gemv_rewriter", + "//xla/service/gpu/transforms:layout_assignment", + "//xla/service/gpu/transforms:move_copy_to_users", + "//xla/service/gpu/transforms:pipelined_p2p_rewriter", + "//xla/service/gpu/transforms:reduce_scatter_creator", + "//xla/service/gpu/transforms:reduction_degenerate_dim_remover", + "//xla/service/gpu/transforms:reduction_dimension_grouper", + "//xla/service/gpu/transforms:reduction_layout_normalizer", + "//xla/service/gpu/transforms:reduction_splitter", + "//xla/service/gpu/transforms:rename_fusions", + "//xla/service/gpu/transforms:sanitize_constant_names", + 
"//xla/service/gpu/transforms:scatter_expander", + "//xla/service/gpu/transforms:scatter_slice_simplifier", + "//xla/service/gpu/transforms:softmax_rewriter_triton", + "//xla/service/gpu/transforms:stream_attribute_annotator", + "//xla/service/gpu/transforms:stream_attribute_async_wrapper", + "//xla/service/gpu/transforms:topk_specializer", + "//xla/service/gpu/transforms:topk_splitter", + "//xla/service/gpu/transforms:transpose_dimension_grouper", + "//xla/service/gpu/transforms:tree_reduction_rewriter", + "//xla/service/gpu/transforms:triton_fusion_numerics_verifier", + "//xla/service/gpu/transforms:windowed_einsum_handler", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/spmd:collective_permute_motion", "//xla/stream_executor", - "//xla/stream_executor/gpu:gpu_driver_header", - "//xla/stream_executor/integrations:device_mem_allocator", "//xla/stream_executor:device_description", "//xla/stream_executor:device_description_proto_cc", "//xla/stream_executor:dnn", "//xla/stream_executor:platform_manager", "//xla/stream_executor:semantic_version", "//xla/tsl/lib/monitoring:counter", - "//xla:autotune_results_proto_cc", - "//xla:debug_options_flags", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla:xla_proto_cc", + "@com_google_absl//absl/base", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@com_google_absl//absl/types:variant", + "@llvm-project//llvm:AsmParser", + "@llvm-project//llvm:BitReader", + "@llvm-project//llvm:BitWriter", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:TransformUtils", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "@tsl//tsl/platform:blocking_counter", "@tsl//tsl/platform:casts", "@tsl//tsl/platform:env", @@ -1594,8 +1583,7 @@ cc_library( "@tsl//tsl/platform:statusor", "@tsl//tsl/profiler/lib:scoped_annotation", "@tsl//tsl/profiler/lib:traceme", - # go/keep-sorted end - ]) + xla_internal(["service:export_hlo"]) + if_google([ + ] + xla_internal(["service:export_hlo"]) + if_google([ "//xla/hlo/experimental/auto_sharding", ]), ) @@ -1909,12 +1897,9 @@ xla_test( xla_cc_test( name = "gpu_aot_compilation_test", - srcs = if_gpu_is_configured([ + srcs = [ "gpu_aot_compilation_test.cc", - ]), - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + ], tags = [ "gpu", "no_oss", @@ -1957,6 +1942,7 @@ cc_library( tags = [ "gpu", "manual", + "rocm-only", ], deps = [ ":amdgpu_compiler_impl", @@ -1977,6 +1963,7 @@ cc_library( tags = [ "gpu", "manual", + "rocm-only", ], deps = [ ":cublas_padding_requirements", @@ -2021,13 +2008,11 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@llvm-project//llvm:ir_headers", + "@local_config_rocm//rocm:rocm_headers", "@tsl//tsl/platform:env", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:statusor", - ] + if_rocm_is_configured([ - # keep sorted - "@local_config_rocm//rocm:rocm_headers", - ]), + ], ) cc_library( @@ -2076,6 +2061,7 @@ cc_library( name = "gpu_hlo_schedule", srcs = ["gpu_hlo_schedule.cc"], hdrs = 
["gpu_hlo_schedule.h"], + tags = ["gpu"], deps = [ ":backend_configs_cc", ":gpu_latency_hiding_scheduler", @@ -2386,13 +2372,13 @@ xla_cc_test( cc_library( name = "buffer_comparator", - srcs = if_gpu_is_configured(["buffer_comparator.cc"]), - hdrs = if_gpu_is_configured(["buffer_comparator.h"]), + srcs = ["buffer_comparator.cc"], + hdrs = ["buffer_comparator.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ "TENSORFLOW_USE_ROCM=1", ]), - deps = if_gpu_is_configured([ - # keep sorted + tags = ["gpu"], + deps = [ ":buffer_comparator_kernel", ":gpu_asm_opts_util", ":launch_dimensions", @@ -2412,7 +2398,7 @@ cc_library( "@tsl//tsl/platform:logging", "@tsl//tsl/platform:ml_dtypes", "@tsl//tsl/platform:statusor", - ]) + if_rocm_is_configured([ + ] + if_rocm_is_configured([ # keep sorted "@local_config_rocm//rocm:rocm_headers", ]), @@ -2420,7 +2406,7 @@ cc_library( gpu_kernel_library( name = "buffer_comparator_kernel", - srcs = if_gpu_is_configured(["buffer_comparator.cu.cc"]), + srcs = ["buffer_comparator.cu.cc"], copts = rocm_copts(), local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ "TENSORFLOW_USE_ROCM=1", @@ -2434,28 +2420,24 @@ gpu_kernel_library( xla_test( name = "buffer_comparator_test", - srcs = if_gpu_is_configured(["buffer_comparator_test.cc"]), + srcs = ["buffer_comparator_test.cc"], backends = ["gpu"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), deps = [ + ":buffer_comparator", ":stream_executor_util", "//xla:shape_util", "//xla:types", - "//xla/service:hlo_module_config", + "//xla/service:platform_util", "//xla/stream_executor", - "//xla/stream_executor:device_memory_allocator", + "//xla/stream_executor:device_memory", "//xla/stream_executor:device_memory_handle", "//xla/stream_executor:platform_manager", + "@com_google_absl//absl/strings", "@tsl//tsl/platform:ml_dtypes", "@tsl//tsl/platform:status", "@tsl//tsl/platform:test", "@tsl//tsl/platform:test_main", - ] + if_gpu_is_configured([ - ":buffer_comparator", - "//xla/stream_executor:device_memory", - ]), + ], ) cc_library( @@ -2630,8 +2612,9 @@ cc_library( cc_library( name = "make_batch_pointers", - srcs = if_gpu_is_configured(["make_batch_pointers.cc"]), - hdrs = if_gpu_is_configured(["make_batch_pointers.h"]), + srcs = ["make_batch_pointers.cc"], + hdrs = ["make_batch_pointers.h"], + tags = ["gpu"], deps = [ "//xla:types", "//xla:util", @@ -2652,7 +2635,8 @@ cc_library( cuda_library( name = "make_batch_pointers_kernel", - srcs = if_cuda_is_configured(["make_batch_pointers.cu.cc"]), + srcs = ["make_batch_pointers.cu.cc"], + tags = ["cuda-only"], deps = [ "@local_config_cuda//cuda:cuda_headers", # build_cleaner: keep ], @@ -2948,6 +2932,7 @@ cc_library( xla_cc_test( name = "gpu_latency_hiding_scheduler_test", srcs = ["gpu_latency_hiding_scheduler_test.cc"], + tags = ["gpu"], deps = [ ":gpu_hlo_schedule", ":gpu_latency_hiding_scheduler", diff --git a/xla/service/gpu/buffer_comparator_test.cc b/xla/service/gpu/buffer_comparator_test.cc index cbef669abab7b..99a280af12659 100644 --- a/xla/service/gpu/buffer_comparator_test.cc +++ b/xla/service/gpu/buffer_comparator_test.cc @@ -21,9 +21,10 @@ limitations under the License. 
#include #include +#include "absl/strings/ascii.h" #include "xla/primitive_util.h" #include "xla/service/gpu/stream_executor_util.h" -#include "xla/service/hlo_module_config.h" +#include "xla/service/platform_util.h" #include "xla/shape_util.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/device_memory_handle.h" @@ -44,13 +45,11 @@ constexpr double kDefaultTolerance = 0.1; class BufferComparatorTest : public testing::Test { protected: BufferComparatorTest() -#if GOOGLE_CUDA - : platform_(se::PlatformManager::PlatformWithName("CUDA").value()), -#elif TENSORFLOW_USE_ROCM - : platform_(se::PlatformManager::PlatformWithName("ROCM").value()), -#endif - stream_exec_(platform_->ExecutorForDevice(0).value()) { - } + : platform_(stream_executor::PlatformManager::PlatformWithName( + absl::AsciiStrToUpper( + PlatformUtil::CanonicalPlatformName("gpu").value())) + .value()), + stream_exec_(platform_->ExecutorForDevice(0).value()) {} // Take floats only for convenience. Still uses ElementType internally. template diff --git a/xla/service/gpu/fusions/BUILD b/xla/service/gpu/fusions/BUILD index f75f04c0f58c4..15d96fa51193c 100644 --- a/xla/service/gpu/fusions/BUILD +++ b/xla/service/gpu/fusions/BUILD @@ -12,6 +12,7 @@ cc_library( name = "in_place_dynamic_update_slice_mlir", srcs = ["in_place_dynamic_update_slice_mlir.cc"], hdrs = ["in_place_dynamic_update_slice_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:xla_data_proto_cc", @@ -39,6 +40,7 @@ cc_library( name = "copy", srcs = ["copy.cc"], hdrs = ["copy.h"], + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla:shape_util", @@ -61,6 +63,7 @@ cc_library( srcs = ["custom.cc"], hdrs = ["custom.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla:literal", @@ -168,6 +171,7 @@ cc_library( name = "fusion_emitter", srcs = ["fusion_emitter.cc"], hdrs = ["fusion_emitter.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps = [ "//xla:shape_util", @@ -204,6 +208,7 @@ cc_library( name = "fusions", srcs = ["fusions.cc"], hdrs = ["fusions.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps = [ ":concatenate_mlir", @@ -245,6 +250,7 @@ cc_library( name = "loop_mlir", srcs = ["loop_mlir.cc"], hdrs = ["loop_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:status_macros", @@ -274,6 +280,7 @@ cc_library( name = "scatter_mlir", srcs = ["scatter_mlir.cc"], hdrs = ["scatter_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:xla_data_proto_cc", @@ -306,6 +313,7 @@ cc_library( name = "transpose_mlir", srcs = ["transpose_mlir.cc"], hdrs = ["transpose_mlir.h"], + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla:permutation_util", @@ -340,6 +348,7 @@ cc_library( name = "triton", srcs = ["triton.cc"], hdrs = ["triton.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps = [ ":fusion_emitter", @@ -379,7 +388,7 @@ cc_library( xla_cc_test( name = "triton_test", srcs = ["triton_test.cc"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":fusion_emitter", ":fusions", @@ -403,6 +412,7 @@ cc_library( srcs = ["cudnn.cc"], hdrs = ["cudnn.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":fusion_emitter", "//xla/hlo/ir:hlo", @@ -463,6 +473,7 @@ cc_library( name = "thunk_util", srcs = ["thunk_util.cc"], hdrs = ["thunk_util.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], 
deps = [ "//xla:literal", @@ -482,6 +493,7 @@ cc_library( name = "reduction_base", srcs = ["reduction_base.cc"], hdrs = ["reduction_base.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ ":fusion_emitter", @@ -517,6 +529,7 @@ cc_library( name = "reduction_mlir", srcs = ["reduction_mlir.cc"], hdrs = ["reduction_mlir.h"], + tags = ["gpu"], deps = [ ":fusion_emitter", ":reduction_base", @@ -558,6 +571,7 @@ cc_library( xla_cc_test( name = "reduction_base_test", srcs = ["reduction_base_test.cc"], + tags = ["gpu"], deps = [ ":reduction_base", "//xla/service/gpu:gpu_device_info_for_tests", @@ -572,6 +586,7 @@ cc_library( name = "concatenate_mlir", srcs = ["concatenate_mlir.cc"], hdrs = ["concatenate_mlir.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla/hlo/ir:hlo", @@ -598,6 +613,7 @@ cc_library( name = "input_slices_mlir", srcs = ["input_slices_mlir.cc"], hdrs = ["input_slices_mlir.h"], + tags = ["gpu"], deps = [ "//xla:util", "//xla:xla_data_proto_cc", diff --git a/xla/service/gpu/fusions/ir/tests/BUILD b/xla/service/gpu/fusions/ir/tests/BUILD index 381d5a3220b1d..6a1e332a2e9aa 100644 --- a/xla/service/gpu/fusions/ir/tests/BUILD +++ b/xla/service/gpu/fusions/ir/tests/BUILD @@ -9,6 +9,7 @@ lit_test_suite( name = "tests", srcs = glob(["*.mlir"]), cfg = "//xla:lit.cfg.py", + tags = ["gpu"], tools = [ "//xla/service/gpu/fusions/tools:mlir_fusions_opt", "@llvm-project//llvm:FileCheck", diff --git a/xla/service/gpu/fusions/ir/tests/inlining.mlir b/xla/service/gpu/fusions/ir/tests/inlining.mlir index f15b37b040b84..bc721d4b7b4bf 100644 --- a/xla/service/gpu/fusions/ir/tests/inlining.mlir +++ b/xla/service/gpu/fusions/ir/tests/inlining.mlir @@ -1,4 +1,4 @@ -// RUN: mlir_fusions_opt %s -split-input-file -xla-erase-dead-functions -inline | FileCheck %s +// RUN: env LD_DEBUG=files,libs mlir_fusions_opt %s -split-input-file -xla-erase-dead-functions -inline | FileCheck %s module { func.func private @mul(%a: f32, %b: f32) -> f32 { diff --git a/xla/service/gpu/fusions/legacy/BUILD b/xla/service/gpu/fusions/legacy/BUILD index 5c1705714483c..63bed338ca919 100644 --- a/xla/service/gpu/fusions/legacy/BUILD +++ b/xla/service/gpu/fusions/legacy/BUILD @@ -10,6 +10,7 @@ cc_library( name = "in_place_dynamic_update_slice", srcs = ["in_place_dynamic_update_slice.cc"], hdrs = ["in_place_dynamic_update_slice.h"], + tags = ["gpu"], deps = [ "//xla/hlo/ir:hlo", "//xla/service/gpu:hlo_fusion_analysis", @@ -33,6 +34,7 @@ cc_library( xla_cc_test( name = "in_place_dynamic_update_slice_test", srcs = ["in_place_dynamic_update_slice_test.cc"], + tags = ["gpu"], deps = [ ":in_place_dynamic_update_slice", "//xla/service/gpu:gpu_device_info_for_tests", @@ -53,6 +55,7 @@ cc_library( name = "loop", srcs = ["loop.cc"], hdrs = ["loop.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:util", @@ -83,6 +86,7 @@ cc_library( xla_cc_test( name = "loop_test", srcs = ["loop_test.cc"], + tags = ["gpu"], deps = [ "//xla:status_macros", "//xla/service/gpu:gpu_device_info_for_tests", @@ -105,6 +109,7 @@ cc_library( name = "scatter", srcs = ["scatter.cc"], hdrs = ["scatter.h"], + tags = ["gpu"], deps = [ ":loop", "//xla:shape_util", @@ -135,6 +140,7 @@ cc_library( xla_cc_test( name = "scatter_test", srcs = ["scatter_test.cc"], + tags = ["gpu"], deps = [ ":scatter", "//xla/service/gpu:gpu_device_info_for_tests", @@ -155,6 +161,7 @@ cc_library( name = "tiling_util", srcs = ["tiling_util.cc"], hdrs = ["tiling_util.h"], + tags = ["gpu"], visibility = ["//xla/service/gpu:__subpackages__"], deps 
= [ "//xla:shape_util", @@ -183,6 +190,7 @@ cc_library( name = "reduction", srcs = ["reduction.cc"], hdrs = ["reduction.h"], + tags = ["gpu"], deps = [ ":tiling_util", "//xla:shape_util", @@ -236,6 +244,7 @@ cc_library( xla_cc_test( name = "reduction_test", srcs = ["reduction_test.cc"], + tags = ["gpu"], deps = [ ":reduction", "//xla/service/gpu:gpu_device_info_for_tests", @@ -255,6 +264,7 @@ cc_library( name = "concatenate", srcs = ["concatenate.cc"], hdrs = ["concatenate.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla/hlo/ir:hlo", @@ -282,6 +292,7 @@ cc_library( xla_cc_test( name = "concatenate_test", srcs = ["concatenate_test.cc"], + tags = ["gpu"], deps = [ ":concatenate", "//xla/service/gpu:gpu_device_info_for_tests", @@ -301,6 +312,7 @@ cc_library( name = "transpose", srcs = ["transpose.cc"], hdrs = ["transpose.h"], + tags = ["gpu"], deps = [ ":tiling_util", "//xla:permutation_util", @@ -335,6 +347,7 @@ cc_library( xla_cc_test( name = "transpose_test", srcs = ["transpose_test.cc"], + tags = ["gpu"], deps = [ ":transpose", "//xla:status_macros", @@ -357,6 +370,7 @@ cc_library( name = "input_slices", srcs = ["input_slices.cc"], hdrs = ["input_slices.h"], + tags = ["gpu"], deps = [ "//xla:shape_util", "//xla:util", @@ -388,6 +402,7 @@ cc_library( xla_cc_test( name = "input_slices_test", srcs = ["input_slices_test.cc"], + tags = ["gpu"], deps = [ ":input_slices", "//xla/service/gpu:gpu_device_info_for_tests", diff --git a/xla/service/gpu/fusions/mlir/BUILD b/xla/service/gpu/fusions/mlir/BUILD index 29764a403dbd9..6b02be5bc37c5 100644 --- a/xla/service/gpu/fusions/mlir/BUILD +++ b/xla/service/gpu/fusions/mlir/BUILD @@ -17,6 +17,7 @@ cc_library( name = "computation_partitioner", srcs = ["computation_partitioner.cc"], hdrs = ["computation_partitioner.h"], + tags = ["gpu"], deps = [ ":type_util", "//xla:shape_util", @@ -47,6 +48,7 @@ cc_library( xla_cc_test( name = "computation_partitioner_test", srcs = ["computation_partitioner_test.cc"], + tags = ["gpu"], deps = [ ":computation_partitioner", "//xla/hlo/ir:hlo", @@ -64,6 +66,7 @@ cc_library( name = "elemental_hlo_to_mlir", srcs = ["elemental_hlo_to_mlir.cc"], hdrs = ["elemental_hlo_to_mlir.h"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":type_util", @@ -113,6 +116,7 @@ cc_library( xla_cc_test( name = "elemental_hlo_to_mlir_test", srcs = ["elemental_hlo_to_mlir_test.cc"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":elemental_hlo_to_mlir", @@ -153,6 +157,7 @@ cc_library( name = "mlir_fusion_emitter", srcs = ["mlir_fusion_emitter.cc"], hdrs = ["mlir_fusion_emitter.h"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":elemental_hlo_to_mlir", @@ -231,6 +236,7 @@ cc_library( xla_cc_test( name = "mlir_fusion_emitter_test", srcs = ["mlir_fusion_emitter_test.cc"], + tags = ["gpu"], deps = [ ":computation_partitioner", ":mlir_fusion_emitter", diff --git a/xla/service/gpu/fusions/tests/BUILD b/xla/service/gpu/fusions/tests/BUILD index d3e3b665e75d3..71bec2f5d4472 100644 --- a/xla/service/gpu/fusions/tests/BUILD +++ b/xla/service/gpu/fusions/tests/BUILD @@ -10,6 +10,7 @@ lit_test_suite( srcs = glob(["**/*.hlo"]), cfg = "//xla:lit.cfg.py", default_tags = ["requires-gpu-sm80-only"], + tags = ["gpu"], tools = [ "//xla/service/gpu/fusions/tools:fusion_to_mlir", "//xla/service/gpu/fusions/tools:mlir_fusions_opt", diff --git a/xla/service/gpu/fusions/tools/BUILD b/xla/service/gpu/fusions/tools/BUILD index 5b6c18f409287..7d1abdcb0456b 100644 --- a/xla/service/gpu/fusions/tools/BUILD +++ 
b/xla/service/gpu/fusions/tools/BUILD @@ -8,6 +8,7 @@ package( xla_cc_binary( name = "mlir_fusions_opt", srcs = ["mlir_fusions_opt.cc"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ "//xla/mlir_hlo", @@ -41,6 +42,7 @@ cc_library( testonly = 1, srcs = ["test_lib.cc"], hdrs = ["test_lib.h"], + tags = ["gpu"], deps = [ "//xla:status_macros", "//xla/hlo/ir:hlo", @@ -75,6 +77,7 @@ xla_cc_binary( name = "fusion_to_mlir", testonly = 1, srcs = ["fusion_to_mlir.cc"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ ":test_lib", @@ -90,6 +93,7 @@ xla_cc_binary( name = "test_correctness", testonly = 1, srcs = ["test_correctness.cc"], + tags = ["gpu"], visibility = ["//xla/service/gpu/fusions:__subpackages__"], deps = [ ":test_lib", diff --git a/xla/service/gpu/fusions/transforms/BUILD b/xla/service/gpu/fusions/transforms/BUILD index b26e0ec1ab9cb..18ad748f86743 100644 --- a/xla/service/gpu/fusions/transforms/BUILD +++ b/xla/service/gpu/fusions/transforms/BUILD @@ -53,6 +53,7 @@ cc_library( "vectorize_loads_stores.cc", ], hdrs = ["passes.h"], + tags = ["gpu"], deps = [ ":passes_inc_gen", "//xla:shape_util", diff --git a/xla/service/gpu/fusions/transforms/tests/BUILD b/xla/service/gpu/fusions/transforms/tests/BUILD index 381d5a3220b1d..6a1e332a2e9aa 100644 --- a/xla/service/gpu/fusions/transforms/tests/BUILD +++ b/xla/service/gpu/fusions/transforms/tests/BUILD @@ -9,6 +9,7 @@ lit_test_suite( name = "tests", srcs = glob(["*.mlir"]), cfg = "//xla:lit.cfg.py", + tags = ["gpu"], tools = [ "//xla/service/gpu/fusions/tools:mlir_fusions_opt", "@llvm-project//llvm:FileCheck", diff --git a/xla/service/gpu/fusions/triton/BUILD b/xla/service/gpu/fusions/triton/BUILD index 65acf4e075e8f..74fb1decf7659 100644 --- a/xla/service/gpu/fusions/triton/BUILD +++ b/xla/service/gpu/fusions/triton/BUILD @@ -34,6 +34,7 @@ cc_library( "compilation_pipeline_rocm.cc", ]), hdrs = ["triton_fusion_emitter.h"], + tags = ["gpu"], deps = [ ":passes", "//xla:autotuning_proto_cc", @@ -320,6 +321,7 @@ cc_library( testonly = True, srcs = ["triton_test_utils.cc"], hdrs = ["triton_test_utils.h"], + tags = ["gpu"], deps = [ ":triton_fusion_emitter", "//xla:shape_util", @@ -357,6 +359,7 @@ cc_library( xla_cc_test( name = "triton_fusion_emitter_mem_utils_test", srcs = if_cuda_is_configured(["triton_fusion_emitter_mem_utils_test.cc"]), + tags = ["gpu"], deps = [ ":triton_fusion_emitter", "//xla/hlo/ir:hlo", diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc index e43835397d095..932dc31cc5f9b 100644 --- a/xla/service/gpu/gpu_executable.cc +++ b/xla/service/gpu/gpu_executable.cc @@ -72,9 +72,7 @@ limitations under the License. 
#include "xla/stream_executor/device_memory_allocator.h" #include "xla/stream_executor/event_based_timer.h" #include "xla/stream_executor/gpu/scoped_activate_context.h" -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "xla/stream_executor/gpu/gpu_executor.h" -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "xla/stream_executor/module_spec.h" #include "xla/stream_executor/platform.h" #include "xla/stream_executor/rocm/rocm_platform_id.h" @@ -808,12 +806,10 @@ absl::StatusOr GpuExecutable::ExecuteAsyncOnStreamImpl( se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator(); se::StreamExecutor* executor = run_options->stream()->parent(); -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // GpuExecutable always bound to a single GpuContext during its execution, so // we activate it once to skip expensive context activations later. se::gpu::GpuExecutor* gpu_executor = se::gpu::ExtractGpuExecutor(executor); se::gpu::ScopedActivateContext activation(gpu_executor); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Force synchronous execution if the allocator requires it. const bool block_host_until_done = diff --git a/xla/service/gpu/kernels/BUILD b/xla/service/gpu/kernels/BUILD index c155276748894..4bde4d38cebe3 100644 --- a/xla/service/gpu/kernels/BUILD +++ b/xla/service/gpu/kernels/BUILD @@ -1,12 +1,10 @@ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library") -load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") load( "@tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) load("//xla:xla.bzl", "xla_cc_binary") load("//xla/service/gpu:build_defs.bzl", "gpu_kernel_library") -load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured") load("//xla/tests:build_defs.bzl", "DEFAULT_DISABLED_BACKENDS", "xla_test") load("//xla/tsl:tsl.bzl", "if_windows") @@ -210,12 +208,11 @@ cc_library( name = "topk_custom_kernel", srcs = ["topk_custom_kernel.cc"], hdrs = ["topk_custom_kernel.h"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([ - "TENSORFLOW_USE_ROCM=1", - ]), + tags = ["gpu"], visibility = [":friends"], deps = [ ":custom_kernel", + ":topk_kernel_gpu", "//xla:types", "//xla:xla_data_proto_cc", "//xla/stream_executor", @@ -225,9 +222,7 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@tsl//tsl/platform:statusor", - ] + if_gpu_is_configured([ - ":topk_kernel_gpu", - ]), + ], ) xla_test( diff --git a/xla/service/gpu/kernels/topk_custom_kernel.cc b/xla/service/gpu/kernels/topk_custom_kernel.cc index 2be74bf301ee1..a2611258acc10 100644 --- a/xla/service/gpu/kernels/topk_custom_kernel.cc +++ b/xla/service/gpu/kernels/topk_custom_kernel.cc @@ -27,6 +27,7 @@ limitations under the License. #include "absl/status/statusor.h" #include "absl/strings/str_cat.h" #include "xla/service/gpu/kernels/custom_kernel.h" +#include "xla/service/gpu/kernels/topk_kernel_common.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/kernel.h" #include "xla/stream_executor/kernel_spec.h" @@ -35,14 +36,8 @@ limitations under the License. 
#include "xla/xla_data.pb.h" #include "tsl/platform/statusor.h" -#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) -#include "xla/service/gpu/kernels/topk_kernel_common.h" -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM - namespace xla::gpu::kernel::topk { -#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - namespace { using KernelArgsPacking = se::MultiKernelLoaderSpec::KernelArgsPacking; @@ -135,16 +130,4 @@ absl::StatusOr GetTopKKernel(std::string name, } } -#else - -// Fallback implementation of creating a CustomKernel for TopK operation. -absl::StatusOr GetTopKKernel(std::string name, - PrimitiveType dtype, - size_t num_elements, size_t k, - size_t batch_size) { - return absl::InternalError("XLA compiled without CUDA support"); -} - -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM - } // namespace xla::gpu::kernel::topk diff --git a/xla/service/gpu/matmul_utils.h b/xla/service/gpu/matmul_utils.h index 5f128e418af58..02605ea0c9a6a 100644 --- a/xla/service/gpu/matmul_utils.h +++ b/xla/service/gpu/matmul_utils.h @@ -36,10 +36,6 @@ limitations under the License. #include "xla/stream_executor/gpu/gpu_blas_lt.h" #include "xla/xla_data.pb.h" -#if TENSORFLOW_USE_ROCM -#include "rocm/rocm_config.h" -#endif - namespace xla { namespace gpu { diff --git a/xla/service/gpu/model/BUILD b/xla/service/gpu/model/BUILD index 756a7a3330440..ebb795cd6433b 100644 --- a/xla/service/gpu/model/BUILD +++ b/xla/service/gpu/model/BUILD @@ -25,6 +25,7 @@ cc_library( name = "analytical_latency_estimator", srcs = ["analytical_latency_estimator.cc"], hdrs = ["analytical_latency_estimator.h"], + tags = ["gpu"], deps = [ ":gpu_collective_performance_model", ":gpu_hlo_cost_analysis", @@ -102,6 +103,7 @@ cc_library( name = "gpu_cost_model_stats_collection", srcs = ["gpu_cost_model_stats_collection.cc"], hdrs = ["gpu_cost_model_stats_collection.h"], + tags = ["gpu"], deps = [ ":gpu_hlo_cost_analysis", ":gpu_performance_model", @@ -121,6 +123,7 @@ cc_library( xla_cc_test( name = "gpu_cost_model_stats_collection_test", srcs = ["gpu_cost_model_stats_collection_test.cc"], + tags = ["gpu"], deps = [ ":gpu_cost_model_stats_collection", ":gpu_hlo_cost_analysis", @@ -191,6 +194,7 @@ cc_library( name = "gpu_performance_model_base", srcs = ["gpu_performance_model_base.cc"], hdrs = ["gpu_performance_model_base.h"], + tags = ["gpu"], deps = [ ":fusion_analysis_cache", ":gpu_hlo_cost_analysis", @@ -219,6 +223,7 @@ cc_library( xla_cc_test( name = "gpu_performance_model_base_test", srcs = ["gpu_performance_model_base_test.cc"], + tags = ["gpu"], deps = [ ":gpu_hlo_cost_analysis", ":gpu_performance_model_base", @@ -241,6 +246,7 @@ cc_library( name = "gpu_performance_model", srcs = ["gpu_performance_model.cc"], hdrs = ["gpu_performance_model.h"], + tags = ["gpu"], deps = [ ":coalescing_analysis", ":gpu_hlo_cost_analysis", @@ -263,6 +269,7 @@ cc_library( xla_cc_test( name = "gpu_performance_model_test", srcs = ["gpu_performance_model_test.cc"], + tags = ["gpu"], deps = [ ":fusion_analysis_cache", ":gpu_hlo_cost_analysis", @@ -294,6 +301,7 @@ cc_library( srcs = ["gpu_collective_performance_model.cc"], hdrs = ["gpu_collective_performance_model.h"], local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":coalescing_analysis", ":fusion_analysis_cache", @@ -332,6 +340,7 @@ cc_library( xla_cc_test( name = "gpu_collective_performance_model_test", srcs = ["gpu_collective_performance_model_test.cc"], + tags = ["gpu"], deps = [ "//xla/service/gpu:backend_configs_cc", "//xla/tests:hlo_test_base", @@ -344,6 
+353,7 @@ cc_library( name = "gpu_indexing_performance_model", srcs = ["gpu_indexing_performance_model.cc"], hdrs = ["gpu_indexing_performance_model.h"], + tags = ["gpu"], deps = [ ":coalescing_analysis", ":fusion_analysis_cache", @@ -384,6 +394,7 @@ cc_library( xla_cc_test( name = "gpu_indexing_performance_model_test", srcs = ["gpu_indexing_performance_model_test.cc"], + tags = ["gpu"], deps = [ ":fusion_analysis_cache", ":gpu_hlo_cost_analysis", @@ -791,6 +802,7 @@ cc_library( name = "coalescing_analysis", srcs = ["coalescing_analysis.cc"], hdrs = ["coalescing_analysis.h"], + tags = ["gpu"], deps = [ ":affine_map_evaluator", ":indexing_analysis", @@ -814,6 +826,7 @@ cc_library( xla_cc_test( name = "coalescing_analysis_test", srcs = ["coalescing_analysis_test.cc"], + tags = ["gpu"], deps = [ ":coalescing_analysis", ":symbolic_tile", diff --git a/xla/service/gpu/tests/BUILD b/xla/service/gpu/tests/BUILD index f0eb7f9481a14..4cfe8ed5c4adf 100644 --- a/xla/service/gpu/tests/BUILD +++ b/xla/service/gpu/tests/BUILD @@ -674,6 +674,7 @@ lit_test_suite( ], default_tags = tf_cuda_tests_tags(), hermetic_cuda_data_dir = "%S/../../../../../cuda_nvcc", + tags = ["gpu"], tags_override = { "element_wise_row_vectorization.hlo": ["cuda-only"], "scatter_bf16.hlo": ["cuda-only"], @@ -702,6 +703,7 @@ lit_test_suite( # cc_binary( # name = "xla-opt", # srcs = ["xla-opt.cc"], +# tags = ["gpu"], # deps = [ # "//xla/service/gpu/fusions/transforms:passes", # "//xla/service/gpu/fusions/triton:passes", diff --git a/xla/service/gpu/transforms/BUILD b/xla/service/gpu/transforms/BUILD index ed27641245360..0a2255048b8ba 100644 --- a/xla/service/gpu/transforms/BUILD +++ b/xla/service/gpu/transforms/BUILD @@ -3,10 +3,6 @@ load( "if_cuda_is_configured", ) load("//xla:xla.bzl", "xla_cc_test") -load( - "//xla/stream_executor:build_defs.bzl", - "if_gpu_is_configured", -) load("//xla/tests:build_defs.bzl", "xla_test") load("//xla/tsl:tsl.bzl", "if_google", "if_oss") @@ -300,7 +296,6 @@ xla_cc_test( "//xla:literal_util", "//xla/hlo/ir:hlo", "//xla/hlo/pass:hlo_pass", - "//xla/service:hlo_proto_cc", "//xla/tests:hlo_test_base", "//xla/tests:literal_test_util", "//xla/tests:verified_hlo_module", @@ -339,6 +334,7 @@ cc_library( name = "fusion_block_level_rewriter", srcs = ["fusion_block_level_rewriter.cc"], hdrs = ["fusion_block_level_rewriter.h"], + tags = ["gpu"], deps = [ "//xla/hlo/ir:hlo", "//xla/hlo/pass:hlo_pass", @@ -366,6 +362,7 @@ cc_library( xla_cc_test( name = "fusion_block_level_rewriter_test", srcs = ["fusion_block_level_rewriter_test.cc"], + tags = ["gpu"], deps = [ ":fusion_block_level_rewriter", "//xla:shape_util", @@ -532,10 +529,7 @@ cc_library( xla_test( name = "command_buffer_scheduling_test", srcs = ["command_buffer_scheduling_test.cc"], - backends = [ - "cpu", - "gpu", - ], + backends = ["gpu"], deps = [ ":command_buffer_scheduling", "//xla/hlo/ir:hlo", @@ -785,9 +779,6 @@ xla_cc_test( srcs = ["cudnn_custom_call_converter_test.cc"], deps = [ ":cudnn_custom_call_converter", - "//xla/hlo/ir:hlo", - "//xla/service:pattern_matcher", - "//xla/service:pattern_matcher_gmock", "//xla/tests:hlo_test_base", "//xla/tests:xla_internal_test_main", "@com_google_googletest//:gtest", @@ -930,6 +921,7 @@ xla_test( ":cudnn_fused_mha_rewriter", ":cudnn_fused_mha_transpose_fusion", "//xla:error_spec", + "//xla:shape_util", "//xla:test_helpers", "//xla:util", "//xla:xla_data_proto_cc", @@ -991,9 +983,20 @@ cc_library( # Tested via //third_party/tensorflow/compiler/xla/service/gpu/fusions:cudnn_test cc_library( name = 
"cudnn_fusion_compiler", - srcs = if_cuda_is_configured(["cudnn_fusion_compiler.cc"]), - hdrs = if_cuda_is_configured(["cudnn_fusion_compiler.h"]), - deps = if_cuda_is_configured([ + srcs = ["cudnn_fusion_compiler.cc"], + hdrs = ["cudnn_fusion_compiler.h"], + tags = [ + "cuda-only", + "gpu", + ], + deps = [ + "//xla:comparison_util", + "//xla:shape_util", + "//xla:util", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + "//xla/hlo/utils:hlo_query", + "//xla/service:dump", "//xla/service/gpu:backend_configs_cc", "//xla/service/gpu:cudnn_support_utils", "//xla/service/gpu:ir_emission_utils", @@ -1001,6 +1004,10 @@ cc_library( "//xla/service/gpu:matmul_utils", "//xla/service/gpu:stream_executor_util", "//xla/service/gpu:triton_fusion_analysis", + "//xla/stream_executor:dnn", + "//xla/stream_executor:stream_executor_h", + "//xla/stream_executor/cuda:cudnn_frontend_helpers", + "//xla/stream_executor/cuda:cudnn_plugin", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -1010,27 +1017,19 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:string_view", "@local_config_cuda//cuda:cudnn_header", - "//xla:shape_util", - "//xla:comparison_util", - "//xla:util", - "//xla/hlo/ir:hlo", - "//xla/hlo/utils:hlo_query", - "//xla/hlo/pass:hlo_pass", - "//xla/stream_executor:dnn", - "//xla/stream_executor:stream_executor_h", - "//xla/service:dump", - "//xla/stream_executor/cuda:cudnn_frontend_helpers", - "//xla/stream_executor/cuda:cudnn_plugin", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:statusor", - ]), + ], ) cc_library( name = "cudnn_norm_rewriter", srcs = ["cudnn_norm_rewriter.cc"], hdrs = ["cudnn_norm_rewriter.h"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = [ + "cuda-only", + "gpu", + ], deps = [ "//xla:shape_util", "//xla:types", @@ -1051,13 +1050,11 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", + "@local_config_cuda//cuda:cudnn_header", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:logging", "@tsl//tsl/platform:statusor", - ] + if_cuda_is_configured([ - "@local_config_cuda//cuda:cuda_headers", - "@local_config_cuda//cuda:cudnn_header", - ]) + if_google([ + ] + if_google([ "@com_google_protobuf//:wrappers_cc_proto", ]), ) @@ -1066,7 +1063,7 @@ xla_test( name = "cudnn_norm_rewriter_test", srcs = ["cudnn_norm_rewriter_test.cc"], backends = ["gpu"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["cuda-only"], deps = [ ":cudnn_norm_rewriter", "//xla:error_spec", @@ -1076,10 +1073,9 @@ xla_test( "//xla/tests:filecheck", "//xla/tsl/lib/core:status_test_util", "@com_google_googletest//:gtest_main", - ] + if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cudnn_header", - ]), + ], ) cc_library( @@ -1238,34 +1234,37 @@ xla_cc_test( # TODO(b/358278858): Currently lacking test coverage. 
cc_library( name = "cudnn_custom_call_compiler", - srcs = if_cuda_is_configured(["cudnn_custom_call_compiler.cc"]), - hdrs = if_cuda_is_configured(["cudnn_custom_call_compiler.h"]), - deps = if_cuda_is_configured([ - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@local_config_cuda//cuda:cudnn_header", + srcs = ["cudnn_custom_call_compiler.cc"], + hdrs = ["cudnn_custom_call_compiler.h"], + tags = [ + "cuda-only", + "gpu", + ], + deps = [ "//xla:shape_util", "//xla:status_macros", "//xla:util", "//xla/hlo/ir:hlo", "//xla/hlo/pass:hlo_pass", - "//xla/service/gpu/runtime:cudnn_thunk", "//xla/service/gpu:backend_configs_cc", "//xla/service/gpu:cublas_cudnn", "//xla/service/gpu:ir_emission_utils", "//xla/service/gpu:stream_executor_util", + "//xla/service/gpu/runtime:cudnn_thunk", "//xla/stream_executor:dnn", "//xla/stream_executor:stream_executor_h", "//xla/stream_executor/cuda:cudnn_frontend_helpers", "//xla/stream_executor/cuda:cudnn_plugin", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:statusor", - ]) + ["@com_google_absl//absl/container:flat_hash_map"], + ], ) cc_library( @@ -1363,31 +1362,25 @@ cc_library( xla_test( name = "dot_operand_converter_test", - srcs = if_gpu_is_configured(["dot_operand_converter_test.cc"]), + srcs = ["dot_operand_converter_test.cc"], backends = [ "gpu_a100", "gpu_p100", "gpu_v100", "gpu_amd_any", ], - deps = if_gpu_is_configured( - [ - ":dot_operand_converter", - "@com_google_googletest//:gtest", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:string_view", - "//xla:shape_util", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/hlo/utils:hlo_matchers", - "//xla/service:pattern_matcher", - "//xla/tests:hlo_test_base", - "//xla/tests:xla_internal_test_main", - "@tsl//tsl/platform:statusor", - ], - ) + [ - # b/317293391 - "@tsl//tsl/platform:test_main", + deps = [ + ":dot_operand_converter", + "//xla:shape_util", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/hlo/utils:hlo_matchers", + "//xla/tests:hlo_test_base", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest", + "@tsl//tsl/platform:statusor", ], ) @@ -1527,19 +1520,14 @@ xla_cc_test( "//xla/ffi", "//xla/ffi:ffi_api", "//xla/hlo/ir:hlo", - "//xla/service:buffer_value", "//xla/service:custom_call_target_registry", - "//xla/service:executable", - "//xla/service:hlo_memory_scheduler", "//xla/service:hlo_module_config", "//xla/service/gpu:gpu_device_info_for_tests", "//xla/stream_executor", "//xla/stream_executor/gpu:gpu_types_header", "//xla/tests:filecheck", "//xla/tests:hlo_test_base", - "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/status", - "@tsl//tsl/platform:status", "@tsl//tsl/platform:statusor", "@tsl//tsl/platform:test", "@tsl//tsl/platform:test_main", @@ -1550,6 +1538,7 @@ cc_library( name = "fusion_merger", srcs = ["fusion_merger.cc"], hdrs = ["fusion_merger.h"], + tags = 
["gpu"], deps = [ "//xla:shape_util", "//xla:util", @@ -1578,6 +1567,7 @@ xla_cc_test( name = "fusion_merger_test", srcs = ["fusion_merger_test.cc"], tags = [ + "gpu", "nomsan", ], deps = [ @@ -1835,13 +1825,10 @@ xla_cc_test( # TODO(b/358278858): Currently lacking test coverage. cc_library( name = "gpusolver_rewriter", - srcs = if_gpu_is_configured(["gpusolver_rewriter.cc"]), - hdrs = if_gpu_is_configured(["gpusolver_rewriter.h"]), - deps = if_gpu_is_configured([ - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", + srcs = ["gpusolver_rewriter.cc"], + hdrs = ["gpusolver_rewriter.h"], + tags = ["gpu"], + deps = [ "//xla:comparison_util", "//xla:literal", "//xla:literal_util", @@ -1852,14 +1839,15 @@ cc_library( "//xla/hlo/pass:hlo_pass", "//xla/service/gpu:cusolver_context", "//xla/service/gpu:ir_emission_utils", - "//xla/stream_executor", "//xla/stream_executor:blas", - "//xla/stream_executor:device_memory_allocator", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", "@tsl//tsl/platform:errors", "@tsl//tsl/platform:logging", - "@tsl//tsl/platform:status", "@tsl//tsl/platform:statusor", - ]), + ], ) cc_library( @@ -2103,6 +2091,7 @@ cc_library( name = "multi_output_fusion", srcs = ["multi_output_fusion.cc"], hdrs = ["multi_output_fusion.h"], + tags = ["gpu"], deps = [ "//xla:debug_options_flags", "//xla:shape_util", @@ -2134,6 +2123,7 @@ xla_cc_test( name = "multi_output_fusion_test", srcs = ["multi_output_fusion_test.cc"], tags = [ + "gpu", "nomsan", ], deps = [ @@ -2252,6 +2242,7 @@ cc_library( name = "priority_fusion", srcs = ["priority_fusion.cc"], hdrs = ["priority_fusion.h"], + tags = ["gpu"], deps = [ "//xla:debug_options_flags", "//xla:shape_util", @@ -2302,7 +2293,7 @@ cc_library( xla_cc_test( name = "priority_fusion_test", srcs = ["priority_fusion_test.cc"], - local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]), + tags = ["gpu"], deps = [ ":priority_fusion", "//xla:shape_util", @@ -2688,6 +2679,7 @@ cc_library( name = "softmax_rewriter_triton", srcs = ["softmax_rewriter_triton.cc"], hdrs = ["softmax_rewriter_triton.h"], + tags = ["gpu"], deps = [ ":reduction_dimension_grouper", ":reduction_splitter", @@ -2734,6 +2726,7 @@ cc_library( xla_cc_test( name = "softmax_rewriter_triton_test", srcs = ["softmax_rewriter_triton_test.cc"], + tags = ["gpu"], deps = [ ":softmax_rewriter_triton", "//xla:shape_util", @@ -2759,11 +2752,9 @@ xla_cc_test( cc_library( name = "sort_rewriter", - srcs = if_gpu_is_configured( - ["sort_rewriter.cc"], - ["sort_rewriter_stub.cc"], - ), + srcs = ["sort_rewriter.cc"], hdrs = ["sort_rewriter.h"], + tags = ["gpu"], deps = [ "//xla:comparison_util", "//xla:shape_util", @@ -2786,7 +2777,7 @@ cc_library( xla_test( name = "sort_rewriter_test", - srcs = if_cuda_is_configured(["sort_rewriter_test.cc"]), + srcs = ["sort_rewriter_test.cc"], backends = ["gpu"], tags = ["no_oss"], deps = [ diff --git a/xla/service/gpu/transforms/cudnn_fused_mha_rewriter_test.cc b/xla/service/gpu/transforms/cudnn_fused_mha_rewriter_test.cc index a64fd0624bea6..d87490fcfec47 100644 --- a/xla/service/gpu/transforms/cudnn_fused_mha_rewriter_test.cc +++ b/xla/service/gpu/transforms/cudnn_fused_mha_rewriter_test.cc @@ -18,13 +18,13 @@ limitations under the License. 
 #include
 #include
 #include
-#include
 #include

 #include "absl/algorithm/container.h"
 #include "absl/strings/string_view.h"

-#include "xla/error_spec.h"
+#include "third_party/gpus/cuda/include/cuda.h"
+#include "third_party/gpus/cudnn/cudnn.h"  // IWYU pragma: keep
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/service/algebraic_simplifier.h"
 #include "xla/service/computation_layout.h"
@@ -40,6 +40,7 @@ limitations under the License.
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/pattern_matcher_gmock.h"
 #include "xla/service/reshape_decomposer.h"
+#include "xla/shape_util.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/dnn.h"
 #include "xla/test_helpers.h"
@@ -49,11 +50,6 @@ limitations under the License.
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/statusor.h"

-#if GOOGLE_CUDA
-#include "third_party/gpus/cuda/include/cuda.h"
-#include "third_party/gpus/cudnn/cudnn.h"  // IWYU pragma: keep
-#endif
-
 namespace xla {
 namespace gpu {
 namespace {
@@ -87,7 +83,7 @@ class CudnnFusedMhaRewriterTestHloTest : public HloTestBase {
       : HloTestBase(/*verifier_layout_sensitive=*/false,
                     /*allow_mixed_precision_in_hlo_verifier=*/false,
                     /*instruction_can_change_layout_func=*/{}) {
-#if !defined(GOOGLE_CUDA) || CUDA_VERSION < 12000
+#if CUDA_VERSION < 12000
     skip_reason_ = "cuDNN fused MHA requires CUDA 12 or later.";
     return;
 #endif
diff --git a/xla/service/gpu/transforms/cudnn_norm_rewriter.cc b/xla/service/gpu/transforms/cudnn_norm_rewriter.cc
index 752549dc7ec50..8bbdfde9118c7 100644
--- a/xla/service/gpu/transforms/cudnn_norm_rewriter.cc
+++ b/xla/service/gpu/transforms/cudnn_norm_rewriter.cc
@@ -49,12 +49,7 @@ limitations under the License.
 #include "tsl/platform/errors.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/statusor.h"
-
-#if GOOGLE_CUDA
-#include "third_party/gpus/cuda/include/cuda.h"  // IWYU pragma: keep
-#include "third_party/gpus/cudnn/cudnn.h"  // IWYU pragma: keep
 #include "third_party/gpus/cudnn/cudnn_version.h"
-#endif

 namespace xla {
 namespace gpu {
diff --git a/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc b/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
index a3dbc71132949..255799b84c174 100644
--- a/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
+++ b/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
@@ -16,16 +16,12 @@ limitations under the License.
 #include
 #include

-#include "xla/error_spec.h"
-#include "xla/stream_executor/device_description.h"
-
-#if GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cudnn/cudnn.h"  // IWYU pragma: keep
 #include "third_party/gpus/cudnn/cudnn_version.h"
-#endif
-
+#include "xla/error_spec.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
+#include "xla/stream_executor/device_description.h"

 namespace xla {
 namespace gpu {
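The cuDNN-specific passes above follow the same scheme with one refinement: targets that only make sense with CUDA are tagged "cuda-only" (in addition to "gpu") earlier in xla/service/gpu/transforms/BUILD and now list their cuDNN dependencies unconditionally, instead of compiling to an empty shell under local_defines = if_cuda_is_configured(...). A hedged sketch with a hypothetical pass name; xla_test, backends, and the tag names are taken from this change, while the filter flag in the comment is an assumption about how CI consumes them:

    load("//xla/tests:build_defs.bzl", "xla_test")

    cc_library(
        name = "example_cudnn_pass",  # hypothetical
        srcs = ["example_cudnn_pass.cc"],
        tags = [
            "cuda-only",
            "gpu",
        ],
        deps = ["@local_config_cuda//cuda:cudnn_header"],
    )

    xla_test(
        name = "example_cudnn_pass_test",  # hypothetical
        srcs = ["example_cudnn_pass_test.cc"],
        backends = ["gpu"],
        # ROCm/CPU configurations drop the test via tag filtering,
        # e.g. --test_tag_filters=-cuda-only.
        tags = ["cuda-only"],
        deps = [":example_cudnn_pass"],
    )
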
diff --git a/xla/service/gpu/transforms/sort_rewriter_stub.cc b/xla/service/gpu/transforms/sort_rewriter_stub.cc
deleted file mode 100644
index e9bf60cdb4c9b..0000000000000
--- a/xla/service/gpu/transforms/sort_rewriter_stub.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "absl/container/flat_hash_set.h"
-#include "absl/strings/string_view.h"
-#include "xla/hlo/ir/hlo_computation.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_instructions.h"
-#include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/transforms/sort_rewriter.h"
-#include "tsl/platform/statusor.h"
-
-namespace xla {
-namespace gpu {
-
-absl::StatusOr<bool> SortRewriter::RunOnInstruction(
-    HloSortInstruction* sort_op) {
-  return false;
-}
-
-absl::StatusOr<bool> SortRewriter::RunOnComputation(
-    HloComputation* computation) {
-  return false;
-}
-
-absl::StatusOr<bool> SortRewriter::Run(
-    HloModule* module,
-    const absl::flat_hash_set<absl::string_view>& execution_threads) {
-  return false;
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/xla/tools/hlo_opt/BUILD b/xla/tools/hlo_opt/BUILD
index 0a86b663a69df..f4b1c028b393b 100644
--- a/xla/tools/hlo_opt/BUILD
+++ b/xla/tools/hlo_opt/BUILD
@@ -55,7 +55,7 @@ cc_library(
 cc_library(
     name = "gpu_opt",
     testonly = True,
-    srcs = if_gpu_is_configured(["gpu_opt.cc"]),
+    srcs = ["gpu_opt.cc"],
     tags = ["gpu"],
     deps = [
         ":opt_lib",
@@ -66,12 +66,14 @@ cc_library(
         "//xla/service:compiler",
         "//xla/service:dump",
         "//xla/service:executable",
+        "//xla/service:gpu_plugin",
         "//xla/service:hlo_graph_dumper",
         "//xla/service:platform_util",
         "//xla/service/gpu:buffer_sharing",
         "//xla/service/gpu:compile_module_to_llvm_ir",
         "//xla/service/gpu:executable_proto_cc",
         "//xla/service/gpu:gpu_compiler",
+        "//xla/service/gpu:gpu_executable",
         "//xla/service/gpu:gpu_hlo_schedule",
         "//xla/service/llvm_ir:llvm_util",
         "//xla/stream_executor",
@@ -82,10 +84,7 @@ cc_library(
         "@llvm-project//llvm:ir_headers",
         "@tsl//tsl/platform:errors",
         "@tsl//tsl/platform:statusor",
-    ] + if_gpu_is_configured([
-        "//xla/service:gpu_plugin",
-        "//xla/service/gpu:gpu_executable",
-    ]) + if_cuda_is_configured([
+    ] + if_cuda_is_configured([
         "//xla/stream_executor:cuda_platform",
     ]) + if_rocm_is_configured([
         "//xla/stream_executor:rocm_platform",
     ]),
 )
@@ -176,6 +175,7 @@ lit_test_suite(
     data = [":test_utilities"],
     default_tags = tf_cuda_tests_tags(),
     hermetic_cuda_data_dir = "%S/../../../../cuda_nvcc",
+    tags = ["gpu"],
     tags_override = {
         "gpu_hlo_ptx.hlo": ["cuda-only"],
     },
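Finally, the xla/lit.bzl hunk at the very top of this change makes the generated lit tests export LD_LIBRARY_PATH, which presumably lets GPU-tagged lit suites resolve shared runtime libraries (compare the hermetic_cuda_data_dir attributes above) at run time rather than depending on a GPU-configured build. A simplified, hypothetical macro illustrating the same env propagation; the real change only assigns env["LD_LIBRARY_PATH"] = lib_dir before calling native_test:

    # Hypothetical stand-in for the lit_test/native_test wrapper; `lib_dir` is
    # assumed to point at the directory holding the GPU runtime libraries.
    def gpu_lit_test(name, src, lib_dir):
        env = {"LD_LIBRARY_PATH": lib_dir}
        native.sh_test(
            name = name,
            srcs = [src],
            env = env,  # forwarded into the test's runtime environment
        )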