From 1c58aacbc8a0d40408368ce7551aa1601300eea7 Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Tue, 23 Jul 2024 12:34:17 -0700
Subject: [PATCH] [dtensor] move ops to private (#131211)

as titled

Differential Revision: [D60132519](https://our.internmc.facebook.com/intern/diff/D60132519)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131211
Approved by: https://github.com/XilunWu, https://github.com/wz337
ghstack dependencies: #131212
---
 test/distributed/_tensor/test_common_rules.py | 2 +-
 test/distributed/_tensor/test_embedding_ops.py | 2 +-
 test/distributed/_tensor/test_op_strategy.py | 8 ++++----
 test/distributed/_tensor/test_tensor_ops.py | 2 +-
 test/distributed/_tensor/test_view_ops.py | 2 +-
 torch/distributed/_spmd/batch_dim_utils.py | 2 +-
 torch/distributed/_spmd/experimental_ops.py | 2 +-
 torch/distributed/_tensor/ops/__init__.py | 18 +++++++++---------
 .../ops/{common_rules.py => _common_rules.py} | 0
 .../_tensor/ops/{conv_ops.py => _conv_ops.py} | 0
 .../{basic_strategy.py => _einsum_strategy.py} | 0
 .../{embedding_ops.py => _embedding_ops.py} | 0
 ...xperimental_ops.py => _experimental_ops.py} | 0
 .../_tensor/ops/{math_ops.py => _math_ops.py} | 0
 .../ops/{matrix_ops.py => _matrix_ops.py} | 2 +-
 .../{pointwise_ops.py => _pointwise_ops.py} | 0
 .../ops/{random_ops.py => _random_ops.py} | 0
 .../ops/{tensor_ops.py => _tensor_ops.py} | 4 ++--
 .../_tensor/ops/{view_ops.py => _view_ops.py} | 0
 torch/distributed/tensor/parallel/loss.py | 4 ++--
 20 files changed, 24 insertions(+), 24 deletions(-)
 rename torch/distributed/_tensor/ops/{common_rules.py => _common_rules.py} (100%)
 rename torch/distributed/_tensor/ops/{conv_ops.py => _conv_ops.py} (100%)
 rename torch/distributed/_tensor/ops/{basic_strategy.py => _einsum_strategy.py} (100%)
 rename torch/distributed/_tensor/ops/{embedding_ops.py => _embedding_ops.py} (100%)
 rename torch/distributed/_tensor/ops/{experimental_ops.py => _experimental_ops.py} (100%)
 rename torch/distributed/_tensor/ops/{math_ops.py => _math_ops.py} (100%)
 rename torch/distributed/_tensor/ops/{matrix_ops.py => _matrix_ops.py} (99%)
 rename torch/distributed/_tensor/ops/{pointwise_ops.py => _pointwise_ops.py} (100%)
 rename torch/distributed/_tensor/ops/{random_ops.py => _random_ops.py} (100%)
 rename torch/distributed/_tensor/ops/{tensor_ops.py => _tensor_ops.py} (99%)
 rename torch/distributed/_tensor/ops/{view_ops.py => _view_ops.py} (100%)

diff --git a/test/distributed/_tensor/test_common_rules.py b/test/distributed/_tensor/test_common_rules.py
index 895fb4186020b..77b5d91405a73 100644
--- a/test/distributed/_tensor/test_common_rules.py
+++ b/test/distributed/_tensor/test_common_rules.py
@@ -4,7 +4,7 @@
 import torch
 from torch.distributed._tensor import DeviceMesh
 from torch.distributed._tensor._op_schema import OpSchema
-from torch.distributed._tensor.ops.common_rules import einop_rule, pointwise_rule
+from torch.distributed._tensor.ops._common_rules import einop_rule, pointwise_rule
 from torch.distributed._tensor.placement_types import DTensorSpec, TensorMeta
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
diff --git a/test/distributed/_tensor/test_embedding_ops.py b/test/distributed/_tensor/test_embedding_ops.py
index 4eb78136aabf5..7822962864cdf 100644
--- a/test/distributed/_tensor/test_embedding_ops.py
+++ b/test/distributed/_tensor/test_embedding_ops.py
@@ -167,7 +167,7 @@ def test_sharded_embedding_rowwise(self):
         self._run_embedding_op_test(mesh, 0, [6, 7, 6], 13, 22)
         self._run_embedding_op_test(mesh, 0, [34], 15, 14, padding_idx=10)
 
-        from torch.distributed._tensor.ops.embedding_ops import _MaskPartial
+        from torch.distributed._tensor.ops._embedding_ops import _MaskPartial
 
         # test collectives
         embedding_mod = torch.nn.Embedding(10, 20, device=self.device_type)
diff --git a/test/distributed/_tensor/test_op_strategy.py b/test/distributed/_tensor/test_op_strategy.py
index d6513f5c750fd..302e2675cc899 100644
--- a/test/distributed/_tensor/test_op_strategy.py
+++ b/test/distributed/_tensor/test_op_strategy.py
@@ -6,7 +6,7 @@
 from torch.distributed._tensor import DeviceMesh, DTensor
 from torch.distributed._tensor._collective_utils import redistribute_cost
 from torch.distributed._tensor._op_schema import OpSchema, OpStrategy, PlacementStrategy
-from torch.distributed._tensor.ops.basic_strategy import (
+from torch.distributed._tensor.ops._einsum_strategy import (
     EinsumDims,
     gen_einsum_strategies,
 )
@@ -169,7 +169,7 @@ def test_redistribute_cost_mesh_1d(self):
 
     def test_redistribute_cost_latency(self):
         # test cost model on addmm op
-        from torch.distributed._tensor.ops.matrix_ops import addmm_strategy
+        from torch.distributed._tensor.ops._matrix_ops import addmm_strategy
 
         mesh = self.build_device_mesh()
         shard0_placement = (Shard(0),)
@@ -246,7 +246,7 @@ def test_redistribute_cost_mesh_2d(self):
         self.assertTrue(allreduce_cost > reduce_scatter_cost)
 
     def test_mm_strategies(self):
-        from torch.distributed._tensor.ops.matrix_ops import mm_strategy
+        from torch.distributed._tensor.ops._matrix_ops import mm_strategy
 
         mesh = self.build_device_mesh()
         lhs_tensor = torch.randn(6, 8)
@@ -292,7 +292,7 @@ def test_mm_strategies(self):
         self.assertFalse(output_sharding.needs_redistribute)
 
     def test_bmm_strategies(self):
-        from torch.distributed._tensor.ops.matrix_ops import bmm_strategy
+        from torch.distributed._tensor.ops._matrix_ops import bmm_strategy
 
         mesh = self.build_device_mesh()
         lhs_tensor = torch.randn(8, 6, 8)
diff --git a/test/distributed/_tensor/test_tensor_ops.py b/test/distributed/_tensor/test_tensor_ops.py
index 539a038372e26..1cb8ae51104d6 100644
--- a/test/distributed/_tensor/test_tensor_ops.py
+++ b/test/distributed/_tensor/test_tensor_ops.py
@@ -445,7 +445,7 @@ def test_gather(self):
         # case 2 input sharding: input sharded, index replicated, output mask partial
         # only works when index has size 1 on the gather dimension and
         # input is sharded on the gather dimension
-        from torch.distributed._tensor.ops.embedding_ops import _MaskPartial
+        from torch.distributed._tensor.ops._embedding_ops import _MaskPartial
 
         gather_dim = 1
         global_input = torch.randn(12, 8, 16)
diff --git a/test/distributed/_tensor/test_view_ops.py b/test/distributed/_tensor/test_view_ops.py
index 2ea89e34789bf..8ace53d97131b 100644
--- a/test/distributed/_tensor/test_view_ops.py
+++ b/test/distributed/_tensor/test_view_ops.py
@@ -9,7 +9,7 @@
 from torch import rand, randn, Tensor
 from torch.distributed._tensor import DeviceMesh, distribute_tensor, Replicate, Shard
 from torch.distributed._tensor.debug import CommDebugMode
-from torch.distributed._tensor.ops.view_ops import (
+from torch.distributed._tensor.ops._view_ops import (
     Broadcast,
     dim_maps,
     Flatten,
diff --git a/torch/distributed/_spmd/batch_dim_utils.py b/torch/distributed/_spmd/batch_dim_utils.py
index 244cc26c55ed4..012b2414eb6c3 100644
--- a/torch/distributed/_spmd/batch_dim_utils.py
+++ b/torch/distributed/_spmd/batch_dim_utils.py
@@ -6,7 +6,7 @@
 import torch.utils._pytree as pytree
 from torch import Tensor
 from torch.distributed._tensor import DeviceMesh, Replicate, Shard
-from torch.distributed._tensor.ops.view_ops import dim_maps, DimSpec, InputDim
+from torch.distributed._tensor.ops._view_ops import dim_maps, DimSpec, InputDim
 from torch.distributed._tensor.placement_types import _Partial, DTensorSpec
 
 
diff --git a/torch/distributed/_spmd/experimental_ops.py b/torch/distributed/_spmd/experimental_ops.py
index f8c8f8804c579..1624c43afa6dc 100644
--- a/torch/distributed/_spmd/experimental_ops.py
+++ b/torch/distributed/_spmd/experimental_ops.py
@@ -5,7 +5,7 @@
 import torch
 
 from torch.distributed._tensor._op_schema import OpSchema, OutputSharding
-from torch.distributed._tensor.ops.common_rules import pointwise_rule
+from torch.distributed._tensor.ops._common_rules import pointwise_rule
 from torch.distributed._tensor.ops.utils import register_prop_rule
 from torch.distributed._tensor.placement_types import (
     _Partial,
diff --git a/torch/distributed/_tensor/ops/__init__.py b/torch/distributed/_tensor/ops/__init__.py
index eaccc8aa8d3f6..dec4665b1c8b9 100644
--- a/torch/distributed/_tensor/ops/__init__.py
+++ b/torch/distributed/_tensor/ops/__init__.py
@@ -1,10 +1,10 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
-from .conv_ops import * # noqa: F403
-from .embedding_ops import * # noqa: F403
-from .experimental_ops import * # noqa: F403
-from .math_ops import * # noqa: F403
-from .matrix_ops import * # noqa: F403
-from .pointwise_ops import * # noqa: F403
-from .random_ops import * # noqa: F403
-from .tensor_ops import * # noqa: F403
-from .view_ops import * # noqa: F403
+from ._conv_ops import * # noqa: F403
+from ._embedding_ops import * # noqa: F403
+from ._experimental_ops import * # noqa: F403
+from ._math_ops import * # noqa: F403
+from ._matrix_ops import * # noqa: F403
+from ._pointwise_ops import * # noqa: F403
+from ._random_ops import * # noqa: F403
+from ._tensor_ops import * # noqa: F403
+from ._view_ops import * # noqa: F403
diff --git a/torch/distributed/_tensor/ops/common_rules.py b/torch/distributed/_tensor/ops/_common_rules.py
similarity index 100%
rename from torch/distributed/_tensor/ops/common_rules.py
rename to torch/distributed/_tensor/ops/_common_rules.py
diff --git a/torch/distributed/_tensor/ops/conv_ops.py b/torch/distributed/_tensor/ops/_conv_ops.py
similarity index 100%
rename from torch/distributed/_tensor/ops/conv_ops.py
rename to torch/distributed/_tensor/ops/_conv_ops.py
diff --git a/torch/distributed/_tensor/ops/basic_strategy.py b/torch/distributed/_tensor/ops/_einsum_strategy.py
similarity index 100%
rename from torch/distributed/_tensor/ops/basic_strategy.py
rename to torch/distributed/_tensor/ops/_einsum_strategy.py
diff --git a/torch/distributed/_tensor/ops/embedding_ops.py b/torch/distributed/_tensor/ops/_embedding_ops.py
similarity index 100%
rename from torch/distributed/_tensor/ops/embedding_ops.py
rename to torch/distributed/_tensor/ops/_embedding_ops.py
diff --git a/torch/distributed/_tensor/ops/experimental_ops.py b/torch/distributed/_tensor/ops/_experimental_ops.py
similarity index 100%
rename from torch/distributed/_tensor/ops/experimental_ops.py
rename to torch/distributed/_tensor/ops/_experimental_ops.py
diff --git a/torch/distributed/_tensor/ops/math_ops.py b/torch/distributed/_tensor/ops/_math_ops.py
similarity index 100%
rename from torch/distributed/_tensor/ops/math_ops.py
rename to torch/distributed/_tensor/ops/_math_ops.py
diff --git a/torch/distributed/_tensor/ops/matrix_ops.py b/torch/distributed/_tensor/ops/_matrix_ops.py
similarity index 99%
rename from torch/distributed/_tensor/ops/matrix_ops.py
rename to torch/distributed/_tensor/ops/_matrix_ops.py
index 2815d14d9490a..8b919254f116c 100644
--- a/torch/distributed/_tensor/ops/matrix_ops.py
+++ b/torch/distributed/_tensor/ops/_matrix_ops.py
@@ -9,7 +9,7 @@
     PlacementList,
     PlacementStrategy,
 )
-from torch.distributed._tensor.ops.basic_strategy import gen_einsum_strategies
+from torch.distributed._tensor.ops._einsum_strategy import gen_einsum_strategies
 from torch.distributed._tensor.ops.utils import (
     expand_to_full_mesh_op_strategy,
     generate_redistribute_costs,
diff --git a/torch/distributed/_tensor/ops/pointwise_ops.py b/torch/distributed/_tensor/ops/_pointwise_ops.py
similarity index 100%
rename from torch/distributed/_tensor/ops/pointwise_ops.py
rename to torch/distributed/_tensor/ops/_pointwise_ops.py
diff --git a/torch/distributed/_tensor/ops/random_ops.py b/torch/distributed/_tensor/ops/_random_ops.py
similarity index 100%
rename from torch/distributed/_tensor/ops/random_ops.py
rename to torch/distributed/_tensor/ops/_random_ops.py
diff --git a/torch/distributed/_tensor/ops/tensor_ops.py b/torch/distributed/_tensor/ops/_tensor_ops.py
similarity index 99%
rename from torch/distributed/_tensor/ops/tensor_ops.py
rename to torch/distributed/_tensor/ops/_tensor_ops.py
index e87fbb53c7c6b..223ff0674ec6e 100644
--- a/torch/distributed/_tensor/ops/tensor_ops.py
+++ b/torch/distributed/_tensor/ops/_tensor_ops.py
@@ -15,8 +15,8 @@
     StrategyType,
     TupleStrategy,
 )
-from torch.distributed._tensor.ops.common_rules import pointwise_rule
-from torch.distributed._tensor.ops.embedding_ops import _MaskPartial
+from torch.distributed._tensor.ops._common_rules import pointwise_rule
+from torch.distributed._tensor.ops._embedding_ops import _MaskPartial
 from torch.distributed._tensor.ops.utils import (
     expand_to_full_mesh_op_strategy,
     is_tensor_dim_sharded,
diff --git a/torch/distributed/_tensor/ops/view_ops.py b/torch/distributed/_tensor/ops/_view_ops.py
similarity index 100%
rename from torch/distributed/_tensor/ops/view_ops.py
rename to torch/distributed/_tensor/ops/_view_ops.py
diff --git a/torch/distributed/tensor/parallel/loss.py b/torch/distributed/tensor/parallel/loss.py
index 82295c4f4e4f2..ead6ccaea889f 100644
--- a/torch/distributed/tensor/parallel/loss.py
+++ b/torch/distributed/tensor/parallel/loss.py
@@ -9,8 +9,8 @@
 import torch.distributed.distributed_c10d as c10d
 from torch import Tensor
 from torch.distributed._tensor import DTensor, Replicate, Shard
-from torch.distributed._tensor.ops.embedding_ops import _MaskPartial
-from torch.distributed._tensor.ops.math_ops import (
+from torch.distributed._tensor.ops._embedding_ops import _MaskPartial
+from torch.distributed._tensor.ops._math_ops import (
    _skip_dim,
     Reduction,
     replicate_reduction_dims,
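
For reference, downstream code that imported these modules by their old public paths needs to switch to the underscore-prefixed names. Below is a minimal sketch, not part of the patch itself, showing the updated import for one symbol touched in the hunks above (_MaskPartial), with a fallback for PyTorch builds that predate this rename:

# Sketch (not part of the patch): prefer the new private module name from
# this change, falling back to the old public path on older PyTorch builds.
try:
    from torch.distributed._tensor.ops._embedding_ops import _MaskPartial
except ImportError:
    from torch.distributed._tensor.ops.embedding_ops import _MaskPartial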