Disable certain pt2 tests for oss ci (pytorch#1917)
Summary:

The errors in OSS CI look like this:
```

2024-04-17T23:58:47.8254890Z self = <torchrec.distributed.tests.test_infer_shardings.InferShardingsTest testMethod=test_sharded_quant_fp_ebc_tw_meta>
2024-04-17T23:58:47.8254896Z 
2024-04-17T23:58:47.8255166Z     def test_sharded_quant_fp_ebc_tw_meta(self) -> None:
2024-04-17T23:58:47.8255446Z         # Simulate inference, take unsharded cpu model and shard on meta
2024-04-17T23:58:47.8255727Z         # Use PositionWeightedModuleCollection, FP used in production
2024-04-17T23:58:47.8255813Z     
2024-04-17T23:58:47.8255935Z         num_embeddings = 10
2024-04-17T23:58:47.8256032Z         emb_dim = 16
2024-04-17T23:58:47.8256130Z         world_size = 2
2024-04-17T23:58:47.8256241Z         batch_size = 2
2024-04-17T23:58:47.8256381Z         local_device = torch.device("cpu")
2024-04-17T23:58:47.8256466Z     
2024-04-17T23:58:47.8256789Z         topology: Topology = Topology(world_size=world_size, compute_device="cpu")
2024-04-17T23:58:47.8256906Z         mi = TestModelInfo(
2024-04-17T23:58:47.8257040Z             dense_device=local_device,
2024-04-17T23:58:47.8257220Z             sparse_device=local_device,
2024-04-17T23:58:47.8257324Z             num_features=2,
2024-04-17T23:58:47.8257450Z             num_float_features=10,
2024-04-17T23:58:47.8257568Z             num_weighted_features=0,
2024-04-17T23:58:47.8257678Z             topology=topology,
2024-04-17T23:58:47.8257777Z         )
2024-04-17T23:58:47.8257947Z         mi.planner = EmbeddingShardingPlanner(
2024-04-17T23:58:47.8258055Z             topology=topology,
2024-04-17T23:58:47.8258180Z             batch_size=batch_size,
2024-04-17T23:58:47.8258324Z             enumerator=EmbeddingEnumerator(
2024-04-17T23:58:47.8258435Z                 topology=topology,
2024-04-17T23:58:47.8258565Z                 batch_size=batch_size,
2024-04-17T23:58:47.8258670Z                 estimator=[
2024-04-17T23:58:47.8258953Z                     EmbeddingPerfEstimator(topology=topology, is_inference=True),
2024-04-17T23:58:47.8259182Z                     EmbeddingStorageEstimator(topology=topology),
2024-04-17T23:58:47.8259280Z                 ],
2024-04-17T23:58:47.8259369Z             ),
2024-04-17T23:58:47.8259471Z         )
2024-04-17T23:58:47.8259557Z     
2024-04-17T23:58:47.8259659Z         mi.tables = [
2024-04-17T23:58:47.8259790Z             EmbeddingBagConfig(
2024-04-17T23:58:47.8259933Z                 num_embeddings=num_embeddings,
2024-04-17T23:58:47.8260068Z                 embedding_dim=emb_dim,
2024-04-17T23:58:47.8260178Z                 name=f"table_{i}",
2024-04-17T23:58:47.8260322Z                 feature_names=[f"feature_{i}"],
2024-04-17T23:58:47.8260424Z             )
2024-04-17T23:58:47.8260561Z             for i in range(mi.num_features)
2024-04-17T23:58:47.8260649Z         ]
2024-04-17T23:58:47.8260748Z     
2024-04-17T23:58:47.8260903Z         max_feature_lengths = {"feature_0": 20}
2024-04-17T23:58:47.8260988Z     
2024-04-17T23:58:47.8261235Z         mi.model = KJTInputWrapper(
2024-04-17T23:58:47.8261396Z             module_kjt_input=torch.nn.Sequential(
2024-04-17T23:58:47.8261606Z                 FeatureProcessedEmbeddingBagCollection(
2024-04-17T23:58:47.8261757Z                     EmbeddingBagCollection(
2024-04-17T23:58:47.8261876Z                         tables=mi.tables,
2024-04-17T23:58:47.8261995Z                         is_weighted=True,
2024-04-17T23:58:47.8262148Z                         device=mi.sparse_device,
2024-04-17T23:58:47.8262243Z                     ),
2024-04-17T23:58:47.8262433Z                     PositionWeightedModuleCollection(
2024-04-17T23:58:47.8262624Z                         max_feature_lengths=max_feature_lengths,
2024-04-17T23:58:47.8262764Z                         device=mi.sparse_device,
2024-04-17T23:58:47.8262871Z                     ),
2024-04-17T23:58:47.8262961Z                 )
2024-04-17T23:58:47.8263052Z             )
2024-04-17T23:58:47.8263155Z         )
2024-04-17T23:58:47.8263438Z         model_inputs: List[ModelInput] = prep_inputs(
2024-04-17T23:58:47.8263659Z             mi, world_size, batch_size, long_indices=False, count=1
2024-04-17T23:58:47.8263766Z         )
2024-04-17T23:58:47.8263864Z         inputs = []
2024-04-17T23:58:47.8264009Z         kjt = model_inputs[0].idlist_features
2024-04-17T23:58:47.8264140Z         kjt = kjt.to(local_device)
2024-04-17T23:58:47.8264254Z         weights = torch.rand(
2024-04-17T23:58:47.8264495Z             kjt._values.size(0), dtype=torch.float, device=local_device
2024-04-17T23:58:47.8264599Z         )
2024-04-17T23:58:47.8264684Z     
2024-04-17T23:58:47.8264780Z         inputs = [
2024-04-17T23:58:47.8264888Z             kjt._keys,
2024-04-17T23:58:47.8264986Z             kjt._values,
2024-04-17T23:58:47.8265079Z             weights,
2024-04-17T23:58:47.8265189Z             kjt._lengths,
2024-04-17T23:58:47.8265286Z             kjt._offsets,
2024-04-17T23:58:47.8265375Z         ]
2024-04-17T23:58:47.8265472Z     
2024-04-17T23:58:47.8265575Z         mi.model(*inputs)
2024-04-17T23:58:47.8265700Z         print(f"model:\n{mi.model}")
2024-04-17T23:58:47.8265858Z     
2024-04-17T23:58:47.8265993Z         mi.quant_model = quantize_fpebc(
2024-04-17T23:58:47.8266103Z             module=mi.model,
2024-04-17T23:58:47.8266219Z             inplace=False,
2024-04-17T23:58:47.8266329Z             register_tbes=True,
2024-04-17T23:58:47.8266504Z             quant_state_dict_split_scale_bias=False,
2024-04-17T23:58:47.8266621Z             weight_dtype=torch.int8,
2024-04-17T23:58:47.8266708Z         )
2024-04-17T23:58:47.8266839Z         quant_model = mi.quant_model
2024-04-17T23:58:47.8266982Z         print(f"quant_model:\n{quant_model}")
2024-04-17T23:58:47.8267067Z     
2024-04-17T23:58:47.8267392Z         topology: Topology = Topology(world_size=world_size, compute_device="cuda")
2024-04-17T23:58:47.8267560Z         mi.planner = EmbeddingShardingPlanner(
2024-04-17T23:58:47.8267671Z             topology=topology,
2024-04-17T23:58:47.8267793Z             batch_size=batch_size,
2024-04-17T23:58:47.8267942Z             enumerator=EmbeddingEnumerator(
2024-04-17T23:58:47.8268059Z                 topology=topology,
2024-04-17T23:58:47.8268187Z                 batch_size=batch_size,
2024-04-17T23:58:47.8268290Z                 estimator=[
2024-04-17T23:58:47.8268571Z                     EmbeddingPerfEstimator(topology=topology, is_inference=True),
2024-04-17T23:58:47.8268793Z                     EmbeddingStorageEstimator(topology=topology),
2024-04-17T23:58:47.8268884Z                 ],
2024-04-17T23:58:47.8268976Z             ),
2024-04-17T23:58:47.8269076Z         )
2024-04-17T23:58:47.8269396Z         sharder = QuantFeatureProcessedEmbeddingBagCollectionSharder()
2024-04-17T23:58:47.8269557Z         # pyre-ignore
2024-04-17T23:58:47.8269677Z         plan = mi.planner.plan(
2024-04-17T23:58:47.8269780Z             mi.quant_model,
2024-04-17T23:58:47.8269888Z             [sharder],
2024-04-17T23:58:47.8269976Z         )
2024-04-17T23:58:47.8270061Z     
2024-04-17T23:58:47.8270206Z >       sharded_model = _shard_modules(
2024-04-17T23:58:47.8270314Z             module=quant_model,
2024-04-17T23:58:47.8270452Z             # pyre-ignore
2024-04-17T23:58:47.8270575Z             sharders=[sharder],
2024-04-17T23:58:47.8270925Z             # shard on meta to simulate device movement from cpu -> meta QFPEBC
2024-04-17T23:58:47.8271057Z             device=torch.device("meta"),
2024-04-17T23:58:47.8271170Z             plan=plan,
2024-04-17T23:58:47.8271303Z             # pyre-ignore
2024-04-17T23:58:47.8271610Z             env=ShardingEnv.from_local(world_size=mi.topology.world_size, rank=0),
2024-04-17T23:58:47.8271709Z         )
2024-04-17T23:58:47.8271715Z 
2024-04-17T23:58:47.8271947Z torchrec/distributed/tests/test_infer_shardings.py:1990: 
2024-04-17T23:58:47.8272194Z _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
2024-04-17T23:58:47.8272389Z torchrec/distributed/shard.py:282: in _shard_modules
2024-04-17T23:58:47.8272491Z     _replace(module)
2024-04-17T23:58:47.8272727Z torchrec/distributed/shard.py:280: in _replace
2024-04-17T23:58:47.8272849Z     _replace(child, child_path)
2024-04-17T23:58:47.8273013Z torchrec/distributed/shard.py:272: in _replace
2024-04-17T23:58:47.8273207Z     sharded_module = sharder_map[type(child)].shard(
2024-04-17T23:58:47.8273423Z torchrec/distributed/quant_embeddingbag.py:479: in shard
2024-04-17T23:58:47.8273722Z     return ShardedQuantFeatureProcessedEmbeddingBagCollection(
2024-04-17T23:58:47.8273965Z torchrec/distributed/quant_embeddingbag.py:412: in __init__
2024-04-17T23:58:47.8274072Z     else copy_to_device(
2024-04-17T23:58:47.8274265Z torchrec/distributed/utils.py:228: in copy_to_device
2024-04-17T23:58:47.8274415Z     with sharded_model_copy(device=None):
2024-04-17T23:58:47.8274647Z _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
2024-04-17T23:58:47.8274652Z 
2024-04-17T23:58:47.8275021Z self = <torchrec.distributed.utils.sharded_model_copy object at 0x7fe46c8aad10>
2024-04-17T23:58:47.8275026Z 
2024-04-17T23:58:47.8275185Z     def __enter__(self) -> None:
2024-04-17T23:58:47.8275385Z         # pyre-ignore [16]
2024-04-17T23:58:47.8275577Z         self.t_copy_save_ = torch.Tensor.__deepcopy__
2024-04-17T23:58:47.8275717Z         # pyre-ignore [16]
2024-04-17T23:58:47.8275922Z         self.p_copy_save_ = torch.nn.Parameter.__deepcopy__
2024-04-17T23:58:47.8276020Z     
2024-04-17T23:58:47.8276128Z         device = self.device
2024-04-17T23:58:47.8276213Z     
2024-04-17T23:58:47.8276381Z         # pyre-ignore [2, 3, 53]
2024-04-17T23:58:47.8276508Z         def _tensor_copy(tensor, memo):
2024-04-17T23:58:47.8276633Z             if tensor.device != device:
2024-04-17T23:58:47.8276901Z                 return tensor.detach().to(device)
2024-04-17T23:58:47.8276994Z             else:
2024-04-17T23:58:47.8277135Z                 return tensor.detach().clone()
2024-04-17T23:58:47.8277232Z     
2024-04-17T23:58:47.8277378Z         # pyre-ignore [2, 3]
2024-04-17T23:58:47.8277497Z         def _no_copy(obj, memo):
2024-04-17T23:58:47.8277607Z             return obj
2024-04-17T23:58:47.8277695Z     
2024-04-17T23:58:47.8277981Z         _copy_or_not = _tensor_copy if self.device is not None else _no_copy
2024-04-17T23:58:47.8278067Z     
2024-04-17T23:58:47.8278213Z         # pyre-ignore [2, 3, 53]
2024-04-17T23:58:47.8278353Z         def _param_copy(param, memo):
2024-04-17T23:58:47.8278483Z             return torch.nn.Parameter(
2024-04-17T23:58:47.8278725Z                 _copy_or_not(param, memo), requires_grad=param.requires_grad
2024-04-17T23:58:47.8278829Z             )
2024-04-17T23:58:47.8278912Z     
2024-04-17T23:58:47.8279069Z         torch.Tensor.__deepcopy__ = _copy_or_not
2024-04-17T23:58:47.8279265Z         torch.nn.Parameter.__deepcopy__ = _param_copy
2024-04-17T23:58:47.8279540Z         # pyre-fixme[16]: `Type` has no attribute `__deepcopy__`.
2024-04-17T23:58:47.8279822Z >       torch._C._distributed_c10d.ProcessGroupNCCL.__deepcopy__ = _no_copy
2024-04-17T23:58:47.8280309Z E       AttributeError: module 'torch._C._distributed_c10d' has no attribute 'ProcessGroupNCCL'
```
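
The traceback shows the failure coming from `sharded_model_copy.__enter__` in `torchrec/distributed/utils.py`, which patches `torch._C._distributed_c10d.ProcessGroupNCCL.__deepcopy__`; a CPU-only OSS build of PyTorch does not expose `ProcessGroupNCCL`, so the patch raises the `AttributeError` above. The fix guards the affected tests with `unittest.skipIf` on GPU availability, matching guards already used elsewhere in these test files. A minimal sketch of the guard pattern (the class name and test body below are placeholders, not the actual tests):

```
import unittest

import torch


class ShardingTestGuardSketch(unittest.TestCase):
    # Skip on OSS CI hosts without enough GPUs, where the CPU-only PyTorch
    # build does not ship torch._C._distributed_c10d.ProcessGroupNCCL.
    @unittest.skipIf(
        torch.cuda.device_count() <= 1,
        "Not enough GPUs available",
    )
    def test_sharded_quant_fp_ebc_tw_meta(self) -> None:
        ...  # real test body lives in torchrec/distributed/tests/test_infer_shardings.py
```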

Differential Revision: D56453015
henrylhtsang authored and facebook-github-bot committed Apr 23, 2024
1 parent 19565ec commit 749c81c
Showing 3 changed files with 17 additions and 0 deletions.
5 changes: 5 additions & 0 deletions torchrec/distributed/tests/test_infer_shardings.py
```
@@ -1877,6 +1877,11 @@ def test_sharded_quant_fp_ebc_tw(self, weight_dtype: torch.dtype) -> None:
         gm_script_output = gm_script(*inputs[0])
         assert_close(sharded_output, gm_script_output)
 
+    # pyre-ignore
+    @unittest.skipIf(
+        torch.cuda.device_count() <= 1,
+        "Not enough GPUs available",
+    )
     def test_sharded_quant_fp_ebc_tw_meta(self) -> None:
         # Simulate inference, take unsharded cpu model and shard on meta
         # Use PositionWeightedModuleCollection, FP used in production
```
5 changes: 5 additions & 0 deletions torchrec/distributed/tests/test_pt2.py
```
@@ -311,6 +311,11 @@ def test_sharded_quant_ebc_non_strict_export(self) -> None:
         # TODO: Fix Unflatten
         # torch.export.unflatten(ep)
 
+    # pyre-ignore
+    @unittest.skipIf(
+        torch.cuda.device_count() <= 1,
+        "Not enough GPUs available",
+    )
     def test_sharded_quant_fpebc_non_strict_export(self) -> None:
         sharded_model, input_kjts = _sharded_quant_ebc_model(
             local_device="cpu", compute_device="cpu", feature_processor=True
```
7 changes: 7 additions & 0 deletions torchrec/modules/tests/test_feature_processor_.py
```
@@ -61,6 +61,7 @@ def test_populate_weights(self) -> None:
             weighted_features.lengths(), weighted_features_gm_script.lengths()
         )
 
+    # TODO: this test is not being run
     # pyre-ignore
     @unittest.skipIf(
         torch.cuda.device_count() <= 0,
@@ -132,6 +133,7 @@ def test_populate_weights(self) -> None:
             empty_fp_kjt.length_per_key(), empty_fp_kjt_gm_script.length_per_key()
         )
 
+    # TODO: this test is not being run
     # pyre-ignore
     @unittest.skipIf(
         torch.cuda.device_count() <= 0,
@@ -151,6 +153,11 @@ def test_rematerialize_from_meta(self) -> None:
             self.assertTrue(pwmc.position_weights_dict[key] is param)
             torch.testing.assert_close(param, torch.ones_like(param))
 
+    # pyre-ignore
+    @unittest.skipIf(
+        torch.cuda.device_count() <= 0,
+        "Not enough GPUs available",
+    )
     def test_copy(self) -> None:
         pwmc = PositionWeightedModuleCollection(
             max_feature_lengths={"f1": 10, "f2": 10},
```
