Disable certain pt2 tests for oss ci (pytorch#1917)
Summary: Errors are like this:

```
self = <torchrec.distributed.tests.test_infer_shardings.InferShardingsTest testMethod=test_sharded_quant_fp_ebc_tw_meta>

    def test_sharded_quant_fp_ebc_tw_meta(self) -> None:
        # Simulate inference, take unsharded cpu model and shard on meta
        # Use PositionWeightedModuleCollection, FP used in production

        num_embeddings = 10
        emb_dim = 16
        world_size = 2
        batch_size = 2
        local_device = torch.device("cpu")

        topology: Topology = Topology(world_size=world_size, compute_device="cpu")
        mi = TestModelInfo(
            dense_device=local_device,
            sparse_device=local_device,
            num_features=2,
            num_float_features=10,
            num_weighted_features=0,
            topology=topology,
        )
        mi.planner = EmbeddingShardingPlanner(
            topology=topology,
            batch_size=batch_size,
            enumerator=EmbeddingEnumerator(
                topology=topology,
                batch_size=batch_size,
                estimator=[
                    EmbeddingPerfEstimator(topology=topology, is_inference=True),
                    EmbeddingStorageEstimator(topology=topology),
                ],
            ),
        )

        mi.tables = [
            EmbeddingBagConfig(
                num_embeddings=num_embeddings,
                embedding_dim=emb_dim,
                name=f"table_{i}",
                feature_names=[f"feature_{i}"],
            )
            for i in range(mi.num_features)
        ]

        max_feature_lengths = {"feature_0": 20}

        mi.model = KJTInputWrapper(
            module_kjt_input=torch.nn.Sequential(
                FeatureProcessedEmbeddingBagCollection(
                    EmbeddingBagCollection(
                        tables=mi.tables,
                        is_weighted=True,
                        device=mi.sparse_device,
                    ),
                    PositionWeightedModuleCollection(
                        max_feature_lengths=max_feature_lengths,
                        device=mi.sparse_device,
                    ),
                )
            )
        )
        model_inputs: List[ModelInput] = prep_inputs(
            mi, world_size, batch_size, long_indices=False, count=1
        )
        inputs = []
        kjt = model_inputs[0].idlist_features
        kjt = kjt.to(local_device)
        weights = torch.rand(
            kjt._values.size(0), dtype=torch.float, device=local_device
        )

        inputs = [
            kjt._keys,
            kjt._values,
            weights,
            kjt._lengths,
            kjt._offsets,
        ]

        mi.model(*inputs)
        print(f"model:\n{mi.model}")

        mi.quant_model = quantize_fpebc(
            module=mi.model,
            inplace=False,
            register_tbes=True,
            quant_state_dict_split_scale_bias=False,
            weight_dtype=torch.int8,
        )
        quant_model = mi.quant_model
        print(f"quant_model:\n{quant_model}")

        topology: Topology = Topology(world_size=world_size, compute_device="cuda")
        mi.planner = EmbeddingShardingPlanner(
            topology=topology,
            batch_size=batch_size,
            enumerator=EmbeddingEnumerator(
                topology=topology,
                batch_size=batch_size,
                estimator=[
                    EmbeddingPerfEstimator(topology=topology, is_inference=True),
                    EmbeddingStorageEstimator(topology=topology),
                ],
            ),
        )
        sharder = QuantFeatureProcessedEmbeddingBagCollectionSharder()
        # pyre-ignore
        plan = mi.planner.plan(
            mi.quant_model,
            [sharder],
        )

>       sharded_model = _shard_modules(
            module=quant_model,
            # pyre-ignore
            sharders=[sharder],
            # shard on meta to simulate device movement from cpu -> meta QFPEBC
            device=torch.device("meta"),
            plan=plan,
            # pyre-ignore
            env=ShardingEnv.from_local(world_size=mi.topology.world_size, rank=0),
        )

torchrec/distributed/tests/test_infer_shardings.py:1990:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
torchrec/distributed/shard.py:282: in _shard_modules
    _replace(module)
torchrec/distributed/shard.py:280: in _replace
    _replace(child, child_path)
torchrec/distributed/shard.py:272: in _replace
    sharded_module = sharder_map[type(child)].shard(
torchrec/distributed/quant_embeddingbag.py:479: in shard
    return ShardedQuantFeatureProcessedEmbeddingBagCollection(
torchrec/distributed/quant_embeddingbag.py:412: in __init__
    else copy_to_device(
torchrec/distributed/utils.py:228: in copy_to_device
    with sharded_model_copy(device=None):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <torchrec.distributed.utils.sharded_model_copy object at 0x7fe46c8aad10>

    def __enter__(self) -> None:
        # pyre-ignore [16]
        self.t_copy_save_ = torch.Tensor.__deepcopy__
        # pyre-ignore [16]
        self.p_copy_save_ = torch.nn.Parameter.__deepcopy__

        device = self.device

        # pyre-ignore [2, 3, 53]
        def _tensor_copy(tensor, memo):
            if tensor.device != device:
                return tensor.detach().to(device)
            else:
                return tensor.detach().clone()

        # pyre-ignore [2, 3]
        def _no_copy(obj, memo):
            return obj

        _copy_or_not = _tensor_copy if self.device is not None else _no_copy

        # pyre-ignore [2, 3, 53]
        def _param_copy(param, memo):
            return torch.nn.Parameter(
                _copy_or_not(param, memo), requires_grad=param.requires_grad
            )

        torch.Tensor.__deepcopy__ = _copy_or_not
        torch.nn.Parameter.__deepcopy__ = _param_copy
        # pyre-fixme[16]: `Type` has no attribute `__deepcopy__`.
>       torch._C._distributed_c10d.ProcessGroupNCCL.__deepcopy__ = _no_copy
E       AttributeError: module 'torch._C._distributed_c10d' has no attribute 'ProcessGroupNCCL'
```

Differential Revision: D56453015
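The root cause visible in the trace: `sharded_model_copy.__enter__` unconditionally patches `torch._C._distributed_c10d.ProcessGroupNCCL.__deepcopy__`, and CPU-only OSS builds of PyTorch ship without NCCL, so the attribute lookup itself raises. Below is a minimal sketch of how such tests can be skipped in OSS CI; the `_HAS_NCCL` flag and the decorator placement are illustrative assumptions, not necessarily the exact mechanism used in this diff:

```
import unittest

import torch.distributed as dist

# Assumed guard: CPU-only OSS wheels are built without NCCL, which is
# exactly the configuration where ProcessGroupNCCL is missing from
# torch._C._distributed_c10d.
_HAS_NCCL = dist.is_available() and dist.is_nccl_available()


class InferShardingsTest(unittest.TestCase):
    @unittest.skipIf(
        not _HAS_NCCL,
        "sharded_model_copy patches ProcessGroupNCCL.__deepcopy__, "
        "which does not exist in CPU-only OSS builds",
    )
    def test_sharded_quant_fp_ebc_tw_meta(self) -> None:
        ...  # test body as in the log above
```

Keying the skip on NCCL availability rather than on CUDA keeps the tests active in internal and GPU CI while skipping only the builds that would hit the `AttributeError`.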