support pp-sharding reshard #9153

Status: Open
Wants to merge 1 commit into base: incubate/paddlenlp-fleety
35 changes: 32 additions & 3 deletions paddlenlp/trainer/utils/reshard/pp_reshard.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 from collections import OrderedDict
 
 from paddle.distributed.fleet.model import PipelineParallel
@@ -46,6 +45,25 @@
     return _GLOBAL_INDEX_LAYER_FUNC
 
 
+_GLOBAL_SNAME_TO_TNAME_FUNC = None
+
+
+def register_sname_to_tname_func(func):
+    global _GLOBAL_SNAME_TO_TNAME_FUNC
+    _GLOBAL_SNAME_TO_TNAME_FUNC = func
+
+
+def has_register_sname_to_tname_func():
+    global _GLOBAL_SNAME_TO_TNAME_FUNC
+    return _GLOBAL_SNAME_TO_TNAME_FUNC is not None
+
+
+def get_sname_to_tname_func():
+    global _GLOBAL_SNAME_TO_TNAME_FUNC
+    assert _GLOBAL_SNAME_TO_TNAME_FUNC is not None, "sname to tname func is not registered yet"
+    return _GLOBAL_SNAME_TO_TNAME_FUNC
+
+
 class LayerNameScope:
     """
     layer name scope for a layer, layer name of the same kind of layer will be named consecutively
@@ -206,6 +224,7 @@
         self._segments = OrderedDict()
         self._layer_to_segment = OrderedDict()
         self._param_to_tname = OrderedDict()
+        self._wname_to_rname = OrderedDict()
 
     def add_segment(self, start_index, end_index):
         segment = PipeLineSegment(start_index, end_index)
@@ -218,19 +237,24 @@
         segment = self._layer_to_segment[layer_index]
         segment.add_layer(layer_name, param_names)
 
-    def build_name_mapping(self):
+    def build_name_mapping(self, sname_to_tname=None):
         for (k, segment) in self._segments.items():
             for (i, layer) in segment.layers.items():
                 for param in layer.params.items():
                     (param_name, tensor_name) = param
                     # map to a new name
                     n_name = self._rename_mgr.get_new_param_name(layer.name, tensor_name)
+                    if sname_to_tname is not None:
+                        if param_name in sname_to_tname.keys():
+                            self._wname_to_rname[param_name] = sname_to_tname[param_name]
                     # logger.info(f"{param_name} {tensor_name}=>{n_name}")
                     self._param_to_tname[param_name] = (tensor_name, n_name)
 
     def map_name(self, param_name, t_name):
         assert param_name in self._param_to_tname
         tensor_name, n_name = self._param_to_tname[param_name]
+        if param_name in self._wname_to_rname:
+            n_name = self._wname_to_rname[param_name]
         assert tensor_name == t_name
         return n_name
 
@@ -261,6 +285,11 @@
         self._index_layers()
 
         stage_segments = self._segment()
+        if has_register_sname_to_tname_func():
+            self._sname_to_tname = get_sname_to_tname_func()(pp_model)
+        else:
+            self._sname_to_tname = None
+
         for (i, stage_seg) in enumerate(stage_segments):
             pipe_stage = PipeLineStage()
             self._stages.append(pipe_stage)
@@ -275,7 +304,7 @@
                 self._layer_name_to_stage[layer_name] = i
 
         for stage in self._stages:
-            stage.build_name_mapping()
+            stage.build_name_mapping(self._sname_to_tname)
 
     def _index_layers(self):
         for layer_name in self._param_names_by_layer.keys():
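
Usage note: the diff adds a module-level registry for a structured-name to target-name (sname to tname) mapping, consumed during pipeline resharding. A minimal sketch of how a caller might register one, assuming this PR's module is installed; `build_sname_to_tname` and the names in the returned dict are hypothetical placeholders, not part of this PR:

```python
from paddlenlp.trainer.utils.reshard.pp_reshard import (
    has_register_sname_to_tname_func,
    register_sname_to_tname_func,
)


def build_sname_to_tname(pp_model):
    # Invoked later as get_sname_to_tname_func()(pp_model); must return a dict
    # mapping a parameter's structured name (sname) to the target name (tname)
    # it should be renamed to. Keys and values here are illustrative only.
    return {"linear_0.w_0": "llama.layers.0.mlp.gate_proj.weight"}


register_sname_to_tname_func(build_sname_to_tname)
assert has_register_sname_to_tname_func()
```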
Expand Down
Loading
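
For review context, the name-resolution order in `map_name` after this change: the auto-generated name from the rename manager is looked up first, and an entry in `_wname_to_rname` (populated from the registered mapping inside `build_name_mapping`) overrides it. A standalone toy restatement of that logic with made-up names, not the real class:

```python
# param name -> (tensor name, auto-generated new name), as built by the rename manager
param_to_tname = {"linear_0.w_0": ("weight", "column_parallel_linear_0.w_0")}
# param name -> user-registered target name (from the sname_to_tname mapping)
wname_to_rname = {"linear_0.w_0": "user_supplied_name"}


def map_name(param_name, t_name):
    tensor_name, n_name = param_to_tname[param_name]
    if param_name in wname_to_rname:
        # A registered sname->tname entry wins over the generated name.
        n_name = wname_to_rname[param_name]
    assert tensor_name == t_name
    return n_name


print(map_name("linear_0.w_0", "weight"))  # -> user_supplied_name
```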