From 71495968d51720a79dd1487be04b7ffea5d51ee1 Mon Sep 17 00:00:00 2001 From: Heungsub Hans Lee Date: Fri, 29 Nov 2019 18:53:30 +0900 Subject: [PATCH] Proofread docs --- docs/api.rst | 8 ++++++++ docs/guide.rst | 17 +++++++++-------- torchgpipe/balance/__init__.py | 8 ++++---- torchgpipe/gpipe.py | 2 +- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 732b435..c77d28d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -4,6 +4,8 @@ API GPipe Module ~~~~~~~~~~~~ +.. py:module:: torchgpipe + .. autoclass:: torchgpipe.GPipe(module, balance, \**kwargs) .. automethod:: forward(input) @@ -23,6 +25,8 @@ GPipe Module Skip Connections ~~~~~~~~~~~~~~~~ +.. py:module:: torchgpipe.skip + .. autodecorator:: torchgpipe.skip.skippable([stash], [pop]) .. automethod:: torchgpipe.skip.skippable.Skippable.isolate(ns, [only=names]) @@ -42,9 +46,13 @@ Inspecting GPipe Timeline .. autofunction:: torchgpipe.is_recomputing() +.. _torchgpipe.balance: + Automatic Balancing ~~~~~~~~~~~~~~~~~~~ +.. py:module:: torchgpipe.balance + .. autofunction:: torchgpipe.balance.balance_by_time(partitions, module, sample, timeout=1.0, device=torch.device('cuda')) .. autofunction:: torchgpipe.balance.balance_by_size(partitions, module, input, chunks=1, param_scale=2.0, device=torch.device('cuda')) diff --git a/docs/guide.rst b/docs/guide.rst index 746827b..db1183b 100644 --- a/docs/guide.rst +++ b/docs/guide.rst @@ -179,13 +179,14 @@ Checkpointing drastically helps to reduce memory usage, but the overall training would slow down by about 25%. You can handle how to apply checkpointing on your model. There are three options: -- ``always`` -- Apply checkpointing over all micro-batches. -- ``except_last`` (default) -- Apply checkpointing except the last micro-batch. -- ``never`` -- Checkpointing is never applied. +- ``'always'`` -- Apply checkpointing over all micro-batches. +- ``'except_last'`` (default) -- Apply checkpointing except the last + micro-batch. 
+- ``'never'`` -- Checkpointing is never applied. Usually, checkpointing at the last micro-batch may not be useful because the saved memory will be reconstructed immediately. That's why we choose -``except_last`` as the default option. +``'except_last'`` as the default option. If you decide not to use checkpointing at all, :class:`nn.DataParallel <torch.nn.DataParallel>` might be more efficient than GPipe. @@ -240,7 +241,7 @@ Sequential: a :class:`nn.Sequential <torch.nn.Sequential>` model. .. _the sequential ResNet example: - https://github.com/kakaobrain/torchgpipe/tree/master/examples/resnet + https://github.com/kakaobrain/torchgpipe/tree/master/benchmarks/models/resnet :class:`nn.Sequential <torch.nn.Sequential>` assumes that every underlying layer takes only one argument. Calling ``forward(x)`` on @@ -258,7 +259,7 @@ Sequential: Tensor or Tensors: As we discussed above, each layer must take only one argument due to :class:`nn.Sequential <torch.nn.Sequential>`. There is one more restriction. - Every underlying layers' input and output must be ``Tensor`` or + Every underlying layers' input and output must be :class:`~torch.Tensor` or ``Tuple[Tensor, ...]``:: # OK @@ -435,8 +436,8 @@ multiple skip tensors. However, there are restrictions: Then, how can we instantiate multiple skippable modules from the same class in a sequential module? You can isolate some skip names into a -:class:`~torch.skip.Namespace`. For example, a conceptual U-Net can be designed -like this. There are 3 pairs of ``Encoder`` and ``Decoder``:: +:class:`~torchgpipe.skip.Namespace`. For example, a conceptual U-Net can be +designed like this. There are 3 pairs of ``Encoder`` and ``Decoder``:: # 1F. Encoder -------- Decoder -- Segment # \ / diff --git a/torchgpipe/balance/__init__.py b/torchgpipe/balance/__init__.py index 2815e8d..5b2a667 100644 --- a/torchgpipe/balance/__init__.py +++ b/torchgpipe/balance/__init__.py @@ -66,8 +66,8 @@ def balance_by_time(partitions: int, current CUDA device) Returns: - A list of number of layers in each partition. 
Use it for the - ``balance`` parameter of :class:`~torchgpipe.GPipe`. + A list of number of layers in each partition. Use it for the `balance` + parameter of :class:`~torchgpipe.GPipe`. .. note:: `module` and `sample` must be placed on the same device. @@ -145,8 +145,8 @@ def balance_by_size(partitions: int, device) Returns: - A list of number of layers in each partition. Use it for the - ``balance`` parameter of :class:`~torchgpipe.GPipe`. + A list of number of layers in each partition. Use it for the `balance` + parameter of :class:`~torchgpipe.GPipe`. .. note:: `module` and `input` must be placed on the same CUDA device. diff --git a/torchgpipe/gpipe.py b/torchgpipe/gpipe.py index 13bd7ce..02c5660 100644 --- a/torchgpipe/gpipe.py +++ b/torchgpipe/gpipe.py @@ -205,7 +205,7 @@ class GPipe(Module): chunks: int = 1 #: The checkpoint mode to determine when to enable checkpointing. It is one - #: of ``always``, ``except_last``, or ``never``. + #: of ``'always'``, ``'except_last'``, or ``'never'``. checkpoint: str = 'except_last' def __init__(self,