From 71495968d51720a79dd1487be04b7ffea5d51ee1 Mon Sep 17 00:00:00 2001
From: Heungsub Hans Lee <heungsub.lee@kakaobrain.com>
Date: Fri, 29 Nov 2019 18:53:30 +0900
Subject: [PATCH] Proofread docs

---
 docs/api.rst                   |  8 ++++++++
 docs/guide.rst                 | 17 +++++++++--------
 torchgpipe/balance/__init__.py |  8 ++++----
 torchgpipe/gpipe.py            |  2 +-
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/docs/api.rst b/docs/api.rst
index 732b435..c77d28d 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -4,6 +4,8 @@ API
 GPipe Module
 ~~~~~~~~~~~~
 
+.. py:module:: torchgpipe
+
 .. autoclass:: torchgpipe.GPipe(module, balance, \**kwargs)
 
    .. automethod:: forward(input)
@@ -23,6 +25,8 @@ GPipe Module
 Skip Connections
 ~~~~~~~~~~~~~~~~
 
+.. py:module:: torchgpipe.skip
+
 .. autodecorator:: torchgpipe.skip.skippable([stash], [pop])
 
    .. automethod:: torchgpipe.skip.skippable.Skippable.isolate(ns, [only=names])
@@ -42,9 +46,13 @@ Inspecting GPipe Timeline
 
 .. autofunction:: torchgpipe.is_recomputing()
 
+.. _torchgpipe.balance:
+
 Automatic Balancing
 ~~~~~~~~~~~~~~~~~~~
 
+.. py:module:: torchgpipe.balance
+
 .. autofunction:: torchgpipe.balance.balance_by_time(partitions, module, sample, timeout=1.0, device=torch.device('cuda'))
 
 .. autofunction:: torchgpipe.balance.balance_by_size(partitions, module, input, chunks=1, param_scale=2.0, device=torch.device('cuda'))
diff --git a/docs/guide.rst b/docs/guide.rst
index 746827b..db1183b 100644
--- a/docs/guide.rst
+++ b/docs/guide.rst
@@ -179,13 +179,14 @@ Checkpointing drastically helps to reduce memory usage, but the overall
 training would slow down by about 25%. You can handle how to apply
 checkpointing on your model. There are three options:
 
-- ``always`` -- Apply checkpointing over all micro-batches.
-- ``except_last`` (default) -- Apply checkpointing except the last micro-batch.
-- ``never`` -- Checkpointing is never applied.
+- ``'always'`` -- Apply checkpointing over all micro-batches.
+- ``'except_last'`` (default) -- Apply checkpointing over all micro-batches
+  except the last one.
+- ``'never'`` -- Checkpointing is never applied.
 
 Usually, checkpointing at the last micro-batch may not be useful because the
 saved memory will be reconstructed immediately. That's why we choose
-``except_last`` as the default option.
+``'except_last'`` as the default option.
 
 If you decide not to use checkpointing at all, :class:`nn.DataParallel
 <torch.nn.DataParallel>` might be more efficient than GPipe.
@@ -240,7 +241,7 @@ Sequential:
    a :class:`nn.Sequential <torch.nn.Sequential>` model.
 
    .. _the sequential ResNet example:
-      https://github.com/kakaobrain/torchgpipe/tree/master/examples/resnet
+      https://github.com/kakaobrain/torchgpipe/tree/master/benchmarks/models/resnet
 
    :class:`nn.Sequential <torch.nn.Sequential>` assumes that every underlying
    layer takes only one argument. Calling ``forward(x)`` on
@@ -258,7 +259,7 @@ Sequential:
 Tensor or Tensors:
    As we discussed above, each layer must take only one argument due to
    :class:`nn.Sequential <torch.nn.Sequential>`. There is one more restriction.
-   Every underlying layers' input and output must be ``Tensor`` or
+   Every underlying layer's input and output must be :class:`~torch.Tensor` or
    ``Tuple[Tensor, ...]``::
 
       # OK
@@ -435,8 +436,8 @@ multiple skip tensors. However, there are restrictions:
 
 Then, how can we instantiate multiple skippable modules from the same class in
 a sequential module? You can isolate some skip names into a
-:class:`~torch.skip.Namespace`. For example, a conceptual U-Net can be designed
-like this. There are 3 pairs of ``Encoder`` and ``Decoder``::
+:class:`~torchgpipe.skip.Namespace`. For example, a conceptual U-Net can be
+designed like this. There are 3 pairs of ``Encoder`` and ``Decoder``::
 
    # 1F. Encoder -------- Decoder -- Segment
    #        \                /
diff --git a/torchgpipe/balance/__init__.py b/torchgpipe/balance/__init__.py
index 2815e8d..5b2a667 100644
--- a/torchgpipe/balance/__init__.py
+++ b/torchgpipe/balance/__init__.py
@@ -66,8 +66,8 @@ def balance_by_time(partitions: int,
             current CUDA device)
 
     Returns:
-        A list of number of layers in each partition. Use it for the
-        ``balance`` parameter of :class:`~torchgpipe.GPipe`.
+        A list of the number of layers in each partition. Use it for the
+        `balance` parameter of :class:`~torchgpipe.GPipe`.
 
     .. note::
         `module` and `sample` must be placed on the same device.
@@ -145,8 +145,8 @@ def balance_by_size(partitions: int,
             device)
 
     Returns:
-        A list of number of layers in each partition. Use it for the
-        ``balance`` parameter of :class:`~torchgpipe.GPipe`.
+        A list of the number of layers in each partition. Use it for the
+        `balance` parameter of :class:`~torchgpipe.GPipe`.
 
     .. note::
         `module` and `input` must be placed on the same CUDA device.
diff --git a/torchgpipe/gpipe.py b/torchgpipe/gpipe.py
index 13bd7ce..02c5660 100644
--- a/torchgpipe/gpipe.py
+++ b/torchgpipe/gpipe.py
@@ -205,7 +205,7 @@ class GPipe(Module):
     chunks: int = 1
 
     #: The checkpoint mode to determine when to enable checkpointing. It is one
-    #: of ``always``, ``except_last``, or ``never``.
+    #: of ``'always'``, ``'except_last'``, or ``'never'``.
     checkpoint: str = 'except_last'
 
     def __init__(self,