diff --git a/.idea/darts.iml b/.idea/darts.iml
new file mode 100644
index 000000000..a2e06f665
--- /dev/null
+++ b/.idea/darts.iml
@@ -0,0 +1,14 @@
+[IntelliJ module XML (14 lines); content lost in extraction]
diff --git a/.idea/darts_for_nni.iml b/.idea/darts_for_nni.iml
new file mode 100644
index 000000000..bba6157d1
--- /dev/null
+++ b/.idea/darts_for_nni.iml
@@ -0,0 +1,14 @@
+[IntelliJ module XML (14 lines); content lost in extraction]
diff --git a/.idea/deployment.xml b/.idea/deployment.xml
new file mode 100644
index 000000000..61cab2a3d
--- /dev/null
+++ b/.idea/deployment.xml
@@ -0,0 +1,35 @@
+[PyCharm deployment configuration XML (35 lines); content lost in extraction]
diff --git a/.idea/encodings.xml b/.idea/encodings.xml
new file mode 100644
index 000000000..15a15b218
--- /dev/null
+++ b/.idea/encodings.xml
@@ -0,0 +1,4 @@
+[encodings.xml (4 lines); content lost in extraction]
diff --git a/.idea/libraries/R_User_Library.xml b/.idea/libraries/R_User_Library.xml
new file mode 100644
index 000000000..71f5ff749
--- /dev/null
+++ b/.idea/libraries/R_User_Library.xml
@@ -0,0 +1,6 @@
+[R User Library definition XML (6 lines); content lost in extraction]
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 000000000..399908725
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+[misc.xml, project/interpreter settings (7 lines); content lost in extraction]
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 000000000..0b41eb32c
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+[modules.xml, module registry (8 lines); content lost in extraction]
diff --git a/.idea/remote-mappings.xml b/.idea/remote-mappings.xml
new file mode 100644
index 000000000..eac16fab8
--- /dev/null
+++ b/.idea/remote-mappings.xml
@@ -0,0 +1,23 @@
+[remote-mappings.xml, remote path mappings (23 lines); content lost in extraction]
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 000000000..94a25f7f4
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+[vcs.xml, VCS directory mapping (6 lines); content lost in extraction]
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 000000000..17931b223
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,236 @@
+[workspace.xml, IDE workspace state (236 lines); content lost in extraction except search-history entries "tuner_params", "PRIMITIVES", "Network" and task timestamp 1555584030654]
diff --git a/LICENSE b/LICENSE
index c440a8512..50c1a7425 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,204 +1,21 @@
- Copyright (c) 2018, Hanxiao Liu.
- All rights reserved.
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
+MIT License
+
+Copyright (c) 2019 VDeamoV
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 3f711a00d..aa3b50954 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
# Differentiable Architecture Search
-Code accompanying the paper
+
> [DARTS: Differentiable Architecture Search](https://arxiv.org/abs/1806.09055)\
> Hanxiao Liu, Karen Simonyan, Yiming Yang.\
> _arXiv:1806.09055_.
diff --git a/cnn/genotypes.py b/cnn/genotypes.py
index 7849f6ad4..b9396a4b4 100644
--- a/cnn/genotypes.py
+++ b/cnn/genotypes.py
@@ -74,5 +74,5 @@
DARTS_V1 = Genotype(normal=[('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 0), ('sep_conv_3x3', 1), ('skip_connect', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 2)], normal_concat=[2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 0), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('avg_pool_3x3', 0)], reduce_concat=[2, 3, 4, 5])
DARTS_V2 = Genotype(normal=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('skip_connect', 0), ('skip_connect', 0), ('dil_conv_3x3', 2)], normal_concat=[2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 1), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('max_pool_3x3', 1)], reduce_concat=[2, 3, 4, 5])
-DARTS = DARTS_V2
+DARTS = DARTS_V1
diff --git a/cnn/model_search.py b/cnn/model_search.py
index 751e40792..9775c7f03 100644
--- a/cnn/model_search.py
+++ b/cnn/model_search.py
@@ -105,8 +105,12 @@ def forward(self, input):
for i, cell in enumerate(self.cells):
if cell.reduction:
weights = F.softmax(self.alphas_reduce, dim=-1)
+        # w_shape = self.alphas_reduce.shape
+        # weights = F.softmax(self.alphas_reduce.view(-1), dim=0).view(*w_shape)
else:
weights = F.softmax(self.alphas_normal, dim=-1)
+        # w_shape = self.alphas_normal.shape
+        # weights = F.softmax(self.alphas_normal.view(-1), dim=0).view(*w_shape)
s0, s1 = s1, cell(s0, s1, weights)
out = self.global_pooling(s1)
logits = self.classifier(out.view(out.size(0),-1))
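
Review note: the commented-out lines above sketch an alternative where the softmax runs over the flattened alpha matrix instead of per edge (row). A minimal sketch of the difference, assuming the usual DARTS shapes (illustrative, not part of the diff):

```python
# Row-wise vs. flattened softmax over architecture parameters (edges x ops).
import torch
import torch.nn.functional as F

alphas = 1e-3 * torch.randn(14, 8)  # 14 edges, 8 candidate ops (DARTS defaults)

row_wise = F.softmax(alphas, dim=-1)                          # each edge's op weights sum to 1
flat = F.softmax(alphas.view(-1), dim=0).view(*alphas.shape)  # all 14*8 weights sum to 1

print(row_wise.sum(dim=-1))  # tensor of ones
print(flat.sum())            # tensor(1.)
```

In the flattened variant, edges compete with each other for probability mass as well as ops, which changes the search dynamics.
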
diff --git a/cnn/train.py b/cnn/train.py
index ab8f8f825..7841f0e37 100644
--- a/cnn/train.py
+++ b/cnn/train.py
@@ -19,12 +19,12 @@
parser = argparse.ArgumentParser("cifar")
parser.add_argument('--data', type=str, default='../data', help='location of the data corpus')
-parser.add_argument('--batch_size', type=int, default=96, help='batch size')
+parser.add_argument('--batch_size', type=int, default=32, help='batch size')
parser.add_argument('--learning_rate', type=float, default=0.025, help='init learning rate')
parser.add_argument('--momentum', type=float, default=0.9, help='momentum')
parser.add_argument('--weight_decay', type=float, default=3e-4, help='weight decay')
parser.add_argument('--report_freq', type=float, default=50, help='report frequency')
-parser.add_argument('--gpu', type=int, default=0, help='gpu device id')
+parser.add_argument('--gpu', type=int, default=1, help='gpu device id')
parser.add_argument('--epochs', type=int, default=600, help='num of training epochs')
parser.add_argument('--init_channels', type=int, default=36, help='num of init channels')
parser.add_argument('--layers', type=int, default=20, help='total number of layers')
diff --git a/cnn/train_search.py b/cnn/train_search.py
index 067875bfa..1ad5dd066 100644
--- a/cnn/train_search.py
+++ b/cnn/train_search.py
@@ -14,13 +14,14 @@
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
+# noinspection PyUnresolvedReferences
from model_search import Network
from architect import Architect
parser = argparse.ArgumentParser("cifar")
parser.add_argument('--data', type=str, default='../data', help='location of the data corpus')
-parser.add_argument('--batch_size', type=int, default=64, help='batch size')
+parser.add_argument('--batch_size', type=int, default=32, help='batch size')
parser.add_argument('--learning_rate', type=float, default=0.025, help='init learning rate')
parser.add_argument('--learning_rate_min', type=float, default=0.001, help='min learning rate')
parser.add_argument('--momentum', type=float, default=0.9, help='momentum')
@@ -44,7 +45,7 @@
args = parser.parse_args()
args.save = 'search-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
-utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))
+utils.create_exp_dir(args.save)
log_format = '%(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
@@ -74,6 +75,7 @@ def main():
criterion = nn.CrossEntropyLoss()
criterion = criterion.cuda()
model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
+ # utils.load(model, os.path.join('search-EXP-20190323-192622', 'weights.pth'))
model = model.cuda()
logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
@@ -126,7 +128,6 @@ def main():
utils.save(model, os.path.join(args.save, 'weights.pt'))
-
def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr):
objs = utils.AvgrageMeter()
top1 = utils.AvgrageMeter()
@@ -155,14 +156,14 @@ def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr):
optimizer.step()
prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
- objs.update(loss.data[0], n)
- top1.update(prec1.data[0], n)
- top5.update(prec5.data[0], n)
+ objs.update(loss.item(), n)
+ top1.update(prec1.item(), n)
+ top5.update(prec5.item(), n)
if step % args.report_freq == 0:
logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
- return top1.avg, objs.avg
+ return top5.avg, objs.avg
def infer(valid_queue, model, criterion):
@@ -172,24 +173,26 @@ def infer(valid_queue, model, criterion):
model.eval()
for step, (input, target) in enumerate(valid_queue):
- input = Variable(input, volatile=True).cuda()
- target = Variable(target, volatile=True).cuda(async=True)
+ with torch.no_grad():
+ input = Variable(input).cuda()
+ target = Variable(target).cuda(async=True)
logits = model(input)
loss = criterion(logits, target)
prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
n = input.size(0)
- objs.update(loss.data[0], n)
- top1.update(prec1.data[0], n)
- top5.update(prec5.data[0], n)
+ objs.update(loss.item(), n)
+ top1.update(prec1.item(), n)
+ top5.update(prec5.item(), n)
if step % args.report_freq == 0:
logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
- return top1.avg, objs.avg
+ return top5.avg, objs.avg
if __name__ == '__main__':
- main()
+ main()
+
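
Review note: this hunk migrates the old PyTorch idioms (`loss.data[0]`, `volatile=True`) to the 0.4+ API (`loss.item()`, `torch.no_grad()`). Two caveats: as written, only the tensor setup sits inside `no_grad()`, so the forward pass still builds a graph; and from Python 3.7 on, `async` is a reserved word, so `.cuda(async=True)` no longer parses (`non_blocking=True` is the replacement). A sketch of the fully migrated pattern, under those assumptions (not the repo's code):

```python
# Hedged sketch of a PyTorch >= 0.4 validation step.
import torch

def infer_step(model, criterion, inputs, target, device="cuda"):
    model.eval()
    with torch.no_grad():                              # replaces volatile=True
        inputs = inputs.to(device, non_blocking=True)  # replaces .cuda(async=True)
        target = target.to(device, non_blocking=True)
        logits = model(inputs)                         # no autograd graph built here
        loss = criterion(logits, target)
    return loss.item()                                 # replaces loss.data[0]
```
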
diff --git a/custom_tuner.py b/custom_tuner.py
new file mode 100644
index 000000000..4363e10b5
--- /dev/null
+++ b/custom_tuner.py
@@ -0,0 +1,56 @@
+"""
+File: custom_tuner.py
+Author: OccamRazer
+Email: vincent.duan95@outlook.com
+Github: https://github.com/VDeamoV
+Description:
+    Changed the class name for further use
+"""
+from nni.tuner import Tuner
+
+
+class custom_tuner(Tuner):
+ """
+    Custom tuner that passes a fixed configuration to the trial.
+ """
+
+ def __init__(self, model_architecture_path, dataset_path, primitives, output_path):
+ """
+        Users have to define their base architecture here and indicate where
+        the dataset is; these params are essential for the tuner to work.
+
+        :param model_architecture_path: configuration file for the architecture
+        :param dataset_path: path to the dataset
+        :param primitives: candidate operations for the search space
+        :param output_path: where the search results are written
+ """
+        # TODO: We think we can support custom image datasets #
+ # TODO: <18-04-19, VDeamoV> #
+ self.model_architecture_path = model_architecture_path
+ self.dataset_path = dataset_path
+ self.primitives = primitives
+ self.output_path = output_path
+
+ def update_search_space(self, search_space):
+ """
+        Required by the Tuner interface; unused here.
+        """
+        print(search_space)
+
+ def receive_trial_result(self, parameter_id, parameters, value):
+ '''
+        We probably don't need this either.
+ '''
+ pass
+
+ def generate_parameters(self, parameter_id):
+ '''
+        Hand the tuner's fixed configuration to the trial.
+ '''
+ param = dict({"dataset_path": self.dataset_path,
+ "model_architecture_path": self.model_architecture_path,
+ "primitives": self.primitives,
+ "output_path": self.output_path})
+ return param
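
Review note: `generate_parameters()` returns a fixed dict, so this tuner is effectively a one-shot configuration carrier. A sketch of the trial side, assuming the standard NNI trial API (`nni.get_next_parameter`, `nni.report_final_result`):

```python
# Trial-side sketch: how the dict from custom_tuner.generate_parameters()
# arrives in the trial process (cf. darts_source/cnn/model_search.py below).
import nni

params = nni.get_next_parameter()   # {'dataset_path': ..., 'primitives': [...], ...}
primitives = params["primitives"]   # candidate ops that define the search space

# ... run the DARTS search with these settings, then report a metric:
best_top1 = 0.0                     # placeholder value
nni.report_final_result(best_top1)
```
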
diff --git a/custom_tuner.yaml b/custom_tuner.yaml
new file mode 100644
index 000000000..fcd06b4d4
--- /dev/null
+++ b/custom_tuner.yaml
@@ -0,0 +1,24 @@
+authorName: Occam
+experimentName: test_darts
+trialConcurrency: 1 # we only need one trial; never run more than one
+maxExecDuration: 1000h
+maxTrialNum: 1
+trainingServicePlatform: local
+useAnnotation: False
+tuner:
+ codeDir: /home/apex/DeamoV/github/darts_for_nni
+ classFileName: custom_tuner.py
+ className: custom_tuner
+  # Any parameter your tuner class __init__ constructor needs
+  # can be specified in this optional classArgs field, for example
+ classArgs:
+ model_architecture_path: "path"
+ primitives: ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5']
+ dataset_path: "/home/apex/DeamoV/github/darts_for_nni/darts_source/data"
+ output_path: "/home/apex/tmp/testoutput"
+  # this is where all the logs and scripts are written
+ # data_type: "path"
+trial:
+ command: "python3 train_search.py"
+ codeDir: "/home/apex/DeamoV/github/darts_for_nni/darts_source/cnn"
+ gpuNum: 2
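
Review note: with this config in place, the experiment would typically be launched with `nnictl create --config custom_tuner.yaml` (assuming NNI is installed and the hard-coded `codeDir` paths exist on the host). Since `maxTrialNum` and `trialConcurrency` are both 1, NNI runs `python3 train_search.py` exactly once, with the tuner's `classArgs` delivered to the trial via `nni.get_next_parameter()`.
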
diff --git a/darts_source/cnn/Makefile b/darts_source/cnn/Makefile
new file mode 100644
index 000000000..5ddb9ae7e
--- /dev/null
+++ b/darts_source/cnn/Makefile
@@ -0,0 +1,2 @@
+clean:
+ rm -rf __pycache__
diff --git a/darts_source/cnn/architect.py b/darts_source/cnn/architect.py
new file mode 100644
index 000000000..981d3480e
--- /dev/null
+++ b/darts_source/cnn/architect.py
@@ -0,0 +1,96 @@
+import torch
+import numpy as np
+import torch.nn as nn
+from torch.autograd import Variable
+
+
+def _concat(xs):
+ return torch.cat([x.view(-1) for x in xs])
+
+
+class Architect(object):
+
+ def __init__(self, model, args):
+ self.network_momentum = args.momentum
+ self.network_weight_decay = args.weight_decay
+ self.model = model
+ self.optimizer = torch.optim.Adam(self.model.arch_parameters(),
+ lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay)
+
+ def _compute_unrolled_model(self, input, target, eta, network_optimizer):
+ loss = self.model._loss(input, target)
+ theta = _concat(self.model.parameters()).data
+ try:
+ moment = _concat(network_optimizer.state[v]['momentum_buffer']
+ for v in self.model.parameters()).mul_(self.network_momentum)
+ except:
+ moment = torch.zeros_like(theta)
+ dtheta = _concat(torch.autograd.grad(
+ loss, self.model.parameters())).data + self.network_weight_decay*theta
+ unrolled_model = self._construct_model_from_theta(
+ theta.sub(eta, moment+dtheta))
+ return unrolled_model
+
+ def step(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer, unrolled):
+ self.optimizer.zero_grad()
+ if unrolled:
+ self._backward_step_unrolled(
+ input_train, target_train, input_valid, target_valid, eta, network_optimizer)
+ else:
+ self._backward_step(input_valid, target_valid)
+ self.optimizer.step()
+
+ def _backward_step(self, input_valid, target_valid):
+ loss = self.model._loss(input_valid, target_valid)
+ loss.backward()
+
+ def _backward_step_unrolled(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer):
+ unrolled_model = self._compute_unrolled_model(
+ input_train, target_train, eta, network_optimizer)
+ unrolled_loss = unrolled_model._loss(input_valid, target_valid)
+
+ unrolled_loss.backward()
+ dalpha = [v.grad for v in unrolled_model.arch_parameters()]
+ vector = [v.grad.data for v in unrolled_model.parameters()]
+ implicit_grads = self._hessian_vector_product(vector, input_train, target_train)
+
+ for g, ig in zip(dalpha, implicit_grads):
+ g.data.sub_(eta, ig.data)
+
+ for v, g in zip(self.model.arch_parameters(), dalpha):
+ if v.grad is None:
+ v.grad = Variable(g.data)
+ else:
+ v.grad.data.copy_(g.data)
+
+ def _construct_model_from_theta(self, theta):
+ model_new = self.model.new()
+ model_dict = self.model.state_dict()
+
+ params, offset = {}, 0
+ for k, v in self.model.named_parameters():
+ v_length = np.prod(v.size())
+ params[k] = theta[offset: offset+v_length].view(v.size())
+ offset += v_length
+
+ assert offset == len(theta)
+ model_dict.update(params)
+ model_new.load_state_dict(model_dict)
+ return model_new.cuda()
+
+ def _hessian_vector_product(self, vector, input, target, r=1e-2):
+ R = r / _concat(vector).norm()
+ for p, v in zip(self.model.parameters(), vector):
+ p.data.add_(R, v)
+ loss = self.model._loss(input, target)
+ grads_p = torch.autograd.grad(loss, self.model.arch_parameters())
+
+ for p, v in zip(self.model.parameters(), vector):
+ p.data.sub_(2*R, v)
+ loss = self.model._loss(input, target)
+ grads_n = torch.autograd.grad(loss, self.model.arch_parameters())
+
+ for p, v in zip(self.model.parameters(), vector):
+ p.data.add_(R, v)
+
+ return [(x-y).div_(2*R) for x, y in zip(grads_p, grads_n)]
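
Review note: `_hessian_vector_product` implements the central finite-difference approximation from the DARTS paper, roughly `(grad_alpha L(w + eps*v) - grad_alpha L(w - eps*v)) / (2*eps)`, perturbing the weights in place and restoring them afterwards. A toy sketch of the same trick on a scalar problem (illustrative only, not from the repo):

```python
# Toy check of the finite-difference Hessian-vector product used above.
import torch

w = torch.tensor([1.0, 2.0], requires_grad=True)   # "network weights"
alpha = torch.tensor([0.5], requires_grad=True)    # "architecture parameters"
loss_fn = lambda: (alpha * w.pow(2)).sum()         # couples w and alpha

v = torch.tensor([0.3, -0.1])                      # vector to multiply by
eps = 1e-2 / v.norm()                              # the R in the code above

with torch.no_grad():
    w += eps * v                                   # w -> w + eps*v
g_plus = torch.autograd.grad(loss_fn(), alpha)[0]

with torch.no_grad():
    w -= 2 * eps * v                               # w -> w - eps*v
g_minus = torch.autograd.grad(loss_fn(), alpha)[0]

with torch.no_grad():
    w += eps * v                                   # restore w

hvp = (g_plus - g_minus) / (2 * eps)
print(hvp, 2 * (w.detach() * v).sum())             # both ~0.2 here
```
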
diff --git a/darts_source/cnn/genotypes.py b/darts_source/cnn/genotypes.py
new file mode 100644
index 000000000..9d235c9ec
--- /dev/null
+++ b/darts_source/cnn/genotypes.py
@@ -0,0 +1,79 @@
+from collections import namedtuple
+
+Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')
+
+PRIMITIVES = [
+ 'none',
+ 'max_pool_3x3',
+ 'avg_pool_3x3',
+ 'skip_connect',
+ 'sep_conv_3x3',
+ 'sep_conv_5x5',
+ 'dil_conv_3x3',
+ 'dil_conv_5x5'
+]
+
+NASNet = Genotype(
+ normal=[
+ ('sep_conv_5x5', 1),
+ ('sep_conv_3x3', 0),
+ ('sep_conv_5x5', 0),
+ ('sep_conv_3x3', 0),
+ ('avg_pool_3x3', 1),
+ ('skip_connect', 0),
+ ('avg_pool_3x3', 0),
+ ('avg_pool_3x3', 0),
+ ('sep_conv_3x3', 1),
+ ('skip_connect', 1),
+ ],
+ normal_concat=[2, 3, 4, 5, 6],
+ reduce=[
+ ('sep_conv_5x5', 1),
+ ('sep_conv_7x7', 0),
+ ('max_pool_3x3', 1),
+ ('sep_conv_7x7', 0),
+ ('avg_pool_3x3', 1),
+ ('sep_conv_5x5', 0),
+ ('skip_connect', 3),
+ ('avg_pool_3x3', 2),
+ ('sep_conv_3x3', 2),
+ ('max_pool_3x3', 1),
+ ],
+ reduce_concat=[4, 5, 6],
+)
+
+AmoebaNet = Genotype(
+ normal=[
+ ('avg_pool_3x3', 0),
+ ('max_pool_3x3', 1),
+ ('sep_conv_3x3', 0),
+ ('sep_conv_5x5', 2),
+ ('sep_conv_3x3', 0),
+ ('avg_pool_3x3', 3),
+ ('sep_conv_3x3', 1),
+ ('skip_connect', 1),
+ ('skip_connect', 0),
+ ('avg_pool_3x3', 1),
+ ],
+ normal_concat=[4, 5, 6],
+ reduce=[
+ ('avg_pool_3x3', 0),
+ ('sep_conv_3x3', 1),
+ ('max_pool_3x3', 0),
+ ('sep_conv_7x7', 2),
+ ('sep_conv_7x7', 0),
+ ('avg_pool_3x3', 1),
+ ('max_pool_3x3', 0),
+ ('max_pool_3x3', 1),
+ ('conv_7x1_1x7', 0),
+ ('sep_conv_3x3', 5),
+ ],
+ reduce_concat=[3, 4, 6]
+)
+
+DARTS_V1 = Genotype(normal=[('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 0), ('sep_conv_3x3', 1), ('skip_connect', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 2)], normal_concat=[
+ 2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 0), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('avg_pool_3x3', 0)], reduce_concat=[2, 3, 4, 5])
+DARTS_V2 = Genotype(normal=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('skip_connect', 0), ('skip_connect', 0), ('dil_conv_3x3', 2)], normal_concat=[
+ 2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 1), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('max_pool_3x3', 1)], reduce_concat=[2, 3, 4, 5])
+
+DARTS = DARTS_V1
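
Review note: a `Genotype` is just a namedtuple of (op name, input-node index) pairs plus the node indices to concatenate. A minimal sketch of how one is consumed downstream (cf. `Cell._compile` in `model.py`):

```python
# Unpacking a Genotype the way Cell._compile does.
from collections import namedtuple

Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')

g = Genotype(
    normal=[('sep_conv_3x3', 1), ('sep_conv_3x3', 0)],  # (op, input-node index)
    normal_concat=[2],
    reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1)],
    reduce_concat=[2],
)

op_names, indices = zip(*g.normal)
print(op_names)  # ('sep_conv_3x3', 'sep_conv_3x3')
print(indices)   # (1, 0)
```
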
diff --git a/darts_source/cnn/model.py b/darts_source/cnn/model.py
new file mode 100644
index 000000000..145d8bfcc
--- /dev/null
+++ b/darts_source/cnn/model.py
@@ -0,0 +1,219 @@
+import torch
+import torch.nn as nn
+from operations import *
+from torch.autograd import Variable
+from utils import drop_path
+
+
+class Cell(nn.Module):
+
+ def __init__(self, genotype, C_prev_prev, C_prev, C, reduction, reduction_prev):
+ super(Cell, self).__init__()
+ print(C_prev_prev, C_prev, C)
+
+ if reduction_prev:
+ self.preprocess0 = FactorizedReduce(C_prev_prev, C)
+ else:
+ self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0)
+ self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0)
+
+ if reduction:
+ op_names, indices = zip(*genotype.reduce)
+ concat = genotype.reduce_concat
+ else:
+ op_names, indices = zip(*genotype.normal)
+ concat = genotype.normal_concat
+ self._compile(C, op_names, indices, concat, reduction)
+
+ def _compile(self, C, op_names, indices, concat, reduction):
+ assert len(op_names) == len(indices)
+ self._steps = len(op_names) // 2
+ self._concat = concat
+ self.multiplier = len(concat)
+
+ self._ops = nn.ModuleList()
+ for name, index in zip(op_names, indices):
+ stride = 2 if reduction and index < 2 else 1
+ op = OPS[name](C, stride, True)
+ self._ops += [op]
+ self._indices = indices
+
+ def forward(self, s0, s1, drop_prob):
+ s0 = self.preprocess0(s0)
+ s1 = self.preprocess1(s1)
+
+ states = [s0, s1]
+ for i in range(self._steps):
+ h1 = states[self._indices[2*i]]
+ h2 = states[self._indices[2*i+1]]
+ op1 = self._ops[2*i]
+ op2 = self._ops[2*i+1]
+ h1 = op1(h1)
+ h2 = op2(h2)
+ if self.training and drop_prob > 0.:
+ if not isinstance(op1, Identity):
+ h1 = drop_path(h1, drop_prob)
+ if not isinstance(op2, Identity):
+ h2 = drop_path(h2, drop_prob)
+ s = h1 + h2
+ states += [s]
+ return torch.cat([states[i] for i in self._concat], dim=1)
+
+
+class AuxiliaryHeadCIFAR(nn.Module):
+
+ def __init__(self, C, num_classes):
+ """assuming input size 8x8"""
+ super(AuxiliaryHeadCIFAR, self).__init__()
+ self.features = nn.Sequential(
+ nn.ReLU(inplace=True),
+ # image size = 2 x 2
+ nn.AvgPool2d(5, stride=3, padding=0, count_include_pad=False),
+ nn.Conv2d(C, 128, 1, bias=False),
+ nn.BatchNorm2d(128),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(128, 768, 2, bias=False),
+ nn.BatchNorm2d(768),
+ nn.ReLU(inplace=True)
+ )
+ self.classifier = nn.Linear(768, num_classes)
+
+ def forward(self, x):
+ x = self.features(x)
+ x = self.classifier(x.view(x.size(0), -1))
+ return x
+
+
+class AuxiliaryHeadImageNet(nn.Module):
+
+ def __init__(self, C, num_classes):
+ """assuming input size 14x14"""
+ super(AuxiliaryHeadImageNet, self).__init__()
+ self.features = nn.Sequential(
+ nn.ReLU(inplace=True),
+ nn.AvgPool2d(5, stride=2, padding=0, count_include_pad=False),
+ nn.Conv2d(C, 128, 1, bias=False),
+ nn.BatchNorm2d(128),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(128, 768, 2, bias=False),
+ # NOTE: This batchnorm was omitted in my earlier implementation due to a typo.
+ # Commenting it out for consistency with the experiments in the paper.
+ # nn.BatchNorm2d(768),
+ nn.ReLU(inplace=True)
+ )
+ self.classifier = nn.Linear(768, num_classes)
+
+ def forward(self, x):
+ x = self.features(x)
+ x = self.classifier(x.view(x.size(0), -1))
+ return x
+
+
+class NetworkCIFAR(nn.Module):
+
+ def __init__(self, C, num_classes, layers, auxiliary, genotype):
+ super(NetworkCIFAR, self).__init__()
+ self._layers = layers
+ self._auxiliary = auxiliary
+
+ stem_multiplier = 3
+ C_curr = stem_multiplier*C
+ self.stem = nn.Sequential(
+ nn.Conv2d(3, C_curr, 3, padding=1, bias=False),
+ nn.BatchNorm2d(C_curr)
+ )
+
+ C_prev_prev, C_prev, C_curr = C_curr, C_curr, C
+ self.cells = nn.ModuleList()
+ reduction_prev = False
+ for i in range(layers):
+ if i in [layers//3, 2*layers//3]:
+ C_curr *= 2
+ reduction = True
+ else:
+ reduction = False
+ cell = Cell(genotype, C_prev_prev, C_prev,
+ C_curr, reduction, reduction_prev)
+ reduction_prev = reduction
+ self.cells += [cell]
+ C_prev_prev, C_prev = C_prev, cell.multiplier*C_curr
+ if i == 2*layers//3:
+ C_to_auxiliary = C_prev
+
+ if auxiliary:
+ self.auxiliary_head = AuxiliaryHeadCIFAR(
+ C_to_auxiliary, num_classes)
+ self.global_pooling = nn.AdaptiveAvgPool2d(1)
+ self.classifier = nn.Linear(C_prev, num_classes)
+
+ def forward(self, input):
+ logits_aux = None
+ s0 = s1 = self.stem(input)
+ for i, cell in enumerate(self.cells):
+ s0, s1 = s1, cell(s0, s1, self.drop_path_prob)
+ if i == 2*self._layers//3:
+ if self._auxiliary and self.training:
+ logits_aux = self.auxiliary_head(s1)
+ out = self.global_pooling(s1)
+ logits = self.classifier(out.view(out.size(0), -1))
+ return logits, logits_aux
+
+
+class NetworkImageNet(nn.Module):
+
+ def __init__(self, C, num_classes, layers, auxiliary, genotype):
+ super(NetworkImageNet, self).__init__()
+ self._layers = layers
+ self._auxiliary = auxiliary
+
+ self.stem0 = nn.Sequential(
+ nn.Conv2d(3, C // 2, kernel_size=3,
+ stride=2, padding=1, bias=False),
+ nn.BatchNorm2d(C // 2),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(C // 2, C, 3, stride=2, padding=1, bias=False),
+ nn.BatchNorm2d(C),
+ )
+
+ self.stem1 = nn.Sequential(
+ nn.ReLU(inplace=True),
+ nn.Conv2d(C, C, 3, stride=2, padding=1, bias=False),
+ nn.BatchNorm2d(C),
+ )
+
+ C_prev_prev, C_prev, C_curr = C, C, C
+
+ self.cells = nn.ModuleList()
+ reduction_prev = True
+ for i in range(layers):
+ if i in [layers // 3, 2 * layers // 3]:
+ C_curr *= 2
+ reduction = True
+ else:
+ reduction = False
+ cell = Cell(genotype, C_prev_prev, C_prev,
+ C_curr, reduction, reduction_prev)
+ reduction_prev = reduction
+ self.cells += [cell]
+ C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr
+ if i == 2 * layers // 3:
+ C_to_auxiliary = C_prev
+
+ if auxiliary:
+ self.auxiliary_head = AuxiliaryHeadImageNet(
+ C_to_auxiliary, num_classes)
+ self.global_pooling = nn.AvgPool2d(7)
+ self.classifier = nn.Linear(C_prev, num_classes)
+
+ def forward(self, input):
+ logits_aux = None
+ s0 = self.stem0(input)
+ s1 = self.stem1(s0)
+ for i, cell in enumerate(self.cells):
+ s0, s1 = s1, cell(s0, s1, self.drop_path_prob)
+ if i == 2 * self._layers // 3:
+ if self._auxiliary and self.training:
+ logits_aux = self.auxiliary_head(s1)
+ out = self.global_pooling(s1)
+ logits = self.classifier(out.view(out.size(0), -1))
+ return logits, logits_aux
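
Review note: `Cell.forward` calls `drop_path` from `utils`, which this diff does not show. For context, a sketch of the standard DARTS implementation (an assumption, modernized slightly; the repo's version may differ in detail):

```python
# Assumed drop_path: zero out whole samples with probability drop_prob and
# rescale the survivors so the expected activation is unchanged.
import torch

def drop_path(x, drop_prob):
    if drop_prob > 0.:
        keep_prob = 1. - drop_prob
        mask = torch.bernoulli(
            torch.full((x.size(0), 1, 1, 1), keep_prob, device=x.device))
        x = x.div(keep_prob).mul(mask)
    return x
```
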
diff --git a/darts_source/cnn/model_parser.py b/darts_source/cnn/model_parser.py
new file mode 100644
index 000000000..0c2d750d0
--- /dev/null
+++ b/darts_source/cnn/model_parser.py
@@ -0,0 +1,62 @@
+"""
+File: model_parser.py
+Author: OccamRazerTeam
+Email: vincent.duan95@outlook.com
+Github: https://github.com/VDeamoV
+Description: Returns the configuration typed in by the user
+"""
+
+import argparse
+
+
+def get_cifar_parser_params():
+ """
+    Return the base config typed in by the user.
+ """
+ parser = argparse.ArgumentParser(description="cifar configures")
+ parser.add_argument('--data', type=str, default='../data',
+ help='location of the data corpus')
+ parser.add_argument('--batch_size', type=int, default=32,
+ help='batch size')
+ parser.add_argument('--learning_rate', type=float, default=0.025,
+ help='init learning rate')
+ parser.add_argument('--learning_rate_min', type=float, default=0.001,
+ help='min learning rate')
+ parser.add_argument('--momentum', type=float, default=0.9,
+ help='momentum')
+ parser.add_argument('--weight_decay', type=float, default=3e-4,
+ help='weight decay')
+ parser.add_argument('--report_freq', type=float, default=50,
+ help='report frequency')
+ parser.add_argument('--gpu', type=int, default=1,
+ help='gpu device id')
+ parser.add_argument('--epochs', type=int, default=50,
+ help='num of training epochs')
+ parser.add_argument('--init_channels', type=int, default=16,
+ help='num of init channels')
+ parser.add_argument('--layers', type=int, default=8,
+ help='total number of layers')
+ parser.add_argument('--model_path', type=str, default='saved_models',
+ help='path to save the model')
+ parser.add_argument('--cutout', action='store_true', default=False,
+ help='use cutout')
+ parser.add_argument('--cutout_length', type=int, default=16,
+ help='cutout length')
+ parser.add_argument('--drop_path_prob', type=float, default=0.3,
+ help='drop path probability')
+ parser.add_argument('--save', type=str, default='EXP',
+ help='experiment name')
+ parser.add_argument('--seed', type=int, default=2,
+ help='random seed')
+ parser.add_argument('--grad_clip', type=float, default=5,
+ help='gradient clipping')
+ parser.add_argument('--train_portion', type=float, default=0.5,
+ help='portion of training data')
+ parser.add_argument('--unrolled', action='store_true', default=False,
+ help='use one-step unrolled validation loss')
+ parser.add_argument('--arch_learning_rate', type=float, default=3e-4,
+ help='learning rate for arch encoding')
+ parser.add_argument('--arch_weight_decay', type=float, default=1e-3,
+ help='weight decay for arch encoding')
+ args = parser.parse_args()
+ return args
diff --git a/darts_source/cnn/model_search.py b/darts_source/cnn/model_search.py
new file mode 100644
index 000000000..7e7671f89
--- /dev/null
+++ b/darts_source/cnn/model_search.py
@@ -0,0 +1,174 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from operations import *
+from torch.autograd import Variable
+# from genotypes import PRIMITIVES
+import nni
+from genotypes import Genotype
+
+tuner_params = nni.get_next_parameter()
+PRIMITIVES = tuner_params["primitives"]
+
+class MixedOp(nn.Module):
+
+ def __init__(self, C, stride):
+ super(MixedOp, self).__init__()
+ self._ops = nn.ModuleList()
+ for primitive in PRIMITIVES:
+ op = OPS[primitive](C, stride, False)
+ if 'pool' in primitive:
+ op = nn.Sequential(op, nn.BatchNorm2d(C, affine=False))
+ self._ops.append(op)
+
+ def forward(self, x, weights):
+ return sum(w * op(x) for w, op in zip(weights, self._ops))
+
+
+class Cell(nn.Module):
+
+ def __init__(self, steps, multiplier, C_prev_prev, C_prev, C, reduction, reduction_prev):
+ super(Cell, self).__init__()
+ self.reduction = reduction
+
+ if reduction_prev:
+ self.preprocess0 = FactorizedReduce(C_prev_prev, C, affine=False)
+ else:
+ self.preprocess0 = ReLUConvBN(
+ C_prev_prev, C, 1, 1, 0, affine=False)
+ self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0, affine=False)
+ self._steps = steps
+ self._multiplier = multiplier
+
+ self._ops = nn.ModuleList()
+ self._bns = nn.ModuleList()
+ for i in range(self._steps):
+ for j in range(2+i):
+ stride = 2 if reduction and j < 2 else 1
+ op = MixedOp(C, stride)
+ self._ops.append(op)
+
+ def forward(self, s0, s1, weights):
+ s0 = self.preprocess0(s0)
+ s1 = self.preprocess1(s1)
+
+ states = [s0, s1]
+ offset = 0
+ for i in range(self._steps):
+ s = sum(self._ops[offset+j](h, weights[offset+j])
+ for j, h in enumerate(states))
+ offset += len(states)
+ states.append(s)
+
+ return torch.cat(states[-self._multiplier:], dim=1)
+
+
+class Network(nn.Module):
+
+ def __init__(self, C, num_classes, layers, criterion, steps=4, multiplier=4, stem_multiplier=3):
+ super(Network, self).__init__()
+ self._C = C
+ self._num_classes = num_classes
+ self._layers = layers
+ self._criterion = criterion
+ self._steps = steps
+ self._multiplier = multiplier
+
+ C_curr = stem_multiplier*C
+ self.stem = nn.Sequential(
+ nn.Conv2d(3, C_curr, 3, padding=1, bias=False),
+ nn.BatchNorm2d(C_curr)
+ )
+
+ C_prev_prev, C_prev, C_curr = C_curr, C_curr, C
+ self.cells = nn.ModuleList()
+ reduction_prev = False
+ for i in range(layers):
+ if i in [layers//3, 2*layers//3]:
+ C_curr *= 2
+ reduction = True
+ else:
+ reduction = False
+ cell = Cell(steps, multiplier, C_prev_prev, C_prev,
+ C_curr, reduction, reduction_prev)
+ reduction_prev = reduction
+ self.cells += [cell]
+ C_prev_prev, C_prev = C_prev, multiplier*C_curr
+
+ self.global_pooling = nn.AdaptiveAvgPool2d(1)
+ self.classifier = nn.Linear(C_prev, num_classes)
+
+ self._initialize_alphas()
+
+ def new(self):
+ model_new = Network(self._C, self._num_classes,
+ self._layers, self._criterion).cuda()
+ for x, y in zip(model_new.arch_parameters(), self.arch_parameters()):
+ x.data.copy_(y.data)
+ return model_new
+
+ def forward(self, input):
+ s0 = s1 = self.stem(input)
+ for i, cell in enumerate(self.cells):
+ if cell.reduction:
+ weights = F.softmax(self.alphas_reduce, dim=-1)
+ else:
+ weights = F.softmax(self.alphas_normal, dim=-1)
+ s0, s1 = s1, cell(s0, s1, weights)
+ out = self.global_pooling(s1)
+ logits = self.classifier(out.view(out.size(0), -1))
+ return logits
+
+ def _loss(self, input, target):
+ logits = self(input)
+ return self._criterion(logits, target)
+
+ def _initialize_alphas(self):
+ k = sum(1 for i in range(self._steps) for n in range(2+i))
+ num_ops = len(PRIMITIVES)
+
+ self.alphas_normal = Variable(
+ 1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True)
+ self.alphas_reduce = Variable(
+ 1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True)
+ self._arch_parameters = [
+ self.alphas_normal,
+ self.alphas_reduce,
+ ]
+
+ def arch_parameters(self):
+ return self._arch_parameters
+
+ def genotype(self):
+
+ def _parse(weights):
+ gene = []
+ n = 2
+ start = 0
+ for i in range(self._steps):
+ end = start + n
+ W = weights[start:end].copy()
+ edges = sorted(range(i + 2), key=lambda x: -max(
+ W[x][k] for k in range(len(W[x])) if k != PRIMITIVES.index('none')))[:2]
+ for j in edges:
+ k_best = None
+ for k in range(len(W[j])):
+ if k != PRIMITIVES.index('none'):
+ if k_best is None or W[j][k] > W[j][k_best]:
+ k_best = k
+ gene.append((PRIMITIVES[k_best], j))
+ start = end
+ n += 1
+ return gene
+
+ gene_normal = _parse(
+ F.softmax(self.alphas_normal, dim=-1).data.cpu().numpy())
+ gene_reduce = _parse(
+ F.softmax(self.alphas_reduce, dim=-1).data.cpu().numpy())
+
+ concat = range(2+self._steps-self._multiplier, self._steps+2)
+ genotype = Genotype(
+ normal=gene_normal, normal_concat=concat,
+ reduce=gene_reduce, reduce_concat=concat
+ )
+ return genotype
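
Review note: with the defaults above (`steps=4` and the 8 primitives supplied by the tuner), the architecture parameters are two (14, 8) matrices, one per cell type. Note also that `nni.get_next_parameter()` runs at import time here, so this module only imports cleanly inside a live NNI trial. A quick shape check, assuming those defaults:

```python
# Shape of alphas_normal / alphas_reduce for steps=4 and 8 primitives.
steps, num_ops = 4, 8
k = sum(2 + i for i in range(steps))   # 2+3+4+5 = 14 mixed edges
print(k, num_ops)                      # each alpha matrix is (14, 8)
```
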
diff --git a/darts_source/cnn/operations.py b/darts_source/cnn/operations.py
new file mode 100644
index 000000000..412a27245
--- /dev/null
+++ b/darts_source/cnn/operations.py
@@ -0,0 +1,114 @@
+import torch
+import torch.nn as nn
+
+OPS = {
+ 'none': lambda C, stride, affine: Zero(stride),
+ 'avg_pool_3x3': lambda C, stride, affine: nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False),
+ 'max_pool_3x3': lambda C, stride, affine: nn.MaxPool2d(3, stride=stride, padding=1),
+ 'skip_connect': lambda C, stride, affine: Identity() if stride == 1 else FactorizedReduce(C, C, affine=affine),
+ 'sep_conv_3x3': lambda C, stride, affine: SepConv(C, C, 3, stride, 1, affine=affine),
+ 'sep_conv_5x5': lambda C, stride, affine: SepConv(C, C, 5, stride, 2, affine=affine),
+ 'sep_conv_7x7': lambda C, stride, affine: SepConv(C, C, 7, stride, 3, affine=affine),
+ 'dil_conv_3x3': lambda C, stride, affine: DilConv(C, C, 3, stride, 2, 2, affine=affine),
+ 'dil_conv_5x5': lambda C, stride, affine: DilConv(C, C, 5, stride, 4, 2, affine=affine),
+ 'conv_7x1_1x7': lambda C, stride, affine: nn.Sequential(
+ nn.ReLU(inplace=False),
+ nn.Conv2d(C, C, (1, 7), stride=(1, stride),
+ padding=(0, 3), bias=False),
+ nn.Conv2d(C, C, (7, 1), stride=(stride, 1),
+ padding=(3, 0), bias=False),
+ nn.BatchNorm2d(C, affine=affine)
+ ),
+}
+
+
+class ReLUConvBN(nn.Module):
+
+ def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
+ super(ReLUConvBN, self).__init__()
+ self.op = nn.Sequential(
+ nn.ReLU(inplace=False),
+ nn.Conv2d(C_in, C_out, kernel_size, stride=stride,
+ padding=padding, bias=False),
+ nn.BatchNorm2d(C_out, affine=affine)
+ )
+
+ def forward(self, x):
+ return self.op(x)
+
+
+class DilConv(nn.Module):
+
+ def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
+ super(DilConv, self).__init__()
+ self.op = nn.Sequential(
+ nn.ReLU(inplace=False),
+ nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride,
+ padding=padding, dilation=dilation, groups=C_in, bias=False),
+ nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
+ nn.BatchNorm2d(C_out, affine=affine),
+ )
+
+ def forward(self, x):
+ return self.op(x)
+
+
+class SepConv(nn.Module):
+
+ def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
+ super(SepConv, self).__init__()
+ self.op = nn.Sequential(
+ nn.ReLU(inplace=False),
+ nn.Conv2d(C_in, C_in, kernel_size=kernel_size,
+ stride=stride, padding=padding, groups=C_in, bias=False),
+ nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False),
+ nn.BatchNorm2d(C_in, affine=affine),
+ nn.ReLU(inplace=False),
+ nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=1,
+ padding=padding, groups=C_in, bias=False),
+ nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
+ nn.BatchNorm2d(C_out, affine=affine),
+ )
+
+ def forward(self, x):
+ return self.op(x)
+
+
+class Identity(nn.Module):
+
+ def __init__(self):
+ super(Identity, self).__init__()
+
+ def forward(self, x):
+ return x
+
+
+class Zero(nn.Module):
+
+ def __init__(self, stride):
+ super(Zero, self).__init__()
+ self.stride = stride
+
+ def forward(self, x):
+ if self.stride == 1:
+ return x.mul(0.)
+ return x[:, :, ::self.stride, ::self.stride].mul(0.)
+
+
+class FactorizedReduce(nn.Module):
+
+ def __init__(self, C_in, C_out, affine=True):
+ super(FactorizedReduce, self).__init__()
+ assert C_out % 2 == 0
+ self.relu = nn.ReLU(inplace=False)
+ self.conv_1 = nn.Conv2d(C_in, C_out // 2, 1,
+ stride=2, padding=0, bias=False)
+ self.conv_2 = nn.Conv2d(C_in, C_out // 2, 1,
+ stride=2, padding=0, bias=False)
+ self.bn = nn.BatchNorm2d(C_out, affine=affine)
+
+ def forward(self, x):
+ x = self.relu(x)
+ out = torch.cat([self.conv_1(x), self.conv_2(x[:, :, 1:, 1:])], dim=1)
+ out = self.bn(out)
+ return out
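
Review note: `FactorizedReduce` halves the spatial resolution with two stride-2 1x1 convolutions, the second applied to a one-pixel-shifted view, then concatenates the halves along channels. A shape check (illustrative):

```python
# FactorizedReduce shape check: 8x8 input -> 4x4 output, channels 16 -> 32.
import torch
import torch.nn as nn

C_in, C_out = 16, 32
x = torch.randn(2, C_in, 8, 8)

conv_1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
conv_2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
out = torch.cat([conv_1(x), conv_2(x[:, :, 1:, 1:])], dim=1)
print(out.shape)  # torch.Size([2, 32, 4, 4])
```
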
diff --git a/darts_source/cnn/test.py b/darts_source/cnn/test.py
new file mode 100644
index 000000000..5a9997235
--- /dev/null
+++ b/darts_source/cnn/test.py
@@ -0,0 +1,115 @@
+import os
+import sys
+import glob
+import numpy as np
+import torch
+import utils
+import logging
+import argparse
+import torch.nn as nn
+import genotypes
+import torch.utils
+import torchvision.datasets as dset
+import torch.backends.cudnn as cudnn
+from torch.autograd import Variable
+from model import NetworkCIFAR as Network
+
+
+parser = argparse.ArgumentParser("cifar")
+parser.add_argument('--data', type=str, default='../data',
+ help='location of the data corpus')
+parser.add_argument('--batch_size', type=int, default=96, help='batch size')
+parser.add_argument('--report_freq', type=float,
+ default=50, help='report frequency')
+parser.add_argument('--gpu', type=int, default=0, help='gpu device id')
+parser.add_argument('--init_channels', type=int,
+ default=36, help='num of init channels')
+parser.add_argument('--layers', type=int, default=20,
+ help='total number of layers')
+parser.add_argument('--model_path', type=str,
+ default='EXP/model.pt', help='path of pretrained model')
+parser.add_argument('--auxiliary', action='store_true',
+ default=False, help='use auxiliary tower')
+parser.add_argument('--cutout', action='store_true',
+ default=False, help='use cutout')
+parser.add_argument('--cutout_length', type=int,
+ default=16, help='cutout length')
+parser.add_argument('--drop_path_prob', type=float,
+ default=0.2, help='drop path probability')
+parser.add_argument('--seed', type=int, default=0, help='random seed')
+parser.add_argument('--arch', type=str, default='DARTS',
+ help='which architecture to use')
+args = parser.parse_args()
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+ format=log_format, datefmt='%m/%d %I:%M:%S %p')
+
+CIFAR_CLASSES = 10
+
+
+def main():
+ if not torch.cuda.is_available():
+ logging.info('no gpu device available')
+ sys.exit(1)
+
+ np.random.seed(args.seed)
+ torch.cuda.set_device(args.gpu)
+ cudnn.benchmark = True
+ torch.manual_seed(args.seed)
+ cudnn.enabled = True
+ torch.cuda.manual_seed(args.seed)
+ logging.info('gpu device = %d' % args.gpu)
+ logging.info("args = %s", args)
+
+ genotype = eval("genotypes.%s" % args.arch)
+ model = Network(args.init_channels, CIFAR_CLASSES,
+ args.layers, args.auxiliary, genotype)
+ model = model.cuda()
+ utils.load(model, args.model_path)
+
+ logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
+
+ criterion = nn.CrossEntropyLoss()
+ criterion = criterion.cuda()
+
+ _, test_transform = utils._data_transforms_cifar10(args)
+ test_data = dset.CIFAR10(root=args.data, train=False,
+ download=True, transform=test_transform)
+
+ test_queue = torch.utils.data.DataLoader(
+ test_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=2)
+
+ model.drop_path_prob = args.drop_path_prob
+ test_acc, test_obj = infer(test_queue, model, criterion)
+ logging.info('test_acc %f', test_acc)
+
+
+def infer(test_queue, model, criterion):
+ objs = utils.AvgrageMeter()
+ top1 = utils.AvgrageMeter()
+ top5 = utils.AvgrageMeter()
+ model.eval()
+
+ for step, (input, target) in enumerate(test_queue):
+ input = Variable(input, volatile=True).cuda()
+ target = Variable(target, volatile=True).cuda(async=True)
+
+ logits, _ = model(input)
+ loss = criterion(logits, target)
+
+ prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
+ n = input.size(0)
+ objs.update(loss.data[0], n)
+ top1.update(prec1.data[0], n)
+ top5.update(prec5.data[0], n)
+
+ if step % args.report_freq == 0:
+ logging.info('test %03d %e %f %f', step,
+ objs.avg, top1.avg, top5.avg)
+
+ return top1.avg, objs.avg
+
+
+if __name__ == '__main__':
+ main()
diff --git a/darts_source/cnn/test_imagenet.py b/darts_source/cnn/test_imagenet.py
new file mode 100644
index 000000000..9ffaa860a
--- /dev/null
+++ b/darts_source/cnn/test_imagenet.py
@@ -0,0 +1,124 @@
+import os
+import sys
+import numpy as np
+import torch
+import utils
+import glob
+import random
+import logging
+import argparse
+import torch.nn as nn
+import genotypes
+import torch.utils
+import torchvision.datasets as dset
+import torchvision.transforms as transforms
+import torch.backends.cudnn as cudnn
+
+from torch.autograd import Variable
+from model import NetworkImageNet as Network
+
+
+parser = argparse.ArgumentParser("imagenet")
+parser.add_argument('--data', type=str, default='../data/imagenet/',
+ help='location of the data corpus')
+parser.add_argument('--batch_size', type=int, default=128, help='batch size')
+parser.add_argument('--report_freq', type=float,
+ default=100, help='report frequency')
+parser.add_argument('--gpu', type=int, default=0, help='gpu device id')
+parser.add_argument('--init_channels', type=int,
+ default=48, help='num of init channels')
+parser.add_argument('--layers', type=int, default=14,
+ help='total number of layers')
+parser.add_argument('--model_path', type=str,
+ default='EXP/model.pt', help='path of pretrained model')
+parser.add_argument('--auxiliary', action='store_true',
+ default=False, help='use auxiliary tower')
+parser.add_argument('--drop_path_prob', type=float,
+ default=0, help='drop path probability')
+parser.add_argument('--seed', type=int, default=0, help='random seed')
+parser.add_argument('--arch', type=str, default='DARTS',
+ help='which architecture to use')
+args = parser.parse_args()
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+ format=log_format, datefmt='%m/%d %I:%M:%S %p')
+
+CLASSES = 1000
+
+
+def main():
+ if not torch.cuda.is_available():
+ logging.info('no gpu device available')
+ sys.exit(1)
+
+ np.random.seed(args.seed)
+ torch.cuda.set_device(args.gpu)
+ cudnn.benchmark = True
+ torch.manual_seed(args.seed)
+ cudnn.enabled = True
+ torch.cuda.manual_seed(args.seed)
+ logging.info('gpu device = %d' % args.gpu)
+ logging.info("args = %s", args)
+
+ genotype = eval("genotypes.%s" % args.arch)
+ model = Network(args.init_channels, CLASSES,
+ args.layers, args.auxiliary, genotype)
+ model = model.cuda()
+ model.load_state_dict(torch.load(args.model_path)['state_dict'])
+
+ logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
+
+ criterion = nn.CrossEntropyLoss()
+ criterion = criterion.cuda()
+
+ validdir = os.path.join(args.data, 'val')
+ normalize = transforms.Normalize(
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ valid_data = dset.ImageFolder(
+ validdir,
+ transforms.Compose([
+ transforms.Resize(256),
+ transforms.CenterCrop(224),
+ transforms.ToTensor(),
+ normalize,
+ ]))
+
+ valid_queue = torch.utils.data.DataLoader(
+ valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=4)
+
+ model.drop_path_prob = args.drop_path_prob
+ valid_acc_top1, valid_acc_top5, valid_obj = infer(
+ valid_queue, model, criterion)
+ logging.info('valid_acc_top1 %f', valid_acc_top1)
+ logging.info('valid_acc_top5 %f', valid_acc_top5)
+
+
+def infer(valid_queue, model, criterion):
+ objs = utils.AvgrageMeter()
+ top1 = utils.AvgrageMeter()
+ top5 = utils.AvgrageMeter()
+ model.eval()
+
+ for step, (input, target) in enumerate(valid_queue):
+ input = Variable(input, volatile=True).cuda()
+ target = Variable(target, volatile=True).cuda(async=True)
+
+ logits, _ = model(input)
+ loss = criterion(logits, target)
+
+ prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
+ n = input.size(0)
+ objs.update(loss.data[0], n)
+ top1.update(prec1.data[0], n)
+ top5.update(prec5.data[0], n)
+
+ if step % args.report_freq == 0:
+ logging.info('valid %03d %e %f %f', step,
+ objs.avg, top1.avg, top5.avg)
+
+ return top1.avg, top5.avg, objs.avg
+
+
+if __name__ == '__main__':
+ main()
diff --git a/darts_source/cnn/train.py b/darts_source/cnn/train.py
new file mode 100644
index 000000000..09b3a9101
--- /dev/null
+++ b/darts_source/cnn/train.py
@@ -0,0 +1,190 @@
+import os
+import sys
+import time
+import glob
+import numpy as np
+import torch
+import utils
+import logging
+import argparse
+import torch.nn as nn
+import genotypes
+import torch.utils
+import torchvision.datasets as dset
+import torch.backends.cudnn as cudnn
+
+from torch.autograd import Variable
+from model import NetworkCIFAR as Network
+
+
+parser = argparse.ArgumentParser("cifar")
+parser.add_argument('--data', type=str, default='../data',
+ help='location of the data corpus')
+parser.add_argument('--batch_size', type=int, default=96, help='batch size')
+parser.add_argument('--learning_rate', type=float,
+ default=0.025, help='init learning rate')
+parser.add_argument('--momentum', type=float, default=0.9, help='momentum')
+parser.add_argument('--weight_decay', type=float,
+ default=3e-4, help='weight decay')
+parser.add_argument('--report_freq', type=float,
+ default=50, help='report frequency')
+parser.add_argument('--gpu', type=int, default=0, help='gpu device id')
+parser.add_argument('--epochs', type=int, default=600,
+ help='num of training epochs')
+parser.add_argument('--init_channels', type=int,
+ default=36, help='num of init channels')
+parser.add_argument('--layers', type=int, default=20,
+ help='total number of layers')
+parser.add_argument('--model_path', type=str,
+ default='saved_models', help='path to save the model')
+parser.add_argument('--auxiliary', action='store_true',
+ default=False, help='use auxiliary tower')
+parser.add_argument('--auxiliary_weight', type=float,
+ default=0.4, help='weight for auxiliary loss')
+parser.add_argument('--cutout', action='store_true',
+ default=False, help='use cutout')
+parser.add_argument('--cutout_length', type=int,
+ default=16, help='cutout length')
+parser.add_argument('--drop_path_prob', type=float,
+ default=0.2, help='drop path probability')
+parser.add_argument('--save', type=str, default='EXP', help='experiment name')
+parser.add_argument('--seed', type=int, default=0, help='random seed')
+parser.add_argument('--arch', type=str, default='DARTS',
+ help='which architecture to use')
+parser.add_argument('--grad_clip', type=float,
+ default=5, help='gradient clipping')
+args = parser.parse_args()
+
+args.save = 'eval-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
+utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+ format=log_format, datefmt='%m/%d %I:%M:%S %p')
+fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
+fh.setFormatter(logging.Formatter(log_format))
+logging.getLogger().addHandler(fh)
+
+CIFAR_CLASSES = 10
+
+
+def main():
+ if not torch.cuda.is_available():
+ logging.info('no gpu device available')
+ sys.exit(1)
+
+ np.random.seed(args.seed)
+ torch.cuda.set_device(args.gpu)
+ cudnn.benchmark = True
+ torch.manual_seed(args.seed)
+ cudnn.enabled = True
+ torch.cuda.manual_seed(args.seed)
+ logging.info('gpu device = %d' % args.gpu)
+ logging.info("args = %s", args)
+
+    genotype = getattr(genotypes, args.arch)
+ model = Network(args.init_channels, CIFAR_CLASSES,
+ args.layers, args.auxiliary, genotype)
+ model = model.cuda()
+
+ logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
+
+ criterion = nn.CrossEntropyLoss()
+ criterion = criterion.cuda()
+ optimizer = torch.optim.SGD(
+ model.parameters(),
+ args.learning_rate,
+ momentum=args.momentum,
+ weight_decay=args.weight_decay
+ )
+
+ train_transform, valid_transform = utils._data_transforms_cifar10(args)
+ train_data = dset.CIFAR10(
+ root=args.data, train=True, download=True, transform=train_transform)
+ valid_data = dset.CIFAR10(
+ root=args.data, train=False, download=True, transform=valid_transform)
+
+ train_queue = torch.utils.data.DataLoader(
+ train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=2)
+
+ valid_queue = torch.utils.data.DataLoader(
+ valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=2)
+
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+ optimizer, float(args.epochs))
+
+ for epoch in range(args.epochs):
+ scheduler.step()
+ logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
+ model.drop_path_prob = args.drop_path_prob * epoch / args.epochs
+
+ train_acc, train_obj = train(train_queue, model, criterion, optimizer)
+ logging.info('train_acc %f', train_acc)
+
+ valid_acc, valid_obj = infer(valid_queue, model, criterion)
+ logging.info('valid_acc %f', valid_acc)
+
+ utils.save(model, os.path.join(args.save, 'weights.pt'))
+
+
+def train(train_queue, model, criterion, optimizer):
+ objs = utils.AvgrageMeter()
+ top1 = utils.AvgrageMeter()
+ top5 = utils.AvgrageMeter()
+ model.train()
+
+    for step, (input, target) in enumerate(train_queue):
+        input = input.cuda()
+        target = target.cuda(non_blocking=True)
+
+ optimizer.zero_grad()
+ logits, logits_aux = model(input)
+ loss = criterion(logits, target)
+ if args.auxiliary:
+ loss_aux = criterion(logits_aux, target)
+ loss += args.auxiliary_weight*loss_aux
+ loss.backward()
+        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
+ optimizer.step()
+
+ prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
+ n = input.size(0)
+        objs.update(loss.item(), n)
+        top1.update(prec1.item(), n)
+        top5.update(prec5.item(), n)
+
+ if step % args.report_freq == 0:
+ logging.info('train %03d %e %f %f', step,
+ objs.avg, top1.avg, top5.avg)
+
+ return top1.avg, objs.avg
+
+
+def infer(valid_queue, model, criterion):
+ objs = utils.AvgrageMeter()
+ top1 = utils.AvgrageMeter()
+ top5 = utils.AvgrageMeter()
+ model.eval()
+
+    for step, (input, target) in enumerate(valid_queue):
+        with torch.no_grad():
+            input = input.cuda()
+            target = target.cuda(non_blocking=True)
+
+            logits, _ = model(input)
+            loss = criterion(logits, target)
+
+            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
+            n = input.size(0)
+            objs.update(loss.item(), n)
+            top1.update(prec1.item(), n)
+            top5.update(prec5.item(), n)
+
+ if step % args.report_freq == 0:
+ logging.info('valid %03d %e %f %f', step,
+ objs.avg, top1.avg, top5.avg)
+
+ return top1.avg, objs.avg
+
+
+if __name__ == '__main__':
+ main()
diff --git a/darts_source/cnn/train_imagenet.py b/darts_source/cnn/train_imagenet.py
new file mode 100644
index 000000000..fdaa81b7d
--- /dev/null
+++ b/darts_source/cnn/train_imagenet.py
@@ -0,0 +1,255 @@
+import os
+import sys
+import numpy as np
+import time
+import torch
+import utils
+import glob
+import random
+import logging
+import argparse
+import torch.nn as nn
+import genotypes
+import torch.utils
+import torchvision.datasets as dset
+import torchvision.transforms as transforms
+import torch.backends.cudnn as cudnn
+
+from torch.autograd import Variable
+from model import NetworkImageNet as Network
+
+
+parser = argparse.ArgumentParser("imagenet")
+parser.add_argument('--data', type=str, default='../data/imagenet/',
+ help='location of the data corpus')
+parser.add_argument('--batch_size', type=int, default=128, help='batch size')
+parser.add_argument('--learning_rate', type=float,
+ default=0.1, help='init learning rate')
+parser.add_argument('--momentum', type=float, default=0.9, help='momentum')
+parser.add_argument('--weight_decay', type=float,
+ default=3e-5, help='weight decay')
+parser.add_argument('--report_freq', type=float,
+ default=100, help='report frequency')
+parser.add_argument('--gpu', type=int, default=0, help='gpu device id')
+parser.add_argument('--epochs', type=int, default=250,
+ help='num of training epochs')
+parser.add_argument('--init_channels', type=int,
+ default=48, help='num of init channels')
+parser.add_argument('--layers', type=int, default=14,
+ help='total number of layers')
+parser.add_argument('--auxiliary', action='store_true',
+ default=False, help='use auxiliary tower')
+parser.add_argument('--auxiliary_weight', type=float,
+ default=0.4, help='weight for auxiliary loss')
+parser.add_argument('--drop_path_prob', type=float,
+ default=0, help='drop path probability')
+parser.add_argument('--save', type=str, default='EXP', help='experiment name')
+parser.add_argument('--seed', type=int, default=0, help='random seed')
+parser.add_argument('--arch', type=str, default='DARTS',
+ help='which architecture to use')
+parser.add_argument('--grad_clip', type=float,
+ default=5., help='gradient clipping')
+parser.add_argument('--label_smooth', type=float,
+ default=0.1, help='label smoothing')
+parser.add_argument('--gamma', type=float, default=0.97,
+ help='learning rate decay')
+parser.add_argument('--decay_period', type=int, default=1,
+ help='epochs between two learning rate decays')
+parser.add_argument('--parallel', action='store_true',
+ default=False, help='data parallelism')
+args = parser.parse_args()
+
+args.save = 'eval-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
+utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+ format=log_format, datefmt='%m/%d %I:%M:%S %p')
+fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
+fh.setFormatter(logging.Formatter(log_format))
+logging.getLogger().addHandler(fh)
+
+CLASSES = 1000
+
+
+class CrossEntropyLabelSmooth(nn.Module):
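+    """Cross-entropy against smoothed targets: the one-hot target vector is
+    replaced by (1 - epsilon) * one_hot + epsilon / num_classes."""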
+
+ def __init__(self, num_classes, epsilon):
+ super(CrossEntropyLabelSmooth, self).__init__()
+ self.num_classes = num_classes
+ self.epsilon = epsilon
+ self.logsoftmax = nn.LogSoftmax(dim=1)
+
+ def forward(self, inputs, targets):
+ log_probs = self.logsoftmax(inputs)
+ targets = torch.zeros_like(log_probs).scatter_(
+ 1, targets.unsqueeze(1), 1)
+ targets = (1 - self.epsilon) * targets + \
+ self.epsilon / self.num_classes
+ loss = (-targets * log_probs).mean(0).sum()
+ return loss
+
+
+def main():
+ if not torch.cuda.is_available():
+ logging.info('no gpu device available')
+ sys.exit(1)
+
+ np.random.seed(args.seed)
+ torch.cuda.set_device(args.gpu)
+ cudnn.benchmark = True
+ torch.manual_seed(args.seed)
+ cudnn.enabled = True
+ torch.cuda.manual_seed(args.seed)
+ logging.info('gpu device = %d' % args.gpu)
+ logging.info("args = %s", args)
+
+    genotype = getattr(genotypes, args.arch)
+ model = Network(args.init_channels, CLASSES,
+ args.layers, args.auxiliary, genotype)
+ if args.parallel:
+ model = nn.DataParallel(model).cuda()
+ else:
+ model = model.cuda()
+
+ logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
+
+ criterion = nn.CrossEntropyLoss()
+ criterion = criterion.cuda()
+ criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth)
+ criterion_smooth = criterion_smooth.cuda()
+
+ optimizer = torch.optim.SGD(
+ model.parameters(),
+ args.learning_rate,
+ momentum=args.momentum,
+ weight_decay=args.weight_decay
+ )
+
+ traindir = os.path.join(args.data, 'train')
+ validdir = os.path.join(args.data, 'val')
+ normalize = transforms.Normalize(
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ train_data = dset.ImageFolder(
+ traindir,
+ transforms.Compose([
+ transforms.RandomResizedCrop(224),
+ transforms.RandomHorizontalFlip(),
+ transforms.ColorJitter(
+ brightness=0.4,
+ contrast=0.4,
+ saturation=0.4,
+ hue=0.2),
+ transforms.ToTensor(),
+ normalize,
+ ]))
+ valid_data = dset.ImageFolder(
+ validdir,
+ transforms.Compose([
+ transforms.Resize(256),
+ transforms.CenterCrop(224),
+ transforms.ToTensor(),
+ normalize,
+ ]))
+
+ train_queue = torch.utils.data.DataLoader(
+ train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=4)
+
+ valid_queue = torch.utils.data.DataLoader(
+ valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=4)
+
+ scheduler = torch.optim.lr_scheduler.StepLR(
+ optimizer, args.decay_period, gamma=args.gamma)
+
+ best_acc_top1 = 0
+ for epoch in range(args.epochs):
+ scheduler.step()
+ logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
+ model.drop_path_prob = args.drop_path_prob * epoch / args.epochs
+
+ train_acc, train_obj = train(
+ train_queue, model, criterion_smooth, optimizer)
+ logging.info('train_acc %f', train_acc)
+
+ valid_acc_top1, valid_acc_top5, valid_obj = infer(
+ valid_queue, model, criterion)
+ logging.info('valid_acc_top1 %f', valid_acc_top1)
+ logging.info('valid_acc_top5 %f', valid_acc_top5)
+
+ is_best = False
+ if valid_acc_top1 > best_acc_top1:
+ best_acc_top1 = valid_acc_top1
+ is_best = True
+
+ utils.save_checkpoint({
+ 'epoch': epoch + 1,
+ 'state_dict': model.state_dict(),
+ 'best_acc_top1': best_acc_top1,
+ 'optimizer': optimizer.state_dict(),
+ }, is_best, args.save)
+
+
+def train(train_queue, model, criterion, optimizer):
+ objs = utils.AvgrageMeter()
+ top1 = utils.AvgrageMeter()
+ top5 = utils.AvgrageMeter()
+ model.train()
+
+    for step, (input, target) in enumerate(train_queue):
+        input = input.cuda()
+        target = target.cuda(non_blocking=True)
+
+ optimizer.zero_grad()
+ logits, logits_aux = model(input)
+ loss = criterion(logits, target)
+ if args.auxiliary:
+ loss_aux = criterion(logits_aux, target)
+ loss += args.auxiliary_weight*loss_aux
+
+ loss.backward()
+        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
+ optimizer.step()
+
+ prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
+ n = input.size(0)
+        objs.update(loss.item(), n)
+        top1.update(prec1.item(), n)
+        top5.update(prec5.item(), n)
+
+ if step % args.report_freq == 0:
+ logging.info('train %03d %e %f %f', step,
+ objs.avg, top1.avg, top5.avg)
+
+ return top1.avg, objs.avg
+
+
+def infer(valid_queue, model, criterion):
+ objs = utils.AvgrageMeter()
+ top1 = utils.AvgrageMeter()
+ top5 = utils.AvgrageMeter()
+ model.eval()
+
+    for step, (input, target) in enumerate(valid_queue):
+        with torch.no_grad():
+            input = input.cuda()
+            target = target.cuda(non_blocking=True)
+
+            logits, _ = model(input)
+            loss = criterion(logits, target)
+
+            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
+            n = input.size(0)
+            objs.update(loss.item(), n)
+            top1.update(prec1.item(), n)
+            top5.update(prec5.item(), n)
+
+ if step % args.report_freq == 0:
+ logging.info('valid %03d %e %f %f', step,
+ objs.avg, top1.avg, top5.avg)
+
+ return top1.avg, top5.avg, objs.avg
+
+
+if __name__ == '__main__':
+ main()
diff --git a/darts_source/cnn/train_search.py b/darts_source/cnn/train_search.py
new file mode 100644
index 000000000..f20c027de
--- /dev/null
+++ b/darts_source/cnn/train_search.py
@@ -0,0 +1,194 @@
+import os
+import sys
+import time
+import glob
+import argparse
+import logging
+
+
+import torch
+import torch.nn as nn
+import torch.utils
+import torch.nn.functional as F
+import torchvision.datasets as dset
+import torch.backends.cudnn as cudnn
+from torch.autograd import Variable
+import numpy as np
+import nni
+
+# noinspection PyUnresolvedReferences
+from model_search import Network, tuner_params
+from architect import Architect
+import utils
+import model_parser
+
+# get params from files
+args = model_parser.get_cifar_parser_params()
+# tuner_params = nni.get_next_parameter()
+
+# build the experiment output directory
+args.save = 'search-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
+utils.create_exp_dir(os.path.join(tuner_params['output_path'], args.save),
+                     scripts_to_save=glob.glob('*.py'))
+
+
+# create log
+# basic settings
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+ format=log_format, datefmt='%m/%d %I:%M:%S %p')
+# add a file handler so the log is also written to the experiment directory
+log_file = logging.FileHandler(os.path.join(tuner_params['output_path'], args.save, 'log.txt'))
+log_file.setFormatter(logging.Formatter(log_format))
+logging.getLogger().addHandler(log_file)
+
+
+CIFAR_CLASSES = 10
+
+
+def main():
+ # Information Output
+ # check gpu
+ logging.info("%s", tuner_params["dataset_path"])
+ if not torch.cuda.is_available():
+ logging.info('NO GPU DEVICE AVAILABLE')
+ sys.exit(1)
+ logging.info("Model Params = %s", args)
+ np.random.seed(args.seed)
+ torch.cuda.set_device(args.gpu)
+ logging.info('gpu device = %d' % args.gpu)
+
+ cudnn.benchmark = True
+ cudnn.enabled = True
+ torch.manual_seed(args.seed)
+ torch.cuda.manual_seed(args.seed)
+
+ criterion_loss = nn.CrossEntropyLoss()
+ criterion_loss = criterion_loss.cuda()
+ model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion_loss)
+
+ model = model.cuda()
+
+ logging.info("Model Param size = %fMB", utils.count_parameters(model))
+
+ optimizer = torch.optim.SGD(model.parameters(),
+ args.learning_rate,
+ momentum=args.momentum,
+ weight_decay=args.weight_decay)
+
+ train_transform, valid_transform = utils._data_transforms_cifar10(args)
+ train_data = dset.CIFAR10(
+ root=tuner_params["dataset_path"], train=True, download=True,
+ transform=train_transform)
+
+ num_train = len(train_data)
+ indices = list(range(num_train))
+ split = int(np.floor(args.train_portion * num_train))
+
+ train_dataloader = torch.utils.data.DataLoader(
+ train_data, batch_size=args.batch_size,
+ sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
+ pin_memory=True, num_workers=2)
+
+ valid_dataloader = torch.utils.data.DataLoader(
+ train_data, batch_size=args.batch_size,
+ sampler=torch.utils.data.sampler.SubsetRandomSampler(
+ indices[split:num_train]),
+ pin_memory=True, num_workers=2)
+
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+ optimizer, float(args.epochs), eta_min=args.learning_rate_min)
+
+ architect = Architect(model, args)
+
+ for epoch in range(args.epochs):
+ scheduler.step()
+ lr = scheduler.get_lr()[0]
+
+ genotype = model.genotype()
+ logging.info('epoch %d lr %e', epoch, lr)
+ logging.info('genotype = %s', genotype)
+
+ print(F.softmax(model.alphas_normal, dim=-1))
+ print(F.softmax(model.alphas_reduce, dim=-1))
+
+ # training
+ train_acc, train_obj = train(train_dataloader, valid_dataloader, model, architect, criterion_loss, optimizer, lr)
+
+ # validation
+ valid_acc, valid_obj = val(valid_dataloader, model, criterion_loss)
+ logging.info('train_acc %f', train_acc)
+ logging.info('valid_acc %f', valid_acc)
+
+ utils.save(model, os.path.join(tuner_params['output_path'], args.save, 'weights.pt'))
+
+
+def train(train_dataloader, valid_dataloader, model, architect, criterion_loss, optimizer, lr):
+ objs = utils.AvgrageMeter()
+ top1 = utils.AvgrageMeter()
+ top5 = utils.AvgrageMeter()
+
+ for step, (input, target) in enumerate(train_dataloader):
+ model.train()
+ n = input.size(0)
+
+        input = input.cuda()
+        target = target.cuda(non_blocking=True)
+
+        # get a random minibatch from the search queue with replacement
+        input_search, target_search = next(iter(valid_dataloader))
+        input_search = input_search.cuda()
+        target_search = target_search.cuda(non_blocking=True)
+
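+        # Bilevel step: update the architecture weights (alphas) on the
+        # held-out batch before the network-weight update below;
+        # args.unrolled selects the second-order approximation.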
+ architect.step(input, target, input_search, target_search,
+ lr, optimizer, unrolled=args.unrolled)
+
+ optimizer.zero_grad()
+ logits = model(input)
+ loss = criterion_loss(logits, target)
+
+ loss.backward()
+        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
+ optimizer.step()
+
+ prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
+ objs.update(loss.item(), n)
+ top1.update(prec1.item(), n)
+ top5.update(prec5.item(), n)
+
+ if step % args.report_freq == 0:
+ logging.info('train %03d %e %f %f', step,
+ objs.avg, top1.avg, top5.avg)
+ nni.report_intermediate_result(objs.avg)
+
+    return top1.avg, objs.avg
+
+
+def val(valid_dataloader, model, criterion_loss):
+ objs = utils.AvgrageMeter()
+ top1 = utils.AvgrageMeter()
+ top5 = utils.AvgrageMeter()
+ model.eval()
+
+ for step, (input, target) in enumerate(valid_dataloader):
+ with torch.no_grad():
+            input = input.cuda()
+            target = target.cuda(non_blocking=True)
+
+ logits = model(input)
+ loss = criterion_loss(logits, target)
+
+ prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
+ n = input.size(0)
+ objs.update(loss.item(), n)
+ top1.update(prec1.item(), n)
+ top5.update(prec5.item(), n)
+
+ if step % args.report_freq == 0:
+ logging.info('valid %03d %e %f %f', step,
+ objs.avg, top1.avg, top5.avg)
+
+    return top1.avg, objs.avg
+
+
+if __name__ == '__main__':
+ main()
diff --git a/darts_source/cnn/utils.py b/darts_source/cnn/utils.py
new file mode 100644
index 000000000..58bb733a0
--- /dev/null
+++ b/darts_source/cnn/utils.py
@@ -0,0 +1,130 @@
+"""
+File: utils.py
+Author: OccamRazer
+Email: vincent.duan95@outlook.com
+Github: https://github.com/VDeamoV
+Description: Shared utility functions and classes reused by the training scripts
+"""
+
+import os
+import shutil
+
+
+import numpy as np
+import torch
+from torch.autograd import Variable
+import torchvision.transforms as transforms
+
+
+class AvgrageMeter(object):
+ def __init__(self):
+ self.reset()
+
+ def reset(self):
+ self.avg = 0
+ self.sum = 0
+ self.cnt = 0
+
+ def update(self, val, n=1):
+ self.sum += val * n
+ self.cnt += n
+ self.avg = self.sum / self.cnt
+
+
+def accuracy(output, target, topk=(1,)):
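+    """Compute precision@k (as a percentage) for each k in `topk`."""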
+ maxk = max(topk)
+ batch_size = target.size(0)
+
+ _, pred = output.topk(maxk, 1, True, True)
+ pred = pred.t()
+ correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+ res = []
+ for k in topk:
+        correct_k = correct[:k].reshape(-1).float().sum(0)
+ res.append(correct_k.mul_(100.0/batch_size))
+ return res
+
+
+class Cutout(object):
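+    """Cutout augmentation: zero out one randomly placed square patch of
+    side `length` in the input image tensor."""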
+ def __init__(self, length):
+ self.length = length
+
+ def __call__(self, img):
+ h, w = img.size(1), img.size(2)
+ mask = np.ones((h, w), np.float32)
+ y = np.random.randint(h)
+ x = np.random.randint(w)
+
+ y1 = np.clip(y - self.length // 2, 0, h)
+ y2 = np.clip(y + self.length // 2, 0, h)
+ x1 = np.clip(x - self.length // 2, 0, w)
+ x2 = np.clip(x + self.length // 2, 0, w)
+
+ mask[y1: y2, x1: x2] = 0.
+ mask = torch.from_numpy(mask)
+ mask = mask.expand_as(img)
+ img *= mask
+ return img
+
+
+def _data_transforms_cifar10(args):
+ CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
+ CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]
+
+ train_transform = transforms.Compose([
+ transforms.RandomCrop(32, padding=4),
+ transforms.RandomHorizontalFlip(),
+ transforms.ToTensor(),
+ transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
+ ])
+ if args.cutout:
+ train_transform.transforms.append(Cutout(args.cutout_length))
+
+ valid_transform = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
+ ])
+ return train_transform, valid_transform
+
+
+def count_parameters_in_MB(model):
+    return sum(np.prod(v.size()) for name, v in model.named_parameters()
+               if "auxiliary" not in name) / 1e6
+
+
+# train_search.py calls utils.count_parameters, while train.py and
+# train_imagenet.py call utils.count_parameters_in_MB; keep both names.
+count_parameters = count_parameters_in_MB
+
+
+def save_checkpoint(state, is_best, save):
+ filename = os.path.join(save, 'checkpoint.pth.tar')
+ torch.save(state, filename)
+ if is_best:
+ best_filename = os.path.join(save, 'model_best.pth.tar')
+ shutil.copyfile(filename, best_filename)
+
+
+def save(model, model_path):
+ torch.save(model.state_dict(), model_path)
+
+
+def load(model, model_path):
+ model.load_state_dict(torch.load(model_path))
+
+
+def drop_path(x, drop_prob):
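+    """Zero out entire samples with probability drop_prob, rescaling the
+    survivors by 1/keep_prob so the expected value of x is unchanged."""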
+    if drop_prob > 0.:
+        keep_prob = 1. - drop_prob
+        mask = torch.empty(x.size(0), 1, 1, 1, device=x.device).bernoulli_(keep_prob)
+        x.div_(keep_prob)
+        x.mul_(mask)
+    return x
+
+
+def create_exp_dir(path, scripts_to_save=None):
+ if not os.path.exists(path):
+ os.mkdir(path)
+
+ print('Experiment Log Path : {}'.format(path))
+
+ if scripts_to_save is not None:
+ os.mkdir(os.path.join(path, 'scripts'))
+ for script in scripts_to_save:
+ dst_file = os.path.join(path, 'scripts', os.path.basename(script))
+ shutil.copyfile(script, dst_file)
diff --git a/darts_source/cnn/visualize.py b/darts_source/cnn/visualize.py
new file mode 100644
index 000000000..8c25a3980
--- /dev/null
+++ b/darts_source/cnn/visualize.py
@@ -0,0 +1,55 @@
+import sys
+import genotypes
+from graphviz import Digraph
+
+
+def plot(genotype, filename):
+ g = Digraph(
+ format='pdf',
+ edge_attr=dict(fontsize='20', fontname="times"),
+ node_attr=dict(style='filled', shape='rect', align='center', fontsize='20',
+ height='0.5', width='0.5', penwidth='2', fontname="times"),
+ engine='dot')
+ g.body.extend(['rankdir=LR'])
+
+ g.node("c_{k-2}", fillcolor='darkseagreen2')
+ g.node("c_{k-1}", fillcolor='darkseagreen2')
+ assert len(genotype) % 2 == 0
+ steps = len(genotype) // 2
+
+ for i in range(steps):
+ g.node(str(i), fillcolor='lightblue')
+
+ for i in range(steps):
+ for k in [2*i, 2*i + 1]:
+ op, j = genotype[k]
+ if j == 0:
+ u = "c_{k-2}"
+ elif j == 1:
+ u = "c_{k-1}"
+ else:
+ u = str(j-2)
+ v = str(i)
+ g.edge(u, v, label=op, fillcolor="gray")
+
+ g.node("c_{k}", fillcolor='palegoldenrod')
+ for i in range(steps):
+ g.edge(str(i), "c_{k}", fillcolor="gray")
+
+ g.render(filename, view=True)
+
+
+if __name__ == '__main__':
+ if len(sys.argv) != 2:
+ print("usage:\n python {} ARCH_NAME".format(sys.argv[0]))
+ sys.exit(1)
+
+ genotype_name = sys.argv[1]
+ try:
+        genotype = getattr(genotypes, genotype_name)
+ except AttributeError:
+ print("{} is not specified in genotypes.py".format(genotype_name))
+ sys.exit(1)
+
+ plot(genotype.normal, "normal")
+ plot(genotype.reduce, "reduction")
diff --git a/darts_source/data/cifar-10-batches-py/data_batch_4 b/darts_source/data/cifar-10-batches-py/data_batch_4
new file mode 100644
index 000000000..5b0af104f
Binary files /dev/null and b/darts_source/data/cifar-10-batches-py/data_batch_4 differ
diff --git a/darts_source/data/cifar-10-batches-py/test_batch b/darts_source/data/cifar-10-batches-py/test_batch
new file mode 100644
index 000000000..a650e54ea
Binary files /dev/null and b/darts_source/data/cifar-10-batches-py/test_batch differ
diff --git a/darts_source/rnn/architect.py b/darts_source/rnn/architect.py
new file mode 100644
index 000000000..101882fa6
--- /dev/null
+++ b/darts_source/rnn/architect.py
@@ -0,0 +1,113 @@
+import torch
+import numpy as np
+import torch.nn as nn
+from torch.autograd import Variable
+
+
+def _concat(xs):
+ return torch.cat([x.view(-1) for x in xs])
+
+
+def _clip(grads, max_norm):
+ total_norm = 0
+ for g in grads:
+ param_norm = g.data.norm(2)
+ total_norm += param_norm ** 2
+ total_norm = total_norm ** 0.5
+ clip_coef = max_norm / (total_norm + 1e-6)
+ if clip_coef < 1:
+ for g in grads:
+ g.data.mul_(clip_coef)
+ return clip_coef
+
+
+class Architect(object):
+
+ def __init__(self, model, args):
+ self.network_weight_decay = args.wdecay
+ self.network_clip = args.clip
+ self.model = model
+ self.optimizer = torch.optim.Adam(self.model.arch_parameters(), lr=args.arch_lr, weight_decay=args.arch_wdecay)
+
+ def _compute_unrolled_model(self, hidden, input, target, eta):
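+        # One "unrolled" SGD step on the training loss: build
+        # w' = w - eta * (grad_w L_train(w) + weight_decay * w), with the
+        # gradients clipped first, and return a model holding these weights.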
+ loss, hidden_next = self.model._loss(hidden, input, target)
+ theta = _concat(self.model.parameters()).data
+ grads = torch.autograd.grad(loss, self.model.parameters())
+ clip_coef = _clip(grads, self.network_clip)
+ dtheta = _concat(grads).data + self.network_weight_decay*theta
+        unrolled_model = self._construct_model_from_theta(theta.sub(dtheta, alpha=eta))
+ return unrolled_model, clip_coef
+
+ def step(self,
+ hidden_train, input_train, target_train,
+ hidden_valid, input_valid, target_valid,
+ network_optimizer, unrolled):
+ eta = network_optimizer.param_groups[0]['lr']
+ self.optimizer.zero_grad()
+ if unrolled:
+ hidden = self._backward_step_unrolled(hidden_train, input_train, target_train, hidden_valid, input_valid, target_valid, eta)
+ else:
+ hidden = self._backward_step(hidden_valid, input_valid, target_valid)
+ self.optimizer.step()
+ return hidden, None
+
+ def _backward_step(self, hidden, input, target):
+ loss, hidden_next = self.model._loss(hidden, input, target)
+ loss.backward()
+ return hidden_next
+
+ def _backward_step_unrolled(self,
+ hidden_train, input_train, target_train,
+ hidden_valid, input_valid, target_valid, eta):
+ unrolled_model, clip_coef = self._compute_unrolled_model(hidden_train, input_train, target_train, eta)
+ unrolled_loss, hidden_next = unrolled_model._loss(hidden_valid, input_valid, target_valid)
+
+ unrolled_loss.backward()
+ dalpha = [v.grad for v in unrolled_model.arch_parameters()]
+ dtheta = [v.grad for v in unrolled_model.parameters()]
+ _clip(dtheta, self.network_clip)
+ vector = [dt.data for dt in dtheta]
+ implicit_grads = self._hessian_vector_product(vector, hidden_train, input_train, target_train, r=1e-2)
+
+ for g, ig in zip(dalpha, implicit_grads):
+            g.data.sub_(ig.data, alpha=eta * clip_coef)
+
+ for v, g in zip(self.model.arch_parameters(), dalpha):
+ if v.grad is None:
+ v.grad = Variable(g.data)
+ else:
+ v.grad.data.copy_(g.data)
+ return hidden_next
+
+ def _construct_model_from_theta(self, theta):
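+        # Clone the search model and load its parameters from the flat
+        # vector theta, keeping the original state_dict layout.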
+ model_new = self.model.new()
+ model_dict = self.model.state_dict()
+
+ params, offset = {}, 0
+ for k, v in self.model.named_parameters():
+ v_length = np.prod(v.size())
+ params[k] = theta[offset: offset+v_length].view(v.size())
+ offset += v_length
+
+ assert offset == len(theta)
+ model_dict.update(params)
+ model_new.load_state_dict(model_dict)
+ return model_new.cuda()
+
+ def _hessian_vector_product(self, vector, hidden, input, target, r=1e-2):
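+        # Central finite difference: perturb the weights in place by +/- R*v
+        # and approximate the Hessian-vector product as
+        # (grad_alpha L(w + R*v) - grad_alpha L(w - R*v)) / (2R).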
+ R = r / _concat(vector).norm()
+ for p, v in zip(self.model.parameters(), vector):
+            p.data.add_(v, alpha=R)
+ loss, _ = self.model._loss(hidden, input, target)
+ grads_p = torch.autograd.grad(loss, self.model.arch_parameters())
+
+ for p, v in zip(self.model.parameters(), vector):
+            p.data.sub_(v, alpha=2*R)
+ loss, _ = self.model._loss(hidden, input, target)
+ grads_n = torch.autograd.grad(loss, self.model.arch_parameters())
+
+ for p, v in zip(self.model.parameters(), vector):
+            p.data.add_(v, alpha=R)
+
+ return [(x-y).div_(2*R) for x, y in zip(grads_p, grads_n)]
+
diff --git a/darts_source/rnn/data.py b/darts_source/rnn/data.py
new file mode 100644
index 000000000..7caace5d3
--- /dev/null
+++ b/darts_source/rnn/data.py
@@ -0,0 +1,128 @@
+import os
+import torch
+
+from collections import Counter
+
+
+class Dictionary(object):
+ def __init__(self):
+ self.word2idx = {}
+ self.idx2word = []
+ self.counter = Counter()
+ self.total = 0
+
+ def add_word(self, word):
+ if word not in self.word2idx:
+ self.idx2word.append(word)
+ self.word2idx[word] = len(self.idx2word) - 1
+ token_id = self.word2idx[word]
+ self.counter[token_id] += 1
+ self.total += 1
+ return self.word2idx[word]
+
+ def __len__(self):
+ return len(self.idx2word)
+
+
+class Corpus(object):
+ def __init__(self, path):
+ self.dictionary = Dictionary()
+ self.train = self.tokenize(os.path.join(path, 'train.txt'))
+ self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
+ self.test = self.tokenize(os.path.join(path, 'test.txt'))
+
+ def tokenize(self, path):
+ """Tokenizes a text file."""
+ assert os.path.exists(path)
+ # Add words to the dictionary
+ with open(path, 'r', encoding='utf-8') as f:
+ tokens = 0
+ for line in f:
+                words = line.split() + ['<eos>']
+ tokens += len(words)
+ for word in words:
+ self.dictionary.add_word(word)
+
+ # Tokenize file content
+ with open(path, 'r', encoding='utf-8') as f:
+ ids = torch.LongTensor(tokens)
+ token = 0
+ for line in f:
+                words = line.split() + ['<eos>']
+ for word in words:
+ ids[token] = self.dictionary.word2idx[word]
+ token += 1
+
+ return ids
+
+class SentCorpus(object):
+ def __init__(self, path):
+ self.dictionary = Dictionary()
+ self.train = self.tokenize(os.path.join(path, 'train.txt'))
+ self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
+ self.test = self.tokenize(os.path.join(path, 'test.txt'))
+
+ def tokenize(self, path):
+ """Tokenizes a text file."""
+ assert os.path.exists(path)
+ # Add words to the dictionary
+ with open(path, 'r', encoding='utf-8') as f:
+ tokens = 0
+ for line in f:
+                words = line.split() + ['<eos>']
+ tokens += len(words)
+ for word in words:
+ self.dictionary.add_word(word)
+
+ # Tokenize file content
+ sents = []
+ with open(path, 'r', encoding='utf-8') as f:
+ for line in f:
+ if not line:
+ continue
+            words = line.split() + ['<eos>']
+ sent = torch.LongTensor(len(words))
+ for i, word in enumerate(words):
+ sent[i] = self.dictionary.word2idx[word]
+ sents.append(sent)
+
+ return sents
+
+class BatchSentLoader(object):
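+    """Iterate over sentences in order of increasing length, yielding
+    (max_len, batch_size) LongTensors padded with `pad_id`."""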
+ def __init__(self, sents, batch_size, pad_id=0, cuda=False, volatile=False):
+ self.sents = sents
+ self.batch_size = batch_size
+ self.sort_sents = sorted(sents, key=lambda x: x.size(0))
+ self.cuda = cuda
+ self.volatile = volatile
+ self.pad_id = pad_id
+
+ def __next__(self):
+ if self.idx >= len(self.sort_sents):
+ raise StopIteration
+
+ batch_size = min(self.batch_size, len(self.sort_sents)-self.idx)
+ batch = self.sort_sents[self.idx:self.idx+batch_size]
+ max_len = max([s.size(0) for s in batch])
+ tensor = torch.LongTensor(max_len, batch_size).fill_(self.pad_id)
+ for i in range(len(batch)):
+ s = batch[i]
+ tensor[:s.size(0),i].copy_(s)
+ if self.cuda:
+ tensor = tensor.cuda()
+
+ self.idx += batch_size
+
+ return tensor
+
+ next = __next__
+
+ def __iter__(self):
+ self.idx = 0
+ return self
+
+if __name__ == '__main__':
+ corpus = SentCorpus('../penn')
+ loader = BatchSentLoader(corpus.test, 10)
+ for i, d in enumerate(loader):
+ print(i, d.size())
diff --git a/darts_source/rnn/genotypes.py b/darts_source/rnn/genotypes.py
new file mode 100644
index 000000000..b278be170
--- /dev/null
+++ b/darts_source/rnn/genotypes.py
@@ -0,0 +1,36 @@
+from collections import namedtuple
+
+Genotype = namedtuple('Genotype', 'recurrent concat')
+
+PRIMITIVES = [
+ 'none',
+ 'tanh',
+ 'relu',
+ 'sigmoid',
+ 'identity'
+]
+STEPS = 8
+CONCAT = 8
+
+ENAS = Genotype(
+ recurrent = [
+ ('tanh', 0),
+ ('tanh', 1),
+ ('relu', 1),
+ ('tanh', 3),
+ ('tanh', 3),
+ ('relu', 3),
+ ('relu', 4),
+ ('relu', 7),
+ ('relu', 8),
+ ('relu', 8),
+ ('relu', 8),
+ ],
+ concat = [2, 5, 6, 9, 10, 11]
+)
+
+DARTS_V1 = Genotype(recurrent=[('relu', 0), ('relu', 1), ('tanh', 2), ('relu', 3), ('relu', 4), ('identity', 1), ('relu', 5), ('relu', 1)], concat=range(1, 9))
+DARTS_V2 = Genotype(recurrent=[('sigmoid', 0), ('relu', 1), ('relu', 1), ('identity', 1), ('tanh', 2), ('sigmoid', 5), ('tanh', 3), ('relu', 5)], concat=range(1, 9))
+
+DARTS = DARTS_V2
+
diff --git a/darts_source/rnn/model.py b/darts_source/rnn/model.py
new file mode 100644
index 000000000..42e4f2160
--- /dev/null
+++ b/darts_source/rnn/model.py
@@ -0,0 +1,160 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from genotypes import STEPS
+from utils import mask2d
+from utils import LockedDropout
+from utils import embedded_dropout
+from torch.autograd import Variable
+
+INITRANGE = 0.04
+
+
+class DARTSCell(nn.Module):
+
+ def __init__(self, ninp, nhid, dropouth, dropoutx, genotype):
+ super(DARTSCell, self).__init__()
+ self.nhid = nhid
+ self.dropouth = dropouth
+ self.dropoutx = dropoutx
+ self.genotype = genotype
+
+ # genotype is None when doing arch search
+ steps = len(self.genotype.recurrent) if self.genotype is not None else STEPS
+ self._W0 = nn.Parameter(torch.Tensor(ninp+nhid, 2*nhid).uniform_(-INITRANGE, INITRANGE))
+ self._Ws = nn.ParameterList([
+ nn.Parameter(torch.Tensor(nhid, 2*nhid).uniform_(-INITRANGE, INITRANGE)) for i in range(steps)
+ ])
+
+ def forward(self, inputs, hidden):
+ T, B = inputs.size(0), inputs.size(1)
+
+ if self.training:
+ x_mask = mask2d(B, inputs.size(2), keep_prob=1.-self.dropoutx)
+ h_mask = mask2d(B, hidden.size(2), keep_prob=1.-self.dropouth)
+ else:
+ x_mask = h_mask = None
+
+ hidden = hidden[0]
+ hiddens = []
+ for t in range(T):
+ hidden = self.cell(inputs[t], hidden, x_mask, h_mask)
+ hiddens.append(hidden)
+ hiddens = torch.stack(hiddens)
+ return hiddens, hiddens[-1].unsqueeze(0)
+
+ def _compute_init_state(self, x, h_prev, x_mask, h_mask):
+ if self.training:
+ xh_prev = torch.cat([x * x_mask, h_prev * h_mask], dim=-1)
+ else:
+ xh_prev = torch.cat([x, h_prev], dim=-1)
+ c0, h0 = torch.split(xh_prev.mm(self._W0), self.nhid, dim=-1)
+ c0 = c0.sigmoid()
+ h0 = h0.tanh()
+ s0 = h_prev + c0 * (h0-h_prev)
+ return s0
+
+ def _get_activation(self, name):
+        if name == 'tanh':
+            f = torch.tanh
+        elif name == 'relu':
+            f = F.relu
+        elif name == 'sigmoid':
+            f = torch.sigmoid
+ elif name == 'identity':
+ f = lambda x: x
+ else:
+ raise NotImplementedError
+ return f
+
+ def cell(self, x, h_prev, x_mask, h_mask):
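+        # Each intermediate state is s = s_prev + c * (f(h) - s_prev), where
+        # c is a sigmoid gate and f the activation named in the genotype; the
+        # cell output is the mean of the states listed in genotype.concat.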
+ s0 = self._compute_init_state(x, h_prev, x_mask, h_mask)
+
+ states = [s0]
+ for i, (name, pred) in enumerate(self.genotype.recurrent):
+ s_prev = states[pred]
+ if self.training:
+ ch = (s_prev * h_mask).mm(self._Ws[i])
+ else:
+ ch = s_prev.mm(self._Ws[i])
+ c, h = torch.split(ch, self.nhid, dim=-1)
+ c = c.sigmoid()
+ fn = self._get_activation(name)
+ h = fn(h)
+ s = s_prev + c * (h-s_prev)
+ states += [s]
+ output = torch.mean(torch.stack([states[i] for i in self.genotype.concat], -1), -1)
+ return output
+
+
+class RNNModel(nn.Module):
+ """Container module with an encoder, a recurrent module, and a decoder."""
+
+ def __init__(self, ntoken, ninp, nhid, nhidlast,
+ dropout=0.5, dropouth=0.5, dropoutx=0.5, dropouti=0.5, dropoute=0.1,
+ cell_cls=DARTSCell, genotype=None):
+ super(RNNModel, self).__init__()
+ self.lockdrop = LockedDropout()
+ self.encoder = nn.Embedding(ntoken, ninp)
+
+ assert ninp == nhid == nhidlast
+ if cell_cls == DARTSCell:
+ assert genotype is not None
+ self.rnns = [cell_cls(ninp, nhid, dropouth, dropoutx, genotype)]
+ else:
+ assert genotype is None
+ self.rnns = [cell_cls(ninp, nhid, dropouth, dropoutx)]
+
+ self.rnns = torch.nn.ModuleList(self.rnns)
+ self.decoder = nn.Linear(ninp, ntoken)
+ self.decoder.weight = self.encoder.weight
+ self.init_weights()
+
+ self.ninp = ninp
+ self.nhid = nhid
+ self.nhidlast = nhidlast
+ self.dropout = dropout
+ self.dropouti = dropouti
+ self.dropoute = dropoute
+ self.ntoken = ntoken
+ self.cell_cls = cell_cls
+
+ def init_weights(self):
+ self.encoder.weight.data.uniform_(-INITRANGE, INITRANGE)
+ self.decoder.bias.data.fill_(0)
+ self.decoder.weight.data.uniform_(-INITRANGE, INITRANGE)
+
+ def forward(self, input, hidden, return_h=False):
+ batch_size = input.size(1)
+
+ emb = embedded_dropout(self.encoder, input, dropout=self.dropoute if self.training else 0)
+ emb = self.lockdrop(emb, self.dropouti)
+
+ raw_output = emb
+ new_hidden = []
+ raw_outputs = []
+ outputs = []
+ for l, rnn in enumerate(self.rnns):
+ current_input = raw_output
+ raw_output, new_h = rnn(raw_output, hidden[l])
+ new_hidden.append(new_h)
+ raw_outputs.append(raw_output)
+ hidden = new_hidden
+
+ output = self.lockdrop(raw_output, self.dropout)
+ outputs.append(output)
+
+ logit = self.decoder(output.view(-1, self.ninp))
+ log_prob = nn.functional.log_softmax(logit, dim=-1)
+ model_output = log_prob
+ model_output = model_output.view(-1, batch_size, self.ntoken)
+
+ if return_h:
+ return model_output, hidden, raw_outputs, outputs
+ return model_output, hidden
+
+ def init_hidden(self, bsz):
+ weight = next(self.parameters()).data
+ return [Variable(weight.new(1, bsz, self.nhid).zero_())]
+
diff --git a/darts_source/rnn/model_search.py b/darts_source/rnn/model_search.py
new file mode 100644
index 000000000..b651415b5
--- /dev/null
+++ b/darts_source/rnn/model_search.py
@@ -0,0 +1,96 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from genotypes import PRIMITIVES, STEPS, CONCAT, Genotype
+from torch.autograd import Variable
+from collections import namedtuple
+from model import DARTSCell, RNNModel
+
+
+class DARTSCellSearch(DARTSCell):
+
+ def __init__(self, ninp, nhid, dropouth, dropoutx):
+ super(DARTSCellSearch, self).__init__(ninp, nhid, dropouth, dropoutx, genotype=None)
+ self.bn = nn.BatchNorm1d(nhid, affine=False)
+
+ def cell(self, x, h_prev, x_mask, h_mask):
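+        # Continuous relaxation: apply every candidate activation to every
+        # predecessor state and blend the results with softmax(self.weights),
+        # instead of committing to a single op per edge.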
+ s0 = self._compute_init_state(x, h_prev, x_mask, h_mask)
+ s0 = self.bn(s0)
+ probs = F.softmax(self.weights, dim=-1)
+
+ offset = 0
+ states = s0.unsqueeze(0)
+ for i in range(STEPS):
+ if self.training:
+ masked_states = states * h_mask.unsqueeze(0)
+ else:
+ masked_states = states
+ ch = masked_states.view(-1, self.nhid).mm(self._Ws[i]).view(i+1, -1, 2*self.nhid)
+ c, h = torch.split(ch, self.nhid, dim=-1)
+ c = c.sigmoid()
+
+ s = torch.zeros_like(s0)
+ for k, name in enumerate(PRIMITIVES):
+ if name == 'none':
+ continue
+ fn = self._get_activation(name)
+ unweighted = states + c * (fn(h) - states)
+ s += torch.sum(probs[offset:offset+i+1, k].unsqueeze(-1).unsqueeze(-1) * unweighted, dim=0)
+ s = self.bn(s)
+ states = torch.cat([states, s.unsqueeze(0)], 0)
+ offset += i+1
+ output = torch.mean(states[-CONCAT:], dim=0)
+ return output
+
+
+class RNNModelSearch(RNNModel):
+
+ def __init__(self, *args):
+ super(RNNModelSearch, self).__init__(*args, cell_cls=DARTSCellSearch, genotype=None)
+ self._args = args
+ self._initialize_arch_parameters()
+
+ def new(self):
+ model_new = RNNModelSearch(*self._args)
+ for x, y in zip(model_new.arch_parameters(), self.arch_parameters()):
+ x.data.copy_(y.data)
+ return model_new
+
+ def _initialize_arch_parameters(self):
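+        # Node i has i+1 incoming edges, so there are k = 1 + 2 + ... + STEPS
+        # edges in total, each with one row of op logits.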
+ k = sum(i for i in range(1, STEPS+1))
+ weights_data = torch.randn(k, len(PRIMITIVES)).mul_(1e-3)
+ self.weights = Variable(weights_data.cuda(), requires_grad=True)
+ self._arch_parameters = [self.weights]
+ for rnn in self.rnns:
+ rnn.weights = self.weights
+
+ def arch_parameters(self):
+ return self._arch_parameters
+
+ def _loss(self, hidden, input, target):
+ log_prob, hidden_next = self(input, hidden, return_h=False)
+ loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), target)
+ return loss, hidden_next
+
+ def genotype(self):
+
+ def _parse(probs):
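+            # For each node, keep the single incoming edge whose strongest
+            # non-'none' operation has the largest weight, and record that op.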
+ gene = []
+ start = 0
+ for i in range(STEPS):
+ end = start + i + 1
+ W = probs[start:end].copy()
+ j = sorted(range(i + 1), key=lambda x: -max(W[x][k] for k in range(len(W[x])) if k != PRIMITIVES.index('none')))[0]
+ k_best = None
+ for k in range(len(W[j])):
+ if k != PRIMITIVES.index('none'):
+ if k_best is None or W[j][k] > W[j][k_best]:
+ k_best = k
+ gene.append((PRIMITIVES[k_best], j))
+ start = end
+ return gene
+
+ gene = _parse(F.softmax(self.weights, dim=-1).data.cpu().numpy())
+ genotype = Genotype(recurrent=gene, concat=range(STEPS+1)[-CONCAT:])
+ return genotype
+
diff --git a/darts_source/rnn/test.py b/darts_source/rnn/test.py
new file mode 100644
index 000000000..771ac8418
--- /dev/null
+++ b/darts_source/rnn/test.py
@@ -0,0 +1,121 @@
+import argparse
+import os, sys
+import time
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.backends.cudnn as cudnn
+
+import data
+import model
+
+from utils import batchify, get_batch, repackage_hidden, create_exp_dir, save_checkpoint
+
+parser = argparse.ArgumentParser(description='PyTorch PennTreeBank/WikiText2 Language Model')
+parser.add_argument('--data', type=str, default='../data/penn/',
+ help='location of the data corpus')
+parser.add_argument('--emsize', type=int, default=850,
+ help='size of word embeddings')
+parser.add_argument('--nhid', type=int, default=850,
+ help='number of hidden units per layer')
+parser.add_argument('--nhidlast', type=int, default=850,
+ help='number of hidden units for the last rnn layer')
+parser.add_argument('--lr', type=float, default=20,
+ help='initial learning rate')
+parser.add_argument('--clip', type=float, default=0.25,
+ help='gradient clipping')
+parser.add_argument('--epochs', type=int, default=8000,
+ help='upper epoch limit')
+parser.add_argument('--batch_size', type=int, default=64, metavar='N',
+ help='batch size')
+parser.add_argument('--bptt', type=int, default=35,
+ help='sequence length')
+parser.add_argument('--dropout', type=float, default=0.75,
+ help='dropout applied to layers (0 = no dropout)')
+parser.add_argument('--dropouth', type=float, default=0.3,
+ help='dropout for rnn layers (0 = no dropout)')
+parser.add_argument('--dropouti', type=float, default=0.2,
+ help='dropout for input embedding layers (0 = no dropout)')
+parser.add_argument('--dropoute', type=float, default=0.2,
+ help='dropout to remove words from embedding layer (0 = no dropout)')
+parser.add_argument('--seed', type=int, default=1267,
+ help='random seed')
+parser.add_argument('--nonmono', type=int, default=5,
+                    help='non-monotone trigger window (epochs)')
+parser.add_argument('--cuda', action='store_false',
+ help='use CUDA')
+parser.add_argument('--log-interval', type=int, default=200, metavar='N',
+ help='report interval')
+parser.add_argument('--model_path', type=str, default='EXP/model.pt',
+ help='path to load the pretrained model')
+parser.add_argument('--alpha', type=float, default=0,
+ help='alpha L2 regularization on RNN activation (alpha = 0 means no regularization)')
+parser.add_argument('--beta', type=float, default=1e-3,
+                    help='beta slowness regularization applied on RNN activation (beta = 0 means no regularization)')
+parser.add_argument('--wdecay', type=float, default=5e-7,
+ help='weight decay applied to all weights')
+parser.add_argument('--continue_train', action='store_true',
+ help='continue train from a checkpoint')
+parser.add_argument('--n_experts', type=int, default=1,
+ help='number of experts')
+parser.add_argument('--max_seq_len_delta', type=int, default=20,
+ help='max sequence length')
+parser.add_argument('--gpu', type=int, default=0, help='GPU device to use')
+args = parser.parse_args()
+
+def logging(s, print_=True, log_=True):
+ print(s)
+
+# Set the random seed manually for reproducibility.
+np.random.seed(args.seed)
+torch.manual_seed(args.seed)
+if torch.cuda.is_available():
+ if not args.cuda:
+ print("WARNING: You have a CUDA device, so you should probably run with --cuda")
+ else:
+ torch.cuda.set_device(args.gpu)
+ cudnn.benchmark = True
+ cudnn.enabled=True
+ torch.cuda.manual_seed_all(args.seed)
+
+
+corpus = data.Corpus(args.data)
+test_batch_size = 1
+test_data = batchify(corpus.test, test_batch_size, args)
+
+
+@torch.no_grad()
+def evaluate(data_source, batch_size=10):
+ # Turn on evaluation mode which disables dropout.
+ model.eval()
+ total_loss = 0
+ ntokens = len(corpus.dictionary)
+ hidden = model.init_hidden(batch_size)
+ for i in range(0, data_source.size(0) - 1, args.bptt):
+ print(i, data_source.size(0)-1)
+ data, targets = get_batch(data_source, i, args, evaluation=True)
+ targets = targets.view(-1)
+
+        log_prob, hidden = parallel_model(data, hidden)
+        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).item()
+
+        total_loss += loss * len(data)
+
+        hidden = repackage_hidden(hidden)
+    return total_loss / len(data_source)
+
+# Load the best saved model.
+model = torch.load(args.model_path)
+
+total_params = sum(x.data.nelement() for x in model.parameters())
+logging('Args: {}'.format(args))
+logging('Model total parameters: {}'.format(total_params))
+parallel_model = model.cuda()
+
+# Run on test data.
+test_loss = evaluate(test_data, test_batch_size)
+logging('=' * 89)
+logging('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
+ test_loss, math.exp(test_loss)))
+logging('=' * 89)
+
diff --git a/darts_source/rnn/train.py b/darts_source/rnn/train.py
new file mode 100644
index 000000000..da22c13b6
--- /dev/null
+++ b/darts_source/rnn/train.py
@@ -0,0 +1,322 @@
+import os
+import gc
+import sys
+import glob
+import time
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+import logging
+import argparse
+import genotypes
+import torch.nn.functional as F
+import torch.backends.cudnn as cudnn
+import data
+import model
+
+from torch.autograd import Variable
+from utils import batchify, get_batch, repackage_hidden, create_exp_dir, save_checkpoint
+
+parser = argparse.ArgumentParser(description='PyTorch PennTreeBank/WikiText2 Language Model')
+parser.add_argument('--data', type=str, default='../data/penn/',
+ help='location of the data corpus')
+parser.add_argument('--emsize', type=int, default=850,
+ help='size of word embeddings')
+parser.add_argument('--nhid', type=int, default=850,
+ help='number of hidden units per layer')
+parser.add_argument('--nhidlast', type=int, default=850,
+ help='number of hidden units for the last rnn layer')
+parser.add_argument('--lr', type=float, default=20,
+ help='initial learning rate')
+parser.add_argument('--clip', type=float, default=0.25,
+ help='gradient clipping')
+parser.add_argument('--epochs', type=int, default=8000,
+ help='upper epoch limit')
+parser.add_argument('--batch_size', type=int, default=64, metavar='N',
+ help='batch size')
+parser.add_argument('--bptt', type=int, default=35,
+ help='sequence length')
+parser.add_argument('--dropout', type=float, default=0.75,
+ help='dropout applied to layers (0 = no dropout)')
+parser.add_argument('--dropouth', type=float, default=0.25,
+ help='dropout for hidden nodes in rnn layers (0 = no dropout)')
+parser.add_argument('--dropoutx', type=float, default=0.75,
+ help='dropout for input nodes rnn layers (0 = no dropout)')
+parser.add_argument('--dropouti', type=float, default=0.2,
+ help='dropout for input embedding layers (0 = no dropout)')
+parser.add_argument('--dropoute', type=float, default=0.1,
+ help='dropout to remove words from embedding layer (0 = no dropout)')
+parser.add_argument('--seed', type=int, default=1267,
+ help='random seed')
+parser.add_argument('--nonmono', type=int, default=5,
+                    help='non-monotone window (epochs) before switching to ASGD')
+parser.add_argument('--cuda', action='store_false',
+ help='use CUDA')
+parser.add_argument('--log-interval', type=int, default=200, metavar='N',
+ help='report interval')
+parser.add_argument('--save', type=str, default='EXP',
+ help='path to save the final model')
+parser.add_argument('--alpha', type=float, default=0,
+ help='alpha L2 regularization on RNN activation (alpha = 0 means no regularization)')
+parser.add_argument('--beta', type=float, default=1e-3,
+                    help='beta slowness regularization applied on RNN activation (beta = 0 means no regularization)')
+parser.add_argument('--wdecay', type=float, default=8e-7,
+ help='weight decay applied to all weights')
+parser.add_argument('--continue_train', action='store_true',
+ help='continue train from a checkpoint')
+parser.add_argument('--small_batch_size', type=int, default=-1,
+ help='the batch size for computation. batch_size should be divisible by small_batch_size.\
+ In our implementation, we compute gradients with small_batch_size multiple times, and accumulate the gradients\
+ until batch_size is reached. An update step is then performed.')
+parser.add_argument('--max_seq_len_delta', type=int, default=20,
+ help='max sequence length')
+parser.add_argument('--single_gpu', default=True, action='store_false',
+ help='use single GPU')
+parser.add_argument('--gpu', type=int, default=0, help='GPU device to use')
+parser.add_argument('--arch', type=str, default='DARTS', help='which architecture to use')
+args = parser.parse_args()
+
+if args.nhidlast < 0:
+ args.nhidlast = args.emsize
+if args.small_batch_size < 0:
+ args.small_batch_size = args.batch_size
+
+if not args.continue_train:
+ args.save = 'eval-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
+ create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+ format=log_format, datefmt='%m/%d %I:%M:%S %p')
+fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
+fh.setFormatter(logging.Formatter(log_format))
+logging.getLogger().addHandler(fh)
+
+# Set the random seed manually for reproducibility.
+np.random.seed(args.seed)
+torch.manual_seed(args.seed)
+if torch.cuda.is_available():
+ if not args.cuda:
+ print("WARNING: You have a CUDA device, so you should probably run with --cuda")
+ else:
+ torch.cuda.set_device(args.gpu)
+ cudnn.benchmark = True
+ cudnn.enabled=True
+ torch.cuda.manual_seed_all(args.seed)
+
+corpus = data.Corpus(args.data)
+
+eval_batch_size = 10
+test_batch_size = 1
+train_data = batchify(corpus.train, args.batch_size, args)
+val_data = batchify(corpus.valid, eval_batch_size, args)
+test_data = batchify(corpus.test, test_batch_size, args)
+
+
+ntokens = len(corpus.dictionary)
+genotype = getattr(genotypes, args.arch)
+if args.continue_train:
+    model = torch.load(os.path.join(args.save, 'model.pt'))
+else:
+    model = model.RNNModel(ntokens, args.emsize, args.nhid, args.nhidlast,
+                           args.dropout, args.dropouth, args.dropoutx, args.dropouti, args.dropoute,
+                           cell_cls=model.DARTSCell, genotype=genotype)
+
+if args.cuda:
+ if args.single_gpu:
+ parallel_model = model.cuda()
+ else:
+ parallel_model = nn.DataParallel(model, dim=1).cuda()
+else:
+ parallel_model = model
+
+total_params = sum(x.data.nelement() for x in model.parameters())
+logging.info('Args: {}'.format(args))
+logging.info('Model total parameters: {}'.format(total_params))
+logging.info('Genotype: {}'.format(genotype))
+
+
+@torch.no_grad()
+def evaluate(data_source, batch_size=10):
+ # Turn on evaluation mode which disables dropout.
+ model.eval()
+ total_loss = 0
+ ntokens = len(corpus.dictionary)
+ hidden = model.init_hidden(batch_size)
+ for i in range(0, data_source.size(0) - 1, args.bptt):
+ data, targets = get_batch(data_source, i, args, evaluation=True)
+ targets = targets.view(-1)
+
+        log_prob, hidden = parallel_model(data, hidden)
+        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).item()
+
+        total_loss += loss * len(data)
+
+        hidden = repackage_hidden(hidden)
+    return total_loss / len(data_source)
+
+
+def train():
+ assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'
+
+ # Turn on training mode which enables dropout.
+ total_loss = 0
+ start_time = time.time()
+ ntokens = len(corpus.dictionary)
+ hidden = [model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size)]
+ batch, i = 0, 0
+ while i < train_data.size(0) - 1 - 1:
+ bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
+ # Prevent excessively small or negative sequence lengths
+ seq_len = max(5, int(np.random.normal(bptt, 5)))
+ # There's a very small chance that it could select a very long sequence length resulting in OOM
+ seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
+
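+        # Rescale the learning rate in proportion to the sampled sequence
+        # length so each token contributes comparably to the update
+        # (restored after optimizer.step() below).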
+ lr2 = optimizer.param_groups[0]['lr']
+ optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
+ model.train()
+ data, targets = get_batch(train_data, i, args, seq_len=seq_len)
+
+ optimizer.zero_grad()
+
+ start, end, s_id = 0, args.small_batch_size, 0
+ while start < args.batch_size:
+ cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)
+
+ # Starting each batch, we detach the hidden state from how it was previously produced.
+ # If we didn't, the model would try backpropagating all the way to start of the dataset.
+ hidden[s_id] = repackage_hidden(hidden[s_id])
+
+ log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(cur_data, hidden[s_id], return_h=True)
+ raw_loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), cur_targets)
+
+ loss = raw_loss
+            # Activation Regularization
+ if args.alpha > 0:
+ loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
+ # Temporal Activation Regularization (slowness)
+ loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
+ loss *= args.small_batch_size / args.batch_size
+            total_loss += raw_loss.item() * args.small_batch_size / args.batch_size
+ loss.backward()
+
+ s_id += 1
+ start = end
+ end = start + args.small_batch_size
+
+ gc.collect()
+
+ # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
+        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
+ optimizer.step()
+
+ # total_loss += raw_loss.data
+ optimizer.param_groups[0]['lr'] = lr2
+
+        if math.isnan(total_loss):
+            raise ValueError('training loss is NaN')
+
+ if batch % args.log_interval == 0 and batch > 0:
+            cur_loss = total_loss / args.log_interval
+ elapsed = time.time() - start_time
+ logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
+ 'loss {:5.2f} | ppl {:8.2f}'.format(
+ epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
+ elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
+ total_loss = 0
+ start_time = time.time()
+ batch += 1
+ i += seq_len
+
+# Loop over epochs.
+lr = args.lr
+best_val_loss = []
+stored_loss = 100000000
+
+# At any point you can hit Ctrl + C to break out of training early.
+try:
+ if args.continue_train:
+ optimizer_state = torch.load(os.path.join(args.save, 'optimizer.pt'))
+ if 't0' in optimizer_state['param_groups'][0]:
+ optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
+ else:
+ optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
+ optimizer.load_state_dict(optimizer_state)
+ else:
+ optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
+
+ epoch = 1
+ while epoch < args.epochs + 1:
+ epoch_start_time = time.time()
+ try:
+ train()
+        except Exception:
+ logging.info('rolling back to the previous best model ...')
+ model = torch.load(os.path.join(args.save, 'model.pt'))
+ parallel_model = model.cuda()
+
+ optimizer_state = torch.load(os.path.join(args.save, 'optimizer.pt'))
+ if 't0' in optimizer_state['param_groups'][0]:
+ optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
+ else:
+ optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
+ optimizer.load_state_dict(optimizer_state)
+
+ epoch = torch.load(os.path.join(args.save, 'misc.pt'))['epoch']
+ continue
+
+ if 't0' in optimizer.param_groups[0]:
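+            # ASGD is active: temporarily swap in the averaged weights ('ax')
+            # for evaluation, then restore the raw weights afterwards.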
+ tmp = {}
+ for prm in model.parameters():
+ tmp[prm] = prm.data.clone()
+ prm.data = optimizer.state[prm]['ax'].clone()
+
+ val_loss2 = evaluate(val_data)
+ logging.info('-' * 89)
+ logging.info('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
+ 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
+ val_loss2, math.exp(val_loss2)))
+ logging.info('-' * 89)
+
+ if val_loss2 < stored_loss:
+ save_checkpoint(model, optimizer, epoch, args.save)
+ logging.info('Saving Averaged!')
+ stored_loss = val_loss2
+
+ for prm in model.parameters():
+ prm.data = tmp[prm].clone()
+
+ else:
+ val_loss = evaluate(val_data, eval_batch_size)
+ logging.info('-' * 89)
+ logging.info('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
+ 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
+ val_loss, math.exp(val_loss)))
+ logging.info('-' * 89)
+
+ if val_loss < stored_loss:
+ save_checkpoint(model, optimizer, epoch, args.save)
+ logging.info('Saving Normal!')
+ stored_loss = val_loss
+
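+ # NT-ASGD trigger: once validation loss has stopped improving for
+ # args.nonmono epochs, switch the optimizer from SGD to averaged SGD.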
+ if 't0' not in optimizer.param_groups[0] and (len(best_val_loss)>args.nonmono and val_loss > min(best_val_loss[:-args.nonmono])):
+ logging.info('Switching!')
+ optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
+ best_val_loss.append(val_loss)
+
+ epoch += 1
+
+except KeyboardInterrupt:
+ logging.info('-' * 89)
+ logging.info('Exiting from training early')
+
+# Load the best saved model.
+model = torch.load(os.path.join(args.save, 'model.pt'))
+parallel_model = model.cuda()
+
+# Run on test data.
+test_loss = evaluate(test_data, test_batch_size)
+logging.info('=' * 89)
+logging.info('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
+ test_loss, math.exp(test_loss)))
+logging.info('=' * 89)
diff --git a/darts_source/rnn/train_search.py b/darts_source/rnn/train_search.py
new file mode 100644
index 000000000..8708622e1
--- /dev/null
+++ b/darts_source/rnn/train_search.py
@@ -0,0 +1,286 @@
+import argparse
+import os, sys, glob
+import time
+import math
+import numpy as np
+import torch
+import logging
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.backends.cudnn as cudnn
+from architect import Architect
+
+import gc
+
+import data
+import model_search as model
+
+from utils import batchify, get_batch, repackage_hidden, create_exp_dir, save_checkpoint
+
+parser = argparse.ArgumentParser(description='PyTorch PennTreeBank/WikiText2 Language Model')
+parser.add_argument('--data', type=str, default='../data/penn/',
+ help='location of the data corpus')
+parser.add_argument('--emsize', type=int, default=300,
+ help='size of word embeddings')
+parser.add_argument('--nhid', type=int, default=300,
+ help='number of hidden units per layer')
+parser.add_argument('--nhidlast', type=int, default=300,
+ help='number of hidden units for the last rnn layer')
+parser.add_argument('--lr', type=float, default=20,
+ help='initial learning rate')
+parser.add_argument('--clip', type=float, default=0.25,
+ help='gradient clipping')
+parser.add_argument('--epochs', type=int, default=50,
+ help='upper epoch limit')
+parser.add_argument('--batch_size', type=int, default=256, metavar='N',
+ help='batch size')
+parser.add_argument('--bptt', type=int, default=35,
+ help='sequence length')
+parser.add_argument('--dropout', type=float, default=0.75,
+ help='dropout applied to layers (0 = no dropout)')
+parser.add_argument('--dropouth', type=float, default=0.25,
+ help='dropout for hidden nodes in rnn layers (0 = no dropout)')
+parser.add_argument('--dropoutx', type=float, default=0.75,
+ help='dropout for input nodes in rnn layers (0 = no dropout)')
+parser.add_argument('--dropouti', type=float, default=0.2,
+ help='dropout for input embedding layers (0 = no dropout)')
+parser.add_argument('--dropoute', type=float, default=0,
+ help='dropout to remove words from embedding layer (0 = no dropout)')
+parser.add_argument('--seed', type=int, default=3,
+ help='random seed')
+parser.add_argument('--nonmono', type=int, default=5,
+ help='number of epochs of non-improving validation loss tolerated before switching to ASGD')
+parser.add_argument('--cuda', action='store_false',
+ help='use CUDA (enabled by default; passing this flag disables it)')
+parser.add_argument('--log-interval', type=int, default=50, metavar='N',
+ help='report interval')
+parser.add_argument('--save', type=str, default='EXP',
+ help='path to save the final model')
+parser.add_argument('--alpha', type=float, default=0,
+ help='alpha L2 regularization on RNN activation (alpha = 0 means no regularization)')
+parser.add_argument('--beta', type=float, default=1e-3,
+ help='beta slowness regularization applied to RNN activation (beta = 0 means no regularization)')
+parser.add_argument('--wdecay', type=float, default=5e-7,
+ help='weight decay applied to all weights')
+parser.add_argument('--continue_train', action='store_true',
+ help='continue train from a checkpoint')
+parser.add_argument('--small_batch_size', type=int, default=-1,
+ help='the batch size for computation. batch_size should be divisible by small_batch_size.\
+ In our implementation, we compute gradients with small_batch_size multiple times, and accumulate the gradients\
+ until batch_size is reached. An update step is then performed.')
+parser.add_argument('--max_seq_len_delta', type=int, default=20,
+ help='max sequence length')
+parser.add_argument('--single_gpu', default=True, action='store_false',
+ help='use a single GPU (pass this flag to use nn.DataParallel across GPUs instead)')
+parser.add_argument('--gpu', type=int, default=0, help='GPU device to use')
+parser.add_argument('--unrolled', action='store_true', default=False, help='use one-step unrolled validation loss')
+parser.add_argument('--arch_wdecay', type=float, default=1e-3,
+ help='weight decay for the architecture encoding alpha')
+parser.add_argument('--arch_lr', type=float, default=3e-3,
+ help='learning rate for the architecture encoding alpha')
+args = parser.parse_args()
+
+if args.nhidlast < 0:
+ args.nhidlast = args.emsize
+if args.small_batch_size < 0:
+ args.small_batch_size = args.batch_size
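+
+# Gradient accumulation, illustratively (comment only; `split` and `loss_fn` are
+# placeholder names): with batch_size=256 and small_batch_size=64, each step runs
+# four forward/backward passes, scaling every loss by 64/256 so the accumulated
+# gradients match a single full-batch backward pass:
+#   for chunk in split(batch, small_batch_size):
+#       (loss_fn(model(chunk)) * small_batch_size / batch_size).backward()
+#   optimizer.step()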
+
+if not args.continue_train:
+ args.save = 'search-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
+ create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+ format=log_format, datefmt='%m/%d %I:%M:%S %p')
+fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
+fh.setFormatter(logging.Formatter(log_format))
+logging.getLogger().addHandler(fh)
+
+# Set the random seed manually for reproducibility.
+np.random.seed(args.seed)
+torch.manual_seed(args.seed)
+if torch.cuda.is_available():
+ if not args.cuda:
+ print("WARNING: You have a CUDA device, so you should probably run with --cuda")
+ else:
+ torch.cuda.set_device(args.gpu)
+ cudnn.benchmark = True
+ cudnn.enabled = True
+ torch.cuda.manual_seed_all(args.seed)
+
+corpus = data.Corpus(args.data)
+
+eval_batch_size = 10
+test_batch_size = 1
+
+train_data = batchify(corpus.train, args.batch_size, args)
+search_data = batchify(corpus.valid, args.batch_size, args)
+val_data = batchify(corpus.valid, eval_batch_size, args)
+test_data = batchify(corpus.test, test_batch_size, args)
+
+
+ntokens = len(corpus.dictionary)
+if args.continue_train:
+ model = torch.load(os.path.join(args.save, 'model.pt'))
+else:
+ model = model.RNNModelSearch(ntokens, args.emsize, args.nhid, args.nhidlast,
+ args.dropout, args.dropouth, args.dropoutx, args.dropouti, args.dropoute)
+
+size = 0
+for p in model.parameters():
+ size += p.nelement()
+logging.info('param size: {}'.format(size))
+logging.info('initial genotype:')
+logging.info(model.genotype())
+
+if args.cuda:
+ if args.single_gpu:
+ parallel_model = model.cuda()
+ else:
+ parallel_model = nn.DataParallel(model, dim=1).cuda()
+else:
+ parallel_model = model
+architect = Architect(parallel_model, args)
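+# DARTS bilevel scheme: the Architect updates the architecture weights (alphas)
+# on held-out search_data, while the regular optimizer in train() updates the
+# network weights on train_data.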
+
+total_params = sum(x.data.nelement() for x in model.parameters())
+logging.info('Args: {}'.format(args))
+logging.info('Model total parameters: {}'.format(total_params))
+
+
+def evaluate(data_source, batch_size=10):
+ # Turn on evaluation mode which disables dropout.
+ model.eval()
+ total_loss = 0
+ ntokens = len(corpus.dictionary)
+ hidden = model.init_hidden(batch_size)
+ for i in range(0, data_source.size(0) - 1, args.bptt):
+ data, targets = get_batch(data_source, i, args, evaluation=True)
+ targets = targets.view(-1)
+
+ log_prob, hidden = parallel_model(data, hidden)
+ loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
+
+ total_loss += loss * len(data)
+
+ hidden = repackage_hidden(hidden)
+ return total_loss[0] / len(data_source)
+
+
+def train():
+ assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'
+
+ # Turn on training mode which enables dropout.
+ total_loss = 0
+ start_time = time.time()
+ ntokens = len(corpus.dictionary)
+ hidden = [model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size)]
+ hidden_valid = [model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size)]
+ batch, i = 0, 0
+ while i < train_data.size(0) - 1 - 1:
+ bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
+ # The original AWD-LSTM samples a random sequence length around bptt; that is
+ # disabled here in favour of the fixed (occasionally halved) length below:
+ # seq_len = max(5, int(np.random.normal(bptt, 5)))  # prevent tiny/negative lengths
+ # seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)  # cap length to avoid OOM
+ seq_len = int(bptt)
+
+ lr2 = optimizer.param_groups[0]['lr']
+ optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
+ model.train()
+
+ data_valid, targets_valid = get_batch(search_data, i % (search_data.size(0) - 1), args)
+ data, targets = get_batch(train_data, i, args, seq_len=seq_len)
+
+ optimizer.zero_grad()
+
+ start, end, s_id = 0, args.small_batch_size, 0
+ while start < args.batch_size:
+ cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)
+ cur_data_valid, cur_targets_valid = data_valid[:, start: end], targets_valid[:, start: end].contiguous().view(-1)
+
+ # Starting each batch, we detach the hidden state from how it was previously produced.
+ # If we didn't, the model would try backpropagating all the way to start of the dataset.
+ hidden[s_id] = repackage_hidden(hidden[s_id])
+ hidden_valid[s_id] = repackage_hidden(hidden_valid[s_id])
+
+ hidden_valid[s_id], grad_norm = architect.step(
+ hidden[s_id], cur_data, cur_targets,
+ hidden_valid[s_id], cur_data_valid, cur_targets_valid,
+ optimizer,
+ args.unrolled)
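+ # With args.unrolled, the architecture gradient is taken through one virtual
+ # weight-update step (second-order DARTS); otherwise the cheaper first-order
+ # approximation is used.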
+
+ # assuming small_batch_size = batch_size so we don't accumulate gradients
+ optimizer.zero_grad()
+ hidden[s_id] = repackage_hidden(hidden[s_id])
+
+ log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(cur_data, hidden[s_id], return_h=True)
+ raw_loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), cur_targets)
+
+ loss = raw_loss
+ # Activation Regularization
+ if args.alpha > 0:
+ loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
+ # Temporal Activation Regularization (slowness)
+ loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
+ loss *= args.small_batch_size / args.batch_size
+ total_loss += raw_loss.data * args.small_batch_size / args.batch_size
+ loss.backward()
+
+ s_id += 1
+ start = end
+ end = start + args.small_batch_size
+
+ gc.collect()
+
+ # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
+ optimizer.step()
+
+ # total_loss += raw_loss.data
+ optimizer.param_groups[0]['lr'] = lr2
+ if batch % args.log_interval == 0 and batch > 0:
+ logging.info(parallel_model.genotype())
+ logging.info(F.softmax(parallel_model.weights, dim=-1))
+ cur_loss = total_loss[0] / args.log_interval
+ elapsed = time.time() - start_time
+ logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
+ 'loss {:5.2f} | ppl {:8.2f}'.format(
+ epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
+ elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
+ total_loss = 0
+ start_time = time.time()
+ batch += 1
+ i += seq_len
+
+# Loop over epochs.
+lr = args.lr
+best_val_loss = []
+stored_loss = 100000000
+
+if args.continue_train:
+ optimizer_state = torch.load(os.path.join(args.save, 'optimizer.pt'))
+ if 't0' in optimizer_state['param_groups'][0]:
+ optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
+ else:
+ optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
+ optimizer.load_state_dict(optimizer_state)
+else:
+ optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
+
+for epoch in range(1, args.epochs+1):
+ epoch_start_time = time.time()
+ train()
+
+ val_loss = evaluate(val_data, eval_batch_size)
+ logging.info('-' * 89)
+ logging.info('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
+ 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
+ val_loss, math.exp(val_loss)))
+ logging.info('-' * 89)
+
+ if val_loss < stored_loss:
+ save_checkpoint(model, optimizer, epoch, args.save)
+ logging.info('Saving Normal!')
+ stored_loss = val_loss
+
+ best_val_loss.append(val_loss)
diff --git a/darts_source/rnn/utils.py b/darts_source/rnn/utils.py
new file mode 100644
index 000000000..2d37c7135
--- /dev/null
+++ b/darts_source/rnn/utils.py
@@ -0,0 +1,93 @@
+import torch
+import torch.nn as nn
+import os, shutil
+import numpy as np
+from torch.autograd import Variable
+
+
+def repackage_hidden(h):
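+ # Wrap hidden states in fresh Variables to detach them from their history,
+ # truncating backpropagation at the batch boundary (recurses into (h, c) tuples).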
+ if isinstance(h, Variable):
+ return Variable(h.data)
+ else:
+ return tuple(repackage_hidden(v) for v in h)
+
+
+def batchify(data, bsz, args):
+ nbatch = data.size(0) // bsz
+ data = data.narrow(0, 0, nbatch * bsz)
+ data = data.view(bsz, -1).t().contiguous()
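+ # e.g. a 1000-token stream with bsz=10 is trimmed and reshaped to 100 x 10:
+ # column j holds the j-th contiguous slice of the corpus.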
+ print(data.size())
+ if args.cuda:
+ data = data.cuda()
+ return data
+
+
+def get_batch(source, i, args, seq_len=None, evaluation=False):
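+ # Slice a (seq_len, bsz) window starting at row i; the target is the same
+ # window shifted one token ahead, i.e. next-word prediction.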
+ seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
+ data = Variable(source[i:i+seq_len], volatile=evaluation)
+ target = Variable(source[i+1:i+1+seq_len])
+ return data, target
+
+
+def create_exp_dir(path, scripts_to_save=None):
+ if not os.path.exists(path):
+ os.mkdir(path)
+
+ print('Experiment dir : {}'.format(path))
+ if scripts_to_save is not None:
+ os.mkdir(os.path.join(path, 'scripts'))
+ for script in scripts_to_save:
+ dst_file = os.path.join(path, 'scripts', os.path.basename(script))
+ shutil.copyfile(script, dst_file)
+
+
+def save_checkpoint(model, optimizer, epoch, path, finetune=False):
+ if finetune:
+ torch.save(model, os.path.join(path, 'finetune_model.pt'))
+ torch.save(optimizer.state_dict(), os.path.join(path, 'finetune_optimizer.pt'))
+ else:
+ torch.save(model, os.path.join(path, 'model.pt'))
+ torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer.pt'))
+ torch.save({'epoch': epoch+1}, os.path.join(path, 'misc.pt'))
+
+
+def embedded_dropout(embed, words, dropout=0.1, scale=None):
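+ # Drop entire embedding rows (whole words) with probability `dropout` and
+ # rescale the survivors by 1/(1 - dropout) so the expected embedding is unchanged.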
+ if dropout:
+ mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout)
+ mask = Variable(mask)
+ masked_embed_weight = mask * embed.weight
+ else:
+ masked_embed_weight = embed.weight
+ if scale:
+ masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight
+
+ padding_idx = embed.padding_idx
+ if padding_idx is None:
+ padding_idx = -1
+ X = embed._backend.Embedding.apply(words, masked_embed_weight,
+ padding_idx, embed.max_norm, embed.norm_type,
+ embed.scale_grad_by_freq, embed.sparse
+ )
+ return X
+
+
+class LockedDropout(nn.Module):
+ def __init__(self):
+ super(LockedDropout, self).__init__()
+
+ def forward(self, x, dropout=0.5):
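+ # Variational ("locked") dropout: sample one (1, batch, features) mask and
+ # broadcast it across all time steps, instead of resampling per step.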
+ if not self.training or not dropout:
+ return x
+ m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout)
+ mask = Variable(m.div_(1 - dropout), requires_grad=False)
+ mask = mask.expand_as(x)
+ return mask * x
+
+
+def mask2d(B, D, keep_prob, cuda=True):
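+ # floor(U(0,1) + keep_prob) is a Bernoulli(keep_prob) draw; dividing by
+ # keep_prob makes the mask's expectation 1 (inverted dropout).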
+ m = torch.floor(torch.rand(B, D) + keep_prob) / keep_prob
+ m = Variable(m, requires_grad=False)
+ if cuda:
+ m = m.cuda()
+ return m
+
diff --git a/darts_source/rnn/visualize.py b/darts_source/rnn/visualize.py
new file mode 100644
index 000000000..16942d19f
--- /dev/null
+++ b/darts_source/rnn/visualize.py
@@ -0,0 +1,47 @@
+import sys
+import genotypes
+from graphviz import Digraph
+
+
+def plot(genotype, filename):
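+ # Draw a recurrent-cell genotype as a left-to-right DAG: x_{t} and h_{t-1}
+ # feed node 0, each edge is labelled with its operation, and every
+ # intermediate node connects to the output h_{t}.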
+ g = Digraph(
+ format='pdf',
+ edge_attr=dict(fontsize='20', fontname="times"),
+ node_attr=dict(style='filled', shape='rect', align='center', fontsize='20', height='0.5', width='0.5', penwidth='2', fontname="times"),
+ engine='dot')
+ g.body.extend(['rankdir=LR'])
+
+ g.node("x_{t}", fillcolor='darkseagreen2')
+ g.node("h_{t-1}", fillcolor='darkseagreen2')
+ g.node("0", fillcolor='lightblue')
+ g.edge("x_{t}", "0", fillcolor="gray")
+ g.edge("h_{t-1}", "0", fillcolor="gray")
+ steps = len(genotype)
+
+ for i in range(1, steps + 1):
+ g.node(str(i), fillcolor='lightblue')
+
+ for i, (op, j) in enumerate(genotype):
+ g.edge(str(j), str(i + 1), label=op, fillcolor="gray")
+
+ g.node("h_{t}", fillcolor='palegoldenrod')
+ for i in range(1, steps + 1):
+ g.edge(str(i), "h_{t}", fillcolor="gray")
+
+ g.render(filename, view=True)
+
+
+if __name__ == '__main__':
+ if len(sys.argv) != 2:
+ print("usage:\n python {} ARCH_NAME".format(sys.argv[0]))
+ sys.exit(1)
+
+ genotype_name = sys.argv[1]
+ try:
+ genotype = getattr(genotypes, genotype_name)
+ except AttributeError:
+ print("{} is not specified in genotypes.py".format(genotype_name))
+ sys.exit(1)
+
+ plot(genotype.recurrent, "recurrent")
+
diff --git a/data/penn/.keep b/data/penn/.keep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/data/wikitext-2/.keep b/data/wikitext-2/.keep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/img/cifar10.png b/img/cifar10.png
deleted file mode 100644
index 5de6eec01..000000000
Binary files a/img/cifar10.png and /dev/null differ
diff --git a/img/darts.png b/img/darts.png
deleted file mode 100644
index 2932242cc..000000000
Binary files a/img/darts.png and /dev/null differ
diff --git a/img/imagenet.png b/img/imagenet.png
deleted file mode 100644
index 693bc5410..000000000
Binary files a/img/imagenet.png and /dev/null differ
diff --git a/img/progress_convolutional.gif b/img/progress_convolutional.gif
deleted file mode 100644
index f288d0b9b..000000000
Binary files a/img/progress_convolutional.gif and /dev/null differ
diff --git a/img/progress_convolutional_normal.gif b/img/progress_convolutional_normal.gif
deleted file mode 100644
index 832d47ccc..000000000
Binary files a/img/progress_convolutional_normal.gif and /dev/null differ
diff --git a/img/progress_convolutional_reduce.gif b/img/progress_convolutional_reduce.gif
deleted file mode 100644
index 7bb478046..000000000
Binary files a/img/progress_convolutional_reduce.gif and /dev/null differ
diff --git a/img/progress_recurrent.gif b/img/progress_recurrent.gif
deleted file mode 100644
index 7633372ba..000000000
Binary files a/img/progress_recurrent.gif and /dev/null differ
diff --git a/img/ptb.png b/img/ptb.png
deleted file mode 100644
index 8b8b4eda7..000000000
Binary files a/img/ptb.png and /dev/null differ
diff --git a/data/imagenet/.keep b/nohup.out
similarity index 100%
rename from data/imagenet/.keep
rename to nohup.out