Merge pull request #77 from HDI-Project/bcyphers/circleci
Add CircleCI compatibility and tests
Bennett Cyphers authored Feb 2, 2018
2 parents c3f2ede + 814d0de commit 7bb859a
Showing 64 changed files with 1,179 additions and 407 deletions.
13 changes: 10 additions & 3 deletions .circleci/config.yml
@@ -1,5 +1,5 @@
# This is heavily based on the featuretools test setup. Most of the logic is
# pushed to pytest and tox.
# This is heavily based on the featuretools test setup. Most of the setup logic
# is punted to pytest and tox.
#
# See https://github.com/Featuretools/featuretools
#
@@ -8,9 +8,16 @@ jobs:
build:
working_directory: ~/atm
docker:
- image: themattrix/tox
- image: themattrix/tox
- image: mysql:5.7
environment:
MYSQL_USER: ubuntu
MYSQL_ROOT_PASSWORD: ubuntu
MYSQL_DATABASE: atm
steps:
- checkout
- run: apt-get -qq update
- run: apt-get -qq -y install git mysql-client libmysqlclient-dev
- run: pyenv local 2.7.13 # 3.5.2 3.6.0
- run: make installdeps
- run: make lint && tox && codecov
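
The updated CI job pairs the tox image with a MySQL 5.7 service container whose credentials (`ubuntu`/`ubuntu`, database `atm`) match the sql.yaml template changed later in this diff. The snippet below is a minimal sketch, not part of the commit, of how a test setup could poll that service before the suite runs; it assumes SQLAlchemy plus a MySQL driver such as mysqlclient are installed (the config installs `libmysqlclient-dev`), and the helper name is invented for illustration.
```
# Sketch only: poll the MySQL service container defined above, using the
# same credentials as the sql.yaml template later in this diff.
# Assumes SQLAlchemy and a MySQL driver (e.g. mysqlclient) are available.
import time

from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError


def wait_for_mysql(url='mysql://ubuntu:ubuntu@127.0.0.1:3306/atm',
                   retries=30, delay=2):
    """Return True once the database accepts connections, False on timeout."""
    engine = create_engine(url)
    for _ in range(retries):
        try:
            engine.connect().close()
            return True
        except OperationalError:
            time.sleep(delay)
    return False


if __name__ == '__main__':
    assert wait_for_mysql(), 'MySQL service container never came up'
```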
6 changes: 4 additions & 2 deletions Makefile
@@ -4,6 +4,7 @@ clean:
find . -name '*.pyc' -delete
find . -name __pycache__ -delete
find . -name '*~' -delete
find . -name '*.egg-info' -delete

lint:
flake8 atm && isort --check-only --recursive atm
@@ -12,7 +13,8 @@ test: lint
python $(TEST_CMD)

installdeps:
ssh-keyscan -H github.com > /etc/ssh/ssh_known_hosts
pip install --upgrade pip
pip install -e .
pip install -r dev-requirements.txt
pip install -e . --process-dependency-links --quiet
pip install -r requirements-dev.txt --quiet

25 changes: 13 additions & 12 deletions README.md
@@ -75,7 +75,7 @@ Below we will give a quick tutorial of how to run atm on your desktop. We will u
```
$ python atm/enter_data.py
```
This command will create a ``datarun``. In ATM, a *datarun* is a single logical machine learning task. If you run the above command without any arguments, it will use the default settings found in the `config/templates/\*_config.yaml` files to create a new SQLite3 database at `./atm.db`, create a new `dataset` instance which refers to the data above, and create a `datarun` instance which points to that dataset. More about what is stored in this database and what it is used for can be found [here](https://cyphe.rs/static/atm.pdf).
This command will create a ``datarun``. In ATM, a *datarun* is a single logical machine learning task. If you run the above command without any arguments, it will use the default settings found in the `atm/config/templates/\*.yaml` files to create a new SQLite3 database at `./atm.db`, create a new `dataset` instance which refers to the data above, and create a `datarun` instance which points to that dataset. More about what is stored in this database and what it is used for can be found [here](https://cyphe.rs/static/atm.pdf).

The command should produce a lot of output, the end of which looks something like this:

@@ -119,7 +119,7 @@ AND that's it! You can break out of the worker with Ctrl+C and restart it with t

## Customizing ATM's configuration and using your own data

ATM's default configuration is fully controlled by the yaml files in ``config/templates/``. Our documentation will cover the configuration in more detail, but this section provides a brief overview of how to specify the most important values.
ATM's default configuration is fully controlled by the yaml files in ``atm/config/templates/``. Our documentation will cover the configuration in more detail, but this section provides a brief overview of how to specify the most important values.

### Running ATM on your own data
If you want to use the system for your own dataset, convert your data to a csv file similar to the example shown above. The format is:
@@ -141,15 +141,16 @@ That means there are two ways to pass configuration to the command.

Saving configuration as YAML files is an easy way to save complicated setups or share them with team members.

You should start with the templates provided in `config/templates` and modify them to suit your own needs.
You should start with the templates provided in `atm/config/templates` and modify them to suit your own needs.
```
$ cp config/templates/*.yaml config/
$ mkdir config
$ cp atm/config/templates/*.yaml config/
$ vim config/*.yaml
```

`run_config.yaml` contains all the settings for a single Dataset and Datarun. Specify the `train_path` to point to your own dataset.
`run.yaml` contains all the settings for a single Dataset and Datarun. Specify the `train_path` to point to your own dataset.

`sql_config.yaml` contains the settings for the ModelHub SQL database. The default configuration will connect to (and create if necessary) a SQLite database at `./atm.db` relative to the directory from which `enter_data.py` is run. If you are using a MySQL database, you will need to change the file to something like this:
`sql.yaml` contains the settings for the ModelHub SQL database. The default configuration will connect to (and create if necessary) a SQLite database at `./atm.db` relative to the directory from which `enter_data.py` is run. If you are using a MySQL database, you will need to change the file to something like this:
```
dialect: mysql
database: atm
@@ -160,13 +161,13 @@ That means there are two ways to pass configuration to the command.
query:
```

`aws_config.yaml` should contain the settings for running ATM in the cloud. This is not necessary for local operation.
`aws.yaml` should contain the settings for running ATM in the cloud. This is not necessary for local operation.

Once your YAML files have been updated, run the datarun creation script and pass it the paths to your new config files:
```
$ python atm/enter_data.py --sql-config config/sql_config.yaml \
> --aws-config config/aws_config.yaml \
> --run-config config/run_config.yaml
$ python atm/enter_data.py --sql-config config/sql.yaml \
> --aws-config config/aws.yaml \
> --run-config config/run.yaml
```

2. **Using command line arguments**
@@ -183,8 +184,8 @@ That means there are two ways to pass configuration to the command.

Once you've created your custom datarun, start a worker, specifying your config files and the datarun(s) you'd like to compute on.
```
$ python atm/worker.py --sql-config config/sql_config.yaml \
> --aws-config config/aws_config.yaml --dataruns 1
$ python atm/worker.py --sql-config config/sql.yaml \
> --aws-config config/aws.yaml --dataruns 1
```

It's important that the SQL configuration used by the worker matches the configuration you passed to `enter_data.py` -- otherwise, the worker will be looking in the wrong ModelHub database for its datarun!
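
The README warns that the worker must read the same ModelHub database that `enter_data.py` wrote to. As a quick sanity check, one could inspect a local SQLite file directly; the sketch below is illustrative only, and the `dataruns` table name and its columns are assumptions about ATM's schema rather than something stated in this diff.
```
# Sketch only: list the dataruns stored in a local ModelHub SQLite file.
# The table name 'dataruns' and its columns are assumed; adjust if ATM's
# actual database models differ.
import sqlite3

conn = sqlite3.connect('atm.db')
try:
    for row in conn.execute('SELECT id, status FROM dataruns'):
        print('datarun %s: %s' % row)
finally:
    conn.close()
```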
5 changes: 3 additions & 2 deletions atm/__init__.py
@@ -6,6 +6,7 @@

# Get the path of the project root, so that the rest of the project can
# reference files relative to there.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))

from . import config, constants, database, enter_data, method, metrics, model, utilities, worker
__all__ = ['config', 'constants', 'database', 'enter_data', 'method', 'metrics',
'model', 'utilities', 'worker']
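
This change repoints `PROJECT_ROOT` from the repository root to the `atm` package directory, which is why data paths elsewhere in the diff move from `data/test/...` to `atm/data/test/...`. A small sketch of how the two expressions resolve, using a hypothetical checkout location for illustration:
```
# Sketch only: old vs. new PROJECT_ROOT for a hypothetical checkout at
# /home/user/atm-repo.
import os

init_file = '/home/user/atm-repo/atm/__init__.py'  # stand-in for __file__

old_root = os.path.abspath(os.path.join(os.path.dirname(init_file), '..'))
new_root = os.path.dirname(os.path.abspath(init_file))

print(old_root)  # /home/user/atm-repo      -- the repository root
print(new_root)  # /home/user/atm-repo/atm  -- the package directory
```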
31 changes: 17 additions & 14 deletions atm/config.py
@@ -1,9 +1,12 @@
import re
from __future__ import absolute_import

import os
import yaml
import re
from argparse import ArgumentError, ArgumentTypeError, RawTextHelpFormatter

from atm.constants import *
import yaml

from .constants import *


class Config(object):
@@ -110,7 +113,7 @@ class RunConfig(Config):
]

DEFAULTS = {
'train_path': 'data/test/pollution_1.csv',
'train_path': os.path.join(DATA_TEST_PATH, 'pollution_1.csv'),
'class_column': 'class',
'methods': ['logreg', 'dt', 'knn'],
'priority': 1,
@@ -192,14 +195,14 @@ def add_arguments_aws_ec2(parser):
pass

# AWS EC2 configurations
parser.add-argument('--num-instances', help='Number of EC2 instances to start')
parser.add-argument('--num-workers-per-instance', help='Number of ATM workers per instance')
parser.add-argument('--ec2-region', help='Region to start instances in')
parser.add-argument('--ec2-ami', help='Name of ATM AMI')
parser.add-argument('--ec2-key-pair', help='AWS key pair to use for EC2 instances')
parser.add-argument('--ec2-keyfile', help='Local path to key file (must match ec2-key-pair)')
parser.add-argument('--ec2-instance-type', help='Type of EC2 instance to start')
parser.add-argument('--ec2-username', help='Username to log into EC2 instance')
parser.add_argument('--num-instances', help='Number of EC2 instances to start')
parser.add_argument('--num-workers-per-instance', help='Number of ATM workers per instance')
parser.add_argument('--ec2-region', help='Region to start instances in')
parser.add_argument('--ec2-ami', help='Name of ATM AMI')
parser.add_argument('--ec2-key-pair', help='AWS key pair to use for EC2 instances')
parser.add_argument('--ec2-keyfile', help='Local path to key file (must match ec2-key-pair)')
parser.add_argument('--ec2-instance-type', help='Type of EC2 instance to start')
parser.add_argument('--ec2-username', help='Username to log into EC2 instance')

return parser

@@ -327,7 +330,7 @@ def add_arguments_datarun(parser):
'performance on a test dataset, and "mu_sigma" will use '
'the lower confidence bound on the CV performance.')

## AutoML Arguments #######################################################
## AutoML Arguments ######################################################
############################################################################
# hyperparameter selection strategy
# How should ATM sample hyperparameters from a given hyperpartition?
@@ -370,7 +373,7 @@
# # train using sample criteria
# else
# # train using uniform (baseline)
parser.add_argument('--r-minimum', type=int,
parser.add_argument('--r-minimum', type=int,
help='number of random runs to perform before tuning can occur')

# k is number that xxx-k methods use. It is similar to r_minimum, except it is
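
The `add_arguments_aws_ec2` hunk above replaces eight `parser.add-argument(...)` calls with `parser.add_argument(...)`. The hyphenated spelling is not an attribute access at all: Python parses it as `parser.add - argument(...)`, so the function would raise an AttributeError as soon as it ran. A minimal sketch of the corrected pattern (not part of the diff, shown only to illustrate the fix):
```
# Sketch only: the corrected argparse pattern used in add_arguments_aws_ec2.
# "parser.add-argument(...)" parses as "parser.add - argument(...)" and
# fails at runtime; the underscore form below is the real argparse API.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--ec2-region', help='Region to start instances in')
args = parser.parse_args(['--ec2-region', 'us-east-1'])
print(args.ec2_region)  # us-east-1
```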
File renamed without changes.
@@ -1,5 +1,5 @@
## Dataset arguments
train_path: data/test/pollution_1.csv
train_path: atm/data/test/pollution_1.csv
# if test_path is not supplied, train_path is assumed to point to train/test data
test_path:
data_description: "Example dataset description"
@@ -27,9 +27,9 @@ gridding: 0
# How should ATM select a particular hyperpartition (frozen set) from the
# set of all hyperpartitions?
selector: bestk
# k is number that xxx_k methods use. It is similar to r_minimum, except it is
# called k_window and determines how much "history" ATM considers for certain
# frozen selection logics.
# k is number that xxx_k methods use. It is similar to r_minimum, except it
# determines how much "history" ATM considers for certain hyperpartition
# selection logic.
k_window: 5
# Which field to use for judgment of performance
# options: f1, roc_auc, accuracy
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -1,14 +1,14 @@
# SQL dialect
dialect: sqlite
dialect: mysql
# Name of the database
database: test/atm.db
database: atm
# Username to gain access to the database
username:
username: ubuntu
# Password to gain access to the database
password:
password: ubuntu
# Host name of the device hosting the database
host:
host: 127.0.0.1
# Port on host listening for database connections
port:
port: 3306
# Optional field for specifying login details
query:
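
The new template targets the MySQL service started by the CircleCI config instead of a local SQLite file. ATM's database layer assembles these fields into a connection URL; the helper below is an illustrative sketch of how the fields combine, not ATM's actual code.
```
# Sketch only: how the sql.yaml fields above combine into a SQLAlchemy-style
# URL. This helper is illustrative and is not ATM's database code.
def make_db_url(dialect, database, username=None, password=None,
                host=None, port=None, query=None):
    if dialect == 'sqlite':
        return 'sqlite:///%s' % database
    auth = '%s:%s@' % (username, password) if username else ''
    netloc = '%s:%s' % (host, port) if port else (host or '')
    suffix = '?%s' % query if query else ''
    return '%s://%s%s/%s%s' % (dialect, auth, netloc, database, suffix)


print(make_db_url('mysql', 'atm', 'ubuntu', 'ubuntu', '127.0.0.1', 3306))
# mysql://ubuntu:ubuntu@127.0.0.1:3306/atm
print(make_db_url('sqlite', 'atm.db'))
# sqlite:///atm.db
```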
File renamed without changes.
31 changes: 20 additions & 11 deletions atm/constants.py
@@ -1,12 +1,15 @@
from __future__ import absolute_import

import os
from atm import PROJECT_ROOT
# sample tuners
from btb.tuning import Uniform as UniformTuner, GP, GPEi, GPEiVelocity
# hyperpartition selectors
from btb.selection import Uniform as UniformSelector, UCB1,\
BestKReward, BestKVelocity, RecentKReward,\
RecentKVelocity, HierarchicalByAlgorithm,\
PureBestKVelocity

from . import PROJECT_ROOT

from btb.selection import Uniform as UniformSelector
from btb.selection import (UCB1, BestKReward, BestKVelocity,
HierarchicalByAlgorithm, PureBestKVelocity,
RecentKReward, RecentKVelocity)
from btb.tuning import Uniform as UniformTuner
from btb.tuning import GP, GPEi, GPEiVelocity

# A bunch of constants which are used throughout the project, mostly for config.
# TODO: convert these lists and classes to something more elegant, like enums
@@ -26,13 +29,15 @@
HTTP_PREFIX = '^https?://'

TIME_FMT = '%Y-%m-%d %H:%M'
DATA_TEST_PATH = os.path.join(PROJECT_ROOT, 'data/test')
DATA_DL_PATH = os.path.join(PROJECT_ROOT, 'data/downloads')
METHOD_PATH = os.path.join(PROJECT_ROOT, 'methods')
LOG_PATH = os.path.join(PROJECT_ROOT, 'logs')

CUSTOM_CLASS_REGEX = '(.*\.py):(\w+)$'
JSON_REGEX = '(.*\.json)$'

N_FOLDS_DEFAULT = 10

TUNERS_MAP = {
'uniform': UniformTuner,
'gp': GP,
@@ -68,26 +73,31 @@
'ada': 'adaboost.json'
}


class ClassifierStatus:
RUNNING = 'running'
ERRORED = 'errored'
COMPLETE = 'complete'


class RunStatus:
PENDING = 'pending'
RUNNING = 'running'
COMPLETE = 'complete'


class PartitionStatus:
INCOMPLETE = 'incomplete'
GRIDDING_DONE = 'gridding_done'
ERRORED = 'errored'


class FileType:
LOCAL = 'local'
S3 = 's3'
HTTP = 'http'


# these are the strings that are used to index into results dictionaries
class Metrics:
ACCURACY = 'accuracy'
@@ -104,6 +114,7 @@ class Metrics:
PR_CURVE = 'pr_curve'
ROC_CURVE = 'roc_curve'


METRICS_BINARY = [
Metrics.ACCURACY,
Metrics.COHEN_KAPPA,
@@ -124,5 +135,3 @@ class Metrics:
]

METRICS = list(set(METRICS_BINARY + METRICS_MULTICLASS))

N_FOLDS_DEFAULT = 10
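
The comment at the top of constants.py leaves a TODO to replace these lists and classes with "something more elegant, like enums". A hedged sketch of what that could look like with the standard `enum` module (on the Python 2.7 interpreter the CI pins, the `enum34` backport would be needed); this is not part of the commit.
```
# Sketch only: one possible enum-based take on the TODO in constants.py.
# Uses the stdlib enum module (enum34 backport on Python 2.7).
from enum import Enum


class ClassifierStatus(Enum):
    RUNNING = 'running'
    ERRORED = 'errored'
    COMPLETE = 'complete'


class RunStatus(Enum):
    PENDING = 'pending'
    RUNNING = 'running'
    COMPLETE = 'complete'


# Members compare by identity and keep the original strings as .value:
assert ClassifierStatus.COMPLETE.value == 'complete'
assert RunStatus('pending') is RunStatus.PENDING
```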