Skip to content

Commit

Permalink
Add basic operations for UAI Train job control,
Browse files Browse the repository at this point in the history
Including:
Create job
List job
Get job info
Stop job
Delete job

All five AI architectures are supported
  • Loading branch information
宋翔 authored and 宋翔 committed Nov 14, 2017
1 parent 3ba8b49 commit 9d4e7a7
Show file tree
Hide file tree
Showing 30 changed files with 1,362 additions and 45 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,13 @@
- Tensorflow (0.11.0 tested)
- Tensorflow(1.1.0 tested)
- Tensorflow(1.2.0 tested)
- Tensorflow (1.3.0 tested)
- Tensorflow (1.4.0 tested)
- MXNet(0.9.5 tested)
- MXNet(0.11.0 tested)
- Keras(1.2.0 tested)
- Caffe(1.0.0 tested)
- PyTorch(0.2.0 tested)

## How to install
1. Install your deep learning python package, such as Tensorflow, MXNet, Keras, Caffe (tested version preferred)
Expand All @@ -40,4 +44,4 @@
### UAI Service Docs
https://docs.ucloud.cn/ai/uai-service/use
### UAI Train Docs
https://docs.ucloud.cn/ai/uai-train/use
https://docs.ucloud.cn/ai/uai-train/use
1 change: 1 addition & 0 deletions uai/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import tarfile
import json

GATEWAY_DEFAULT='Default'

def _verfy_ac(private_key, params):
items = params.items()
Expand Down
31 changes: 31 additions & 0 deletions uai/utils/utils_ufs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2017 The UAI-SDK Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import re

UFS_MOUNT_POINT_FORMAT = r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\/ufs-\w+'
UFS_PATH_FORMAT = r'(\w+\/)+'


def concat_ufs_path(path, mount_point):
    """Check a UFS mount point and a path, then join them with a '/'.

    Both checks use ``re.match``, so each pattern only has to match at the
    start of its string; anything after the matched prefix is not inspected.

    Args:
        path: path inside the UFS volume; must start with ``xxx/``.
        mount_point: UFS mount point; must start with ``x.x.x.x:/ufs-xxx``.

    Returns:
        ``mount_point`` and ``path`` joined by a single '/'.

    Raises:
        RuntimeError: if either argument does not match its pattern. The
            mount point is checked first.
    """
    # A match object is always truthy, so "not match" means "no match".
    if not re.match(UFS_MOUNT_POINT_FORMAT, mount_point):
        raise RuntimeError("UFS mount point should be in format x.x.x.x:/ufs-xxx")

    if not re.match(UFS_PATH_FORMAT, path):
        raise RuntimeError("UFS path should match xxx/xxx/")

    return '/'.join((mount_point, path))
11 changes: 8 additions & 3 deletions uaitrain/api/base_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import requests
import json

from uai.utils.utils import _verfy_ac
from uai.utils.logger import uai_logger
from uai.utils.retcode_checker import *
Expand All @@ -23,6 +24,10 @@
DEFAULT_UAI_TRAIN_REGION = 'cn-bj2'
DEFAULT_UAI_TRAIN_ZONE = 'cn-bj2-04'

# DEFAULT_UCLOUD_API_URL = 'http://api.pre.ucloudadmin.com'
# DEFAULT_UAI_TRAIN_REGION = 'pre'
# DEFAULT_UAI_TRAIN_ZONE = 'pre'

PARAM_ACTION = 'Action'
PARAM_PUBLIC_KEY = 'PublicKey'
PARAM_PROJECT_ID = 'ProjectId'
Expand Down Expand Up @@ -60,7 +65,7 @@ def _cmd_common_request(self):
self.cmd_params.pop('Signature')
self.cmd_params['Signature'] = _verfy_ac(self.priv_key,
self.cmd_params)

print (self.cmd_params)
uai_logger.info("Call http request: {0} ".format(get_request(self.cmd_url, params=self.cmd_params)))
r = requests.get(self.cmd_url, params=self.cmd_params)
rsp = json.loads(r.text, 'utf-8')
Expand All @@ -70,7 +75,7 @@ def _cmd_common_request(self):
return False, rsp
else:
del rsp[PARAM_ACTION]
uai_logger.info("{0} Success: {1}".format(self.cmd_params[PARAM_ACTION], get_response(rsp, 0)))
#uai_logger.info("{0} Success: {1}".format(self.cmd_params[PARAM_ACTION], get_response(rsp, 0)))
return True, rsp
# add other operations in subclasses#

Expand All @@ -89,5 +94,5 @@ def call_api(self):

return self._cmd_common_request()

def check_errcode():
def check_errcode(self):
pass
94 changes: 94 additions & 0 deletions uaitrain/api/create_train_job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Copyright 2017 The UAI-SDK Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from uaitrain.api.base_op import BaseUAITrainAPIOp

class CreateUAITrainJobOp(BaseUAITrainAPIOp):
    """Request wrapper for the UAI Train CreateUAITrainJob API func.

    Input:
        pub_key             string(required)     Public key of the user
        priv_key            string(required)     Private key of the user
        project_id          int(optional)        Project ID of the job
        region              string(optional)     Which Region to run the job
        zone                string(optional)     Which Zone in the Region to run the job
        job_name            string(required)     Job name of the job
        work_id             int(required)        The id of the train node; details are available
                                                 from GetUAITrainAvailableResourceOp.
                                                     1860001, include 1 GPU
                                                     1860003, include 4 GPU
                                                     etc.
        code_uhub_path      string(required)     Which image in the uhub to run the job
        data_ufile_path     string(required)     The ufile path of input data
        out_ufile_path      string(required)     The ufile path of output data
        docker_cmd          string(required)     The cmd used to run the job
        max_exec_time       int(required)        The max exec time of the job. If the job does not
                                                 finish in that time, the system will stop it.
        business_group      string(optional)     Which business group to run the job
        job_memo            string(optional)     The memo of the job
    Output:
        RetCode             int(required)        Op return code: 0: success, others: error code
        TrainJObID          string(required)     The id of the train job
                                                 (NOTE(review): field spelling kept from the
                                                 original docs; confirm against the API)
        Message             string(not required) Message: error description
    """

    ACTION_NAME = "CreateUAITrainJob"

    # (cmd_params key, constructor argument name, expected type) for every
    # argument that _check_args requires to be non-empty and well-typed.
    _REQUIRED_ARGS = (
        ("TrainJobName", "job_name", str),
        ("TrainWorkId", "work_id", int),
        ("CodeUhubPath", "code_uhub_path", str),
        ("DataUfilePath", "data_ufile_path", str),
        ("OutputUfilePath", "out_ufile_path", str),
        ("DockerCmd", "docker_cmd", str),
        ("MaxExecuteTime", "max_exec_time", int),
    )

    def __init__(self, pub_key, priv_key, job_name, work_id, code_uhub_path, data_ufile_path, out_ufile_path,
                 docker_cmd, max_exec_time, business_group="", job_memo="", project_id="",
                 region="", zone=""):
        """Store the job description in ``self.cmd_params``; nothing is validated here."""
        super(CreateUAITrainJobOp, self).__init__(self.ACTION_NAME,
                                                  pub_key,
                                                  priv_key,
                                                  project_id,
                                                  region,
                                                  zone)
        self.cmd_params["TrainJobName"] = job_name
        self.cmd_params["TrainWorkId"] = work_id
        self.cmd_params["CodeUhubPath"] = code_uhub_path
        self.cmd_params["DataUfilePath"] = data_ufile_path
        self.cmd_params["OutputUfilePath"] = out_ufile_path
        self.cmd_params["DockerCmd"] = docker_cmd
        # PredictStartTime is always sent as 0 by this op.
        self.cmd_params["PredictStartTime"] = 0
        self.cmd_params["MaxExecuteTime"] = max_exec_time

        # The key pair is also forwarded as explicit request parameters.
        self.cmd_params["TrainPublicKey"] = pub_key
        self.cmd_params["TrainPrivateKey"] = priv_key

        self.cmd_params["TrainJobMemo"] = job_memo
        self.cmd_params["BusinessGroup"] = business_group

    def _check_args(self):
        """Run the base-class checks, then validate every required argument.

        Raises:
            RuntimeError: if a required argument is an empty string or is not
                of its expected type (``bool`` is not accepted as ``int``).
        """
        super(CreateUAITrainJobOp, self)._check_args()
        for key, arg_name, expected in self._REQUIRED_ARGS:
            value = self.cmd_params[key]
            # bool is a subclass of int; keep rejecting it, as the previous
            # exact-type comparison did.
            well_typed = isinstance(value, expected) and not isinstance(value, bool)
            if value == "" or not well_typed:
                raise RuntimeError(
                    "{0} shoud be <{1}> and is not nil.".format(arg_name, expected.__name__))
49 changes: 49 additions & 0 deletions uaitrain/api/get_train_available_resource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright 2017 The UAI-SDK Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from uaitrain.api.base_op import BaseUAITrainAPIOp

class GetUAITrainAvailableResourceOp(BaseUAITrainAPIOp):
    """Request wrapper for the UAI Train GetUAITrainAvailableResource API func.

    Input:
        pub_key             string(required)     Public key of the user
        priv_key            string(required)     Private key of the user
        project_id          int(optional)        Project ID of the job
        region              string(optional)     Which Region to run the job
        zone                string(optional)     Which Zone in the Region to run the job
        node_type           string(optional)     The type of node, default is 'Work'.
                                                     'Work': train node
                                                     'PS': param node
    Output:
        RetCode             int(required)        Op return code: 0: success, others: error code
        TotalCount          string(required)     The count of result
        Message             string(not required) Message: error description
        DataSet             []                   The detailed information of resource
    """

    ACTION_NAME = "GetUAITrainAvailableResource"

    def __init__(self, pub_key, priv_key, node_type='Work', project_id="", region="", zone=""):
        """Store the node type in ``self.cmd_params`` as ``NodeType``."""
        super(GetUAITrainAvailableResourceOp, self).__init__(self.ACTION_NAME,
                                                             pub_key,
                                                             priv_key,
                                                             project_id,
                                                             region,
                                                             zone)
        self.cmd_params["NodeType"] = node_type

    def _check_args(self):
        """Run only the base-class checks; this op adds none of its own."""
        super(GetUAITrainAvailableResourceOp, self)._check_args()
64 changes: 64 additions & 0 deletions uaitrain/api/get_train_job_bill_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Copyright 2017 The UAI-SDK Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from uaitrain.api.base_op import BaseUAITrainAPIOp

class GetUAITrainBillInfoOp(BaseUAITrainAPIOp):
    """Request wrapper for the UAI Train GetUAITrainBillInfo API func.

    Input:
        pub_key             string(required)     Public key of the user
        priv_key            string(required)     Private key of the user
        project_id          int(optional)        Project ID of the job
        region              string(optional)     Which Region to run the job
        zone                string(optional)     Which Zone in the Region to run the job
        beg_time            int(required)        The start time of bill
                                                 (presumably a Unix timestamp; confirm against the API)
        end_time            int(required)        The end time of bill; must not be earlier than beg_time
        offset              int(optional)        The offset of list
        limit               int(optional)        The max num of returned list, return all bill list
                                                 if it isn't set
    Output:
        RetCode             int(required)        Op return code: 0: success, others: error code
        TotalCount          string(required)     The count of result
        TotalExecuteTime    int(required)        Total exec time of all train job
        TotalPrice          int(required)        Total price of all train job
        Message             string(not required) Message: error description
        DataSet             []                   The detailed bill information of train job
    """

    ACTION_NAME = "GetUAITrainBillInfo"

    def __init__(self, pub_key, priv_key, beg_time, end_time, offset="", limit="", project_id="", region="", zone=""):
        """Store the time range and paging arguments in ``self.cmd_params``."""
        super(GetUAITrainBillInfoOp, self).__init__(self.ACTION_NAME,
                                                    pub_key,
                                                    priv_key,
                                                    project_id,
                                                    region,
                                                    zone)
        self.cmd_params["BeginTime"] = beg_time
        self.cmd_params["EndTime"] = end_time
        self.cmd_params["Offset"] = offset
        self.cmd_params["Limit"] = limit

    def _check_args(self):
        """Run the base-class checks, then validate the bill time range.

        Raises:
            RuntimeError: if beg_time or end_time is an empty string or not an
                int (``bool`` is not accepted), or if beg_time is later than
                end_time.
        """
        super(GetUAITrainBillInfoOp, self)._check_args()
        beg_time = self.cmd_params["BeginTime"]
        end_time = self.cmd_params["EndTime"]

        # bool is a subclass of int; keep rejecting it, as the previous
        # exact-type comparison did.
        if beg_time == "" or not isinstance(beg_time, int) or isinstance(beg_time, bool):
            raise RuntimeError("beg_time shoud be <int> and is not nil.")
        if end_time == "" or not isinstance(end_time, int) or isinstance(end_time, bool):
            raise RuntimeError("end_time shoud be <int> and is not nil.")

        if beg_time > end_time:
            raise RuntimeError("end_time should be greater than beg_time. end_time: {0}, beg_time: {1}".
                               format(end_time, beg_time))
52 changes: 52 additions & 0 deletions uaitrain/api/get_train_job_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright 2017 The UAI-SDK Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from uaitrain.api.base_op import BaseUAITrainAPIOp

class GetUAITrainJobListOp(BaseUAITrainAPIOp):
    """Request wrapper for the UAI Train GetUAITrainJobList API func.

    Input:
        pub_key             string(required)     Public key of the user
        priv_key            string(required)     Private key of the user
        project_id          int(optional)        Project ID of the job
        region              string(optional)     Which Region to run the job
        zone                string(optional)     Which Zone in the Region to run the job
        job_id              string(optional)     Which train job to get info
        offset              int(optional)        The offset of list
        limit               int(optional)        The max num of returned list, return all job list
                                                 if it isn't set
    Output:
        RetCode             int(required)        Op return code: 0: success, others: error code
        TotalCount          string(required)     The count of result
        Message             string(not required) Message: error description
        DataSet             []                   The detailed information of train job
    """

    ACTION_NAME = "GetUAITrainJobList"

    def __init__(self, pub_key, priv_key, job_id="", offset="", limit="", project_id="", region="", zone=""):
        """Store the job filter and paging arguments in ``self.cmd_params``."""
        super(GetUAITrainJobListOp, self).__init__(self.ACTION_NAME,
                                                   pub_key,
                                                   priv_key,
                                                   project_id,
                                                   region,
                                                   zone)
        self.cmd_params["TrainJobId"] = job_id
        self.cmd_params["Offset"] = offset
        self.cmd_params["Limit"] = limit

    def _check_args(self):
        """Run only the base-class checks; this op adds none of its own."""
        super(GetUAITrainJobListOp, self)._check_args()
Loading

0 comments on commit 9d4e7a7

Please sign in to comment.