diff --git a/README.md b/README.md index 73c630e..5830b5b 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ A minimum inference engine for DiffSinger MIDI-less mode. ## Getting Started -1. Install `onnxruntime` following the [official guidance](https://onnxruntime.ai/). +1. Install `onnxruntime` following the [official guidance](https://onnxruntime.ai/). `pip install onnxruntime-gpu` 2. Install other dependencies with `pip install PyYAML soundfile`. 3. Download ONNX version of the NSF-HiFiGAN vocoder from [here](https://github.com/openvpi/vocoders/releases/tag/nsf-hifigan-v1) and unzip it into `assets/vocoder` directory. 4. Download an ONNX rhythm predictor from [here](https://github.com/openvpi/DiffSinger/releases/tag/v1.4.1) and put it into `assets/rhythmizer` directory. @@ -12,10 +12,103 @@ A minimum inference engine for DiffSinger MIDI-less mode. 7. Run server with `python server.py` or `python server.py --config `. ## API Specification +* 版本信息 -TBD +``` +GET /version HTTP/1.1 + +HTTP/1.1 200 OK +{"version": "1.0.1", "date": "2023-01-08"} +``` + +* 模型列表 +``` +GET /models HTTP/1.1 + +HTTP/1.1 200 OK +Content-Type:application/json +{"models": ["1215_opencpop_ds1000_fix_label_nomidi"]} +``` +* 生成节奏 +``` +POST /rhythm HTTP/1.1 +Content-Type:application/json +{ + "notes":[ + {"key": 0,"duration": 0.5,"slur": false,"phonemes": ["SP"]}, + {"key": 69,"duration": 0.5,"slur": false,"phonemes": ["sh","a"]}, + {"key": 71,"duration": 1.0,"slur": true} + ] +} + +HTTP/1.1 200 OK +Content-Type:application/json +{"phonemes":[ + {"name": "SP", "duration": 0.235995352268219}, + {"name": "sh", "duration": 0.264004647731781}, + {"name": "a", "duration": 1.5} +]} +``` + +* 提交 +``` +POST /submit HTTP/1.1 +Content-Type:application/json +{ + "model": "1215_opencpop_ds1000_fix_label_nomidi", + "phonemes":[ + {"name": "SP", "duration": 0.235995352268219}, + {"name": "sh", "duration": 0.264004647731781}, + {"name": "a", "duration": 1.5} + ], + "f0":{ + "timestep": 0.01, + "values": [440.0,440.0,440.0,440.0,440.0] + }, + "speedup": 50 +} + +HTTP/1.1 200 OK +Content-Type:application/json +{ + "token": "afbc3057747f0cd98b67f01038855380", + "status": "SUBMITTED", + "code": "ae67" +} +``` +* 查询 +``` +POST /query HTTP/1.1 +Content-Type:application/json +{"token": "afbc3057747f0cd98b67f01038855380"} + +HTTP/1.1 200 OK +Content-Type:application/json +{"status": "HIT_CACHE"} +``` + +* 取消任务 +``` +POST /cancel HTTP/1.1 +Content-Type:application/json +{"token": "afbc3057747f0cd98b67f01038855380","code":"ae67"} +{"succeeded": false,"message": "Task result already in cache."} +``` +* 下载文件 +``` +GET /download?token=afbc3057747f0cd98b67f01038855380 HTTP/1.1 + +HTTP/1.1 200 ok +content-type: audio/wav +``` ## How to Obtain Acoustic Models 1. [Train with your own dataset](https://github.com/openvpi/DiffSinger/blob/refactor/pipelines/no_midi_preparation.ipynb) or download pretrained checkpoints from [here](https://github.com/openvpi/DiffSinger/releases/tag/v1.4.0). 2. Export PyTorch checkpoints to ONNX format. See instructions [here](https://github.com/openvpi/DiffSinger/blob/refactor/docs/README-SVS-onnx.md). + +## 声明: + 请确保你制作数据集的数据来源合法合规,且数据提供者明确你在制作什么以及可能造成的后果 + 该项目为歌声合成项目,无法进行其他用途,请知晓 + 本项目数据集来源:[Opencpop](https://wenet.org.cn/opencpop/liscense/) + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f73d389 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +onnxruntime-gpu +PyYAML +soundfile +librosa +httpx diff --git a/server.py b/server.py index d458775..8dbbda8 100644 --- a/server.py +++ b/server.py @@ -23,7 +23,13 @@ format="%(asctime)s - %(levelname)-7s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S") +''' +GET /version HTTP/1.1 +HTTP/1.1 200 OK +Content-Type:application/json +{"version": "1.0.1", "date": "2023-01-08"} +''' def version(request: BaseHTTPRequestHandler): v = { 'version': VERSION, @@ -34,7 +40,13 @@ def version(request: BaseHTTPRequestHandler): request.end_headers() request.wfile.write(json.dumps(v).encode('utf8')) +''' +GET /models HTTP/1.1 +HTTP/1.1 200 OK +Content-Type:application/json +{"models": ["1215_opencpop_ds1000_fix_label_nomidi"]} +''' def models(request: BaseHTTPRequestHandler): res = { 'models': [os.path.basename(file)[:-5] for file in glob.glob(os.path.join(ACOUSTIC_ROOT, '*.onnx'))] @@ -45,81 +57,86 @@ def models(request: BaseHTTPRequestHandler): request.wfile.write(json.dumps(res).encode('utf8')) -def rhythm(request: BaseHTTPRequestHandler): - """ - Example: - { - "notes": [ - { - "key": 0, - "duration": 0.5, - "slur": false, - "phonemes": [ - "SP" - ] - }, - { - "key": 69, - "duration": 0.5, - "slur": false, - "phonemes": [ - "sh", - "a" - ] - }, - { - "key": 71, - "duration": 1.0, - "slur": true - } - ] - } - """ - request_body = json.loads(request.rfile.read(int(request.headers['Content-Length']))) - ph_seq, ph_dur = synthesis.predict_rhythm(request_body['notes'], phoneme_list, vowels, config) - res = { - 'phonemes': [ - { - 'name': name, - 'duration': duration - } - for name, duration in zip(ph_seq, ph_dur) - ] - } +def getdict(request: BaseHTTPRequestHandler): + res = dictionary request.send_response(200) request.send_header('Content-Type', 'application/json') request.end_headers() request.wfile.write(json.dumps(res).encode('utf8')) +""" +POST /rhythm HTTP/1.1 +Content-Type:application/json +{ + "notes":[ + {"key": 0,"duration": 0.5,"slur": false,"phonemes": ["SP"]}, + {"key": 69,"duration": 0.5,"slur": false,"phonemes": ["sh","a"]}, + {"key": 71,"duration": 1.0,"slur": true} + ] +} -def submit(request: BaseHTTPRequestHandler): - """ - Example: - { - "model": "1215_opencpop_ds1000_fix_label_nomidi", - "phonemes": [ - { - "name": "SP", - "duration": 0.5 - }, - { - "name": "SP", - "duration": 0.5 - } - ], - "f0": { - "timestep": 0.01, - "values": [ - 440.0, - 440.0, - 440.0, - 440.0, - 440.0 +HTTP/1.1 200 OK +Content-Type:application/json +{"phonemes":[ + {"name": "SP", "duration": 0.235995352268219}, + {"name": "sh", "duration": 0.264004647731781}, + {"name": "a", "duration": 1.5} +]} +""" +def rhythm(request: BaseHTTPRequestHandler): + try: + request_body = json.loads(request.rfile.read(int(request.headers['Content-Length']))) + ph_seq, ph_dur = synthesis.predict_rhythm(request_body['notes'], phoneme_list, vowels, config) + res = { + 'phonemes': [ + { + 'name': name, + 'duration': duration + } + for name, duration in zip(ph_seq, ph_dur) ] - }, - "speedup": 50 } - """ + request.send_response(200) + request.send_header('Content-Type', 'application/json') + request.end_headers() + request.wfile.write(json.dumps(res).encode('utf8')) + except Exception as e: + res = { + 'error': str(repr(e)) + } + request.send_response(400) + request.send_header('Content-Type', 'application/json') + request.end_headers() + request.wfile.write(json.dumps(res).encode('utf8')) + raise e + + +""" +POST /submit HTTP/1.1 +Content-Type:application/json +{ + "model": "1215_opencpop_ds1000_fix_label_nomidi", + "phonemes":[ + {"name": "SP", "duration": 0.235995352268219}, + {"name": "sh", "duration": 0.264004647731781}, {"name": "a", "duration": 1.5} + ], + "f0":{ + "timestep": 0.01, + "values": [440.0,440.0,440.0,440.0,440.0] + }, + "speedup": 50 +} + +HTTP/1.1 200 OK +Content-Type:application/json +{ + "token": "afbc3057747f0cd98b67f01038855380", + "status": "SUBMITTED", + "code": "ae67" +} +""" +def submit(request: BaseHTTPRequestHandler): + try: request_body = json.loads(request.rfile.read(int(request.headers['Content-Length']))) if 'speedup' not in request_body: request_body['speedup'] = config['acoustic']['speedup'] @@ -150,11 +167,30 @@ def submit(request: BaseHTTPRequestHandler): request.send_header('Content-Type', 'application/json') request.end_headers() request.wfile.write(json.dumps(res).encode('utf8')) + except Exception as e: + res = { + 'error': str(repr(e)) + } + request.send_response(400) + request.send_header('Content-Type', 'application/json') + request.end_headers() + request.wfile.write(json.dumps(res).encode('utf8')) + raise e +''' +POST /query HTTP/1.1 +Content-Type:application/json +{"token": "afbc3057747f0cd98b67f01038855380"} +HTTP/1.1 200 OK +Content-Type:application/json +{"status": "HIT_CACHE"} +''' def query(request: BaseHTTPRequestHandler): + try: request_body = json.loads(request.rfile.read(int(request.headers['Content-Length']))) token = request_body['token'] + cache_file = os.path.join(cache, f'{token}.wav') if os.path.exists(cache_file): res = { @@ -197,9 +233,23 @@ def query(request: BaseHTTPRequestHandler): else: request.send_error(404) mutex.release() - - + except Exception as e: + res = { + 'error': str(repr(e)) + } + request.send_response(400) + request.send_header('Content-Type', 'application/json') + request.end_headers() + request.wfile.write(json.dumps(res).encode('utf8')) + raise e +''' +POST /cancel HTTP/1.1 +Content-Type:application/json +{"token": "afbc3057747f0cd98b67f01038855380","code":"ae67"} +{"succeeded": false,"message": "Task result already in cache."} +''' def cancel(request: BaseHTTPRequestHandler): + try: request_body = json.loads(request.rfile.read(int(request.headers['Content-Length']))) token = request_body['token'] code = request_body['code'] @@ -228,9 +278,23 @@ def cancel(request: BaseHTTPRequestHandler): request.send_header('Content-Type', 'application/json') request.end_headers() request.wfile.write(json.dumps(res).encode('utf8')) + except Exception as e: + res = { + 'error': str(repr(e)) + } + request.send_response(400) + request.send_header('Content-Type', 'application/json') + request.end_headers() + request.wfile.write(json.dumps(res).encode('utf8')) + raise e +''' +GET /download?token=afbc3057747f0cd98b67f01038855380 HTTP/1.1 - +HTTP/1.1 200 ok +content-type: audio/wav +''' def download(request: BaseHTTPRequestHandler): + try: params = dict(urllib.parse.parse_qsl(urllib.parse.urlsplit(request.path).query)) token = params['token'] cache_file = os.path.join(cache, f'{token}.wav') @@ -242,7 +306,15 @@ def download(request: BaseHTTPRequestHandler): request.wfile.write(f.read()) else: request.send_response(404) - + except Exception as e: + res = { + 'error': str(repr(e)) + } + request.send_response(400) + request.send_header('Content-Type', 'application/json') + request.end_headers() + request.wfile.write(json.dumps(res).encode('utf8')) + raise e def _execute(request: dict, cache_file: str, token: str): logging.info(f'Task \'{token}\' begins') @@ -282,6 +354,7 @@ def _execute(request: dict, cache_file: str, token: str): apis = { '/version': (version, ['GET']), '/models': (models, ['GET']), + '/getdict': (getdict, ['GET']), '/rhythm': (rhythm, ['POST']), '/submit': (submit, ['POST']), '/query': (query, ['POST']), @@ -289,7 +362,7 @@ def _execute(request: dict, cache_file: str, token: str): '/download': (download, ['GET']) } mutex = threading.Lock() - + class Request(BaseHTTPRequestHandler): def do_GET(self): diff --git a/synthesis.py b/synthesis.py index 938adcf..f52c059 100644 --- a/synthesis.py +++ b/synthesis.py @@ -118,12 +118,16 @@ def acoustic_preprocess(name2token: list, def acoustic_infer(model: str, providers: list, tokens, durations, f0, speedup): session = utils.create_session(model, providers) mel = session.run(['mel'], {'tokens': tokens, 'durations': durations, 'f0': f0, 'speedup': speedup})[0] + session.end_profiling() + del(session) return mel def vocoder_infer(model: str, providers: list, mel, f0, force_on_cpu=True): session = utils.create_session(model, providers, force_on_cpu=force_on_cpu) waveform = session.run(['waveform'], {'mel': mel, 'f0': f0})[0] + session.end_profiling() + del(session) return waveform diff --git a/utils.py b/utils.py index 65b1c6b..62befb1 100644 --- a/utils.py +++ b/utils.py @@ -62,7 +62,6 @@ def create_session(model_path: str, providers: list, force_on_cpu: bool = False) # Create inference session session = ort.InferenceSession(path_or_bytes=model_path, sess_options=options, providers=providers) - return session