From c80ad33a5010a73cf9d0e5ddb3f858a54cf256b5 Mon Sep 17 00:00:00 2001 From: blurryroots Date: Thu, 7 Dec 2023 15:42:45 +0100 Subject: [PATCH] best effort to compile legal information added 'show legal info' flag '-g' to list command download, parse and store model-specific legal info (model card) moved assemble_download_info into db module changed fetch function to download raw data, then added json wrapper added model card parser function to db --- src/piper_whistle/__init__.py | 2 +- src/piper_whistle/cli.py | 5 + src/piper_whistle/cmds.py | 122 ++++-------------- src/piper_whistle/db.py | 235 ++++++++++++++++++++++++++++++---- 4 files changed, 242 insertions(+), 122 deletions(-) diff --git a/src/piper_whistle/__init__.py b/src/piper_whistle/__init__.py index 36e9363..c529ee8 100644 --- a/src/piper_whistle/__init__.py +++ b/src/piper_whistle/__init__.py @@ -4,7 +4,7 @@ def version (): - return "1.6.28" + return "1.6.33" if '__main__' == __name__: diff --git a/src/piper_whistle/cli.py b/src/piper_whistle/cli.py index 4326406..45f29b3 100644 --- a/src/piper_whistle/cli.py +++ b/src/piper_whistle/cli.py @@ -128,6 +128,11 @@ def create_arg_parser (): , help='List available languages.' , default=False ) + list_args.add_argument ('-g', '--legal' + , action='store_true' + , help='Show avaiable legal information.' + , default=False + ) list_args.add_argument ('-p', '--install-path' , action='store_true' , help='List path of voice (if installed).' diff --git a/src/piper_whistle/cmds.py b/src/piper_whistle/cmds.py index a257617..f3c6618 100644 --- a/src/piper_whistle/cmds.py +++ b/src/piper_whistle/cmds.py @@ -63,99 +63,6 @@ def _parse_voice_selector (selector): return name, quality, speaker -def _assemble_download_info (context, code, voice_i): - """! Compile details used to download voice data. - - The assembled information may look like this: - { - 'langugage': en_GB, - 'model': { - 'url': "https://...", - 'size': 777, - 'md5': some md5 hash - }, - 'config': { - 'url': "https://...", - 'size': 777, - 'md5': some md5 hash - }, - 'samples': [ - list of URLs to a sample voice reading for each speaker - ], - 'local_path_relative': /some/local/path, - 'selection_name': selector name, - } - - @param context Context information and whistle database. - @param code Language code of voice to be downloaded. - @param voice_i Voice index to be downloaded. - @return Returns a map containing download information. - """ - index = context['db']['index'] - langdb = context['db']['languages'] - # For example: "https://huggingface.co/rhasspy/piper-voices/resolve/main" - base_url = db.remote_repo_build_branch_root (context['repo']) - - # Check if given code is available and collect meta info for later - # download and storage. - if code in langdb: - lang = langdb[code] - voices = lang["voices"] - # Select specific voice by index. - if -1 < voice_i and voice_i < len (voices): - voice_name = voices[voice_i] - holz.info (f'Requesting "{voice_name}" ...') - voice_details = index[voice_name] - - download_info = { - 'langugage': code, - 'model': None, - 'config': None, - 'samples': [], - 'local_path_relative': f'{code}/{voice_name}', - 'selection_name': f"{code}:{voice_details['name']}@{voice_details['quality']}" - } - - # Identify onnx speech model files. - for file in voice_details['files']: - if file.endswith ('.onnx'): - download_info['model'] = { - 'url': f'{base_url}/{file}', - 'size': voice_details['files'][file]['size_bytes'], - 'md5': voice_details['files'][file]['md5_digest'] - } - elif file.endswith ('.onnx.json'): - download_info['config'] = { - 'url': f'{base_url}/{file}', - 'size': voice_details['files'][file]['size_bytes'], - 'md5': voice_details['files'][file]['md5_digest'] - } - - voice_base_url = os.path.dirname (download_info['model']['url']) - - def build_sample_url (base, speaker_name, speaker_id, ext = 'mp3'): - return f'{base}/samples/{speaker_name}_{speaker_id}.{ext}' - - # samples are based on speakers. - # there is always a speaker 0 by default. - if 1 >= int (voice_details['num_speakers']): - speaker_url = build_sample_url (voice_base_url, 'speaker', 0) - download_info['samples'].append (speaker_url) - else: - for speaker_name in voice_details['speaker_id_map']: - speaker_id = int(voice_details['speaker_id_map'][speaker_name]) - speaker_url = build_sample_url (voice_base_url, 'speaker', speaker_id) - download_info['samples'].append (speaker_url) - - return download_info - else: - holz.error (f'Invalid voice index!') - else: - holz.error (f'Cannot recognize: "{code}"') - - return None - - def run_guess (context, args): """! Run command 'guess' @param context Context information and whistle database. @@ -254,6 +161,10 @@ def run_list (context, args): sys.stdout.write (f"\t{model['code']}:{model['name']}@{model['quality']}") if args.verbose: sys.stdout.write (f"\t{model['path']}") + if args.legal: + lgl = context['db']['legal'][model['name']] + a = f"Voice[{lgl['training']}]: {lgl['license']}, Reference: {lgl['reference']}, Dataset: {lgl['dataset-url']}" + sys.stdout.write (f"\t{a}") sys.stdout.write ("\n") return 0 @@ -265,7 +176,12 @@ def run_list (context, args): sys.stdout.write (f'Voices for "{code}":\n') voice_i = 0 for voice_name in lang['voices']: - sys.stdout.write (f"\t{voice_i}: {voice_name}\n") + details = voice_name + if args.legal: + lgl = context['db']['legal'][voice_name] + a = f"Voice[{lgl['training']}]: {lgl['license']}, Reference: {lgl['reference']}, Dataset: {lgl['dataset-url']}" + details = f"{details} ({a})" + sys.stdout.write (f"\t{voice_i}: {details}\n") voice_i = voice_i + 1 @@ -283,7 +199,12 @@ def run_list (context, args): if -1 < voice_i: voice_name = lang['voices'][voice_i] - sys.stdout.write (f'{voice_name}\t{voice_i}\n') + sys.stdout.write (f'{voice_name}\t{voice_i}') + if args.legal: + lgl = context['db']['legal'][voice_name] + a = f"Voice[{lgl['training']}]: {lgl['license']}, Reference: {lgl['reference']}, Dataset: {lgl['dataset-url']}" + sys.stdout.write (f'\t{a}') + sys.stdout.write ('\n') speakers = index[voice_name]['speaker_id_map'] sys.stdout.write (f'Speakers:\n') if 0 == len (speakers): @@ -295,7 +216,12 @@ def run_list (context, args): sys.stdout.write (f'Available voices ({code}):\n') voice_i = 0 for voice_name in lang['voices']: - sys.stdout.write (f"\t{voice_i}: {voice_name}\n") + sys.stdout.write (f"\t{voice_i}: {voice_name}") + if args.legal: + lgl = context['db']['legal'][voice_name] + a = f"Voice[{lgl['training']}]: {lgl['license']}, Reference: {lgl['reference']}, Dataset: {lgl['dataset-url']}" + sys.stdout.write (f'\t{a}') + sys.stdout.write ('\n') voice_i = voice_i + 1 @@ -317,7 +243,7 @@ def run_preview (context, args): holz.info (f'Looking up preview info for {code}:{voice_i}:{speaker_i} ...') speaker_url = None - download_info = _assemble_download_info (context, code, voice_i) + download_info = db.assemble_download_info (context, code, voice_i) if download_info: if -1 < speaker_i and speaker_i < len (download_info['samples']): speaker_url = download_info['samples'][speaker_i] @@ -378,7 +304,7 @@ def run_install (context, args): index = context['db']['index'] langdb = context['db']['languages'] - download_info = _assemble_download_info (context, code, voice_i) + download_info = db.assemble_download_info (context, code, voice_i) if not download_info: holz.error ('Could not find any downloads for this configuration.') return 13 diff --git a/src/piper_whistle/db.py b/src/piper_whistle/db.py index b826f07..db7b0f0 100644 --- a/src/piper_whistle/db.py +++ b/src/piper_whistle/db.py @@ -111,6 +111,122 @@ def remote_repo_build_index_url (repo_info): return index_url +def assemble_download_info (context, code, voice_i): + """! Compile details used to download voice data. + + The assembled information may look like this: + { + 'langugage': en_GB, + 'model': { + 'url': "https://...", + 'size': 777, + 'md5': some md5 hash + }, + 'config': { + 'url': "https://...", + 'size': 777, + 'md5': some md5 hash + }, + 'samples': [ + list of URLs to a sample voice reading for each speaker + ], + 'local_path_relative': /some/local/path, + 'selection_name': selector name, + } + + @param context Context information and whistle database. + @param code Language code of voice to be downloaded. + @param voice_i Voice index to be downloaded. + @return Returns a map containing download information. + """ + index = context['db']['index'] + langdb = context['db']['languages'] + # For example: "https://huggingface.co/rhasspy/piper-voices/resolve/main" + base_url = remote_repo_build_branch_root (context['repo']) + + # Check if given code is available and collect meta info for later + # download and storage. + if code in langdb: + lang = langdb[code] + voices = lang["voices"] + # Select specific voice by index. + if -1 < voice_i and voice_i < len (voices): + voice_name = voices[voice_i] + holz.info (f'Requesting "{voice_name}" ...') + voice_details = index[voice_name] + + download_info = { + 'langugage': code, + 'model': None, + 'config': None, + 'card': None, + 'samples': [], + 'local_path_relative': f'{code}/{voice_name}', + 'selection_name': f"{code}:{voice_details['name']}@{voice_details['quality']}" + } + + # Identify onnx speech model files. + for file in voice_details['files']: + if file.endswith ('.onnx'): + download_info['model'] = { + 'url': f'{base_url}/{file}', + 'size': voice_details['files'][file]['size_bytes'], + 'md5': voice_details['files'][file]['md5_digest'] + } + elif file.endswith ('.onnx.json'): + download_info['config'] = { + 'url': f'{base_url}/{file}', + 'size': voice_details['files'][file]['size_bytes'], + 'md5': voice_details['files'][file]['md5_digest'] + } + elif file.endswith ('MODEL_CARD'): + download_info['card'] = { + 'url': f'{base_url}/{file}', + 'size': voice_details['files'][file]['size_bytes'], + 'md5': voice_details['files'][file]['md5_digest'] + } + + voice_base_url = os.path.dirname (download_info['model']['url']) + + def build_sample_url (base, speaker_name, speaker_id, ext = 'mp3'): + return f'{base}/samples/{speaker_name}_{speaker_id}.{ext}' + + # samples are based on speakers. + # there is always a speaker 0 by default. + if 1 >= int (voice_details['num_speakers']): + speaker_url = build_sample_url (voice_base_url, 'speaker', 0) + download_info['samples'].append (speaker_url) + else: + for speaker_name in voice_details['speaker_id_map']: + speaker_id = int(voice_details['speaker_id_map'][speaker_name]) + speaker_url = build_sample_url (voice_base_url, 'speaker', speaker_id) + download_info['samples'].append (speaker_url) + + return download_info + else: + holz.error (f'Invalid voice index!') + else: + holz.error (f'Cannot recognize: "{code}"') + + return None + + +def _fetch_url_raw (url): + """ + """ + temp = tempfile.NamedTemporaryFile () + temp_path = temp.name + temp.close () + + r = util.download_as_stream_with_progress (url, temp_path) + if 0 < r: + holz.debug (f'Finished downloading. "{url}" => "{temp_path}"') + with open (temp_path, 'r') as f: + return f.read () + + return None + + def index_fetch_raw_filelist (repo_info): """! Query the huggingface repository for a list of model files. @param repo_info Remote repo information map. Can be obtained via @ref "remote_repo_config ()". @@ -140,34 +256,58 @@ def index_fetch_raw (repo_info): """ url = remote_repo_build_index_url (repo_info) - """ - holz.info (f'Checking remote file "{url}" ...') - r = requests.head (url) - if not (300 > r.status_code): - holz.info (f'Could not find index file at "{url}".') - return None - - s = util.float_round (int (r.headers['Content-Length']) / 1024) - holz.info (f'Appears to be present with a size of "{s}kb".') - - holz.info (f'Fetching index file ...') - r = requests.get (url) - if 300 > r.status_code: - return json.loads (r.content.decode ('utf8')) - """ temp = tempfile.NamedTemporaryFile () temp_path = temp.name temp.close () - r = util.download_as_stream_with_progress (url, temp_path) - if 0 < r: - holz.debug (f'Finished downloading. "{url}" => "{temp_path}"') - with open (temp_path, 'r') as f: - return json.loads (f.read ()) + raw_text = _fetch_url_raw (url) + if raw_text: + return json.loads (raw_text) + holz.error (f'Could note fetch index.') return None +def _parse_model_card (card_text): + lines = [line.strip () for line in card_text.split ('\n')] + entry = { + 'dataset-url': None, + 'license': None, + 'training': None, + 'reference': None + } + + read_training_status = False + ln = 0 + for line in lines: + if 0 == len (line): + continue + + if read_training_status: + l = line.lower () + if l.startswith ('finetuned'): + entry['training'] = 'Tuned' + entry['reference'] = line.replace ('Finetuned from', '').strip () + elif l.startswith ('fine-tuned'): + entry['training'] = 'Tuned' + entry['reference'] = line.replace ('Fine-tuned from', '').strip () + else: + entry['training'] = 'Original' + entry['reference'] = line.replace ('Trained from scratch', '') + if entry['reference'].endswith ('.'): + entry['reference'] = entry['reference'][:-1] + read_training_status = False + elif line.startswith ('* License:'): + entry['license'] = line.replace ('* License:', '').strip () + elif line.startswith ('* URL:'): + entry['dataset-url'] = line.replace ('* URL:', '').strip () + elif line.startswith ('## Training'): + read_training_status = True + ln += 1 + + return entry + + def index_download_and_rebuild (paths = data_paths (), repo_info = remote_repo_config ()): """! Fetch latest voice index and build lookup database, then recreates context. @@ -219,8 +359,50 @@ def index_download_and_rebuild (paths = data_paths (), repo_info = remote_repo_c holz.info (f"Database files stored at '{paths['data']}'.") + # build legal info based on model cards + legal = {} + if True: + base_url = remote_repo_build_branch_root (repo_info) + + for code in langdb: + holz.info (f'Processing "{code}":') + voice_i = 0 + for voice_name in langdb[code]['voices']: + holz.info (f"\tFetching model card for {voice_i}: {voice_name}") + dl_info = None + voice_details = index[voice_name] + # Identify onnx speech model files. + for file in voice_details['files']: + if file.endswith ('MODEL_CARD'): + dl_info = { + 'url': f'{base_url}/{file}', + 'size': voice_details['files'][file]['size_bytes'], + 'md5': voice_details['files'][file]['md5_digest'] + } + + model_card_text = _fetch_url_raw (dl_info['url']) + card = _parse_model_card (model_card_text) + + holz.info (f'\tStoring license ...') + legal[voice_name] = card + + voice_i = voice_i + 1 + + with open (os.path.join (paths['data'], 'legal.json'), 'w') as f: + json.dump (legal, f, indent = 4) + holz.info ('Regenerating context ...') - return context_create (paths, repo_info) + context = context_create (paths, repo_info) + + """TODO: consider automatic lookup / best guess of corpus data + # corpus licenses lookup + # https://raw.githubusercontent.com/coqui-ai/open-speech-corpora/master/README.md + corpus_lookup_raw = _fetch_url_raw ('https://raw.githubusercontent.com/coqui-ai/open-speech-corpora/master/README.md') + if corpus_lookup_raw: + with open (os.path.join (paths['data'], 'corpus-lookup.md'), 'w') as f: + f.write (corpus_lookup_raw) + """ + return context def _context_is_valid (context): @@ -266,17 +448,24 @@ def context_create (paths = data_paths (), repo_info = remote_repo_config ()): } if not os.path.exists (paths['index']): - holz.warn ('No database index found!') + holz.error ('No database index found!') else: with open (paths['index'], 'r') as f: db['index'] = json.load (f) if not os.path.exists (paths['languages']): - holz.warn ('No language lookup found!') + holz.error ('No language lookup found!') else: with open (paths['languages'], 'r') as f: db['languages'] = json.load (f) + lgl_path = os.path.join (paths['data'], 'legal.json') + if not os.path.exists (lgl_path): + holz.error ('No legal lookup found!') + else: + with open (lgl_path, 'r') as f: + db['legal'] = json.load (f) + context = { 'paths': paths, 'db': db,