From c80ad33a5010a73cf9d0e5ddb3f858a54cf256b5 Mon Sep 17 00:00:00 2001
From: blurryroots <blurryroots@posteo.de>
Date: Thu, 7 Dec 2023 15:42:45 +0100
Subject: [PATCH] best effort to compile legal information

added 'show legal info' flag '-g' to list command
download, parse and store model-specific legal info (model card)
moved assemble_download_info into db module
changed fetch function to download raw data, then added json wrapper
added model card parser function to db
---
 src/piper_whistle/__init__.py |   2 +-
 src/piper_whistle/cli.py      |   5 +
 src/piper_whistle/cmds.py     | 122 ++++--------------
 src/piper_whistle/db.py       | 235 ++++++++++++++++++++++++++++++----
 4 files changed, 242 insertions(+), 122 deletions(-)

diff --git a/src/piper_whistle/__init__.py b/src/piper_whistle/__init__.py
index 36e9363..c529ee8 100644
--- a/src/piper_whistle/__init__.py
+++ b/src/piper_whistle/__init__.py
@@ -4,7 +4,7 @@
 
 
 def version ():
-	return "1.6.28"
+	return "1.6.33"
 
 
 if '__main__' == __name__:
diff --git a/src/piper_whistle/cli.py b/src/piper_whistle/cli.py
index 4326406..45f29b3 100644
--- a/src/piper_whistle/cli.py
+++ b/src/piper_whistle/cli.py
@@ -128,6 +128,11 @@ def create_arg_parser ():
 		, help='List available languages.'
 		, default=False
 	)
+	list_args.add_argument ('-g', '--legal'
+		, action='store_true'
+		, help='Show avaiable legal information.'
+		, default=False
+	)
 	list_args.add_argument ('-p', '--install-path'
 		, action='store_true'
 		, help='List path of voice (if installed).'
diff --git a/src/piper_whistle/cmds.py b/src/piper_whistle/cmds.py
index a257617..f3c6618 100644
--- a/src/piper_whistle/cmds.py
+++ b/src/piper_whistle/cmds.py
@@ -63,99 +63,6 @@ def _parse_voice_selector (selector):
 	return name, quality, speaker
 
 
-def _assemble_download_info (context, code, voice_i):
-	"""! Compile details used to download voice data.
-
-	The assembled information may look like this:
-	{
-		'langugage': en_GB,
-		'model': {
-			'url': "https://...",
-			'size': 777,
-			'md5': some md5 hash
-		},
-		'config': {
-			'url': "https://...",
-			'size': 777,
-			'md5': some md5 hash
-		},
-		'samples': [
-			list of URLs to a sample voice reading for each speaker
-		],
-		'local_path_relative': /some/local/path,
-		'selection_name': selector name,
-	}	
-
-	@param context Context information and whistle database.
-	@param code Language code of voice to be downloaded.
-	@param voice_i Voice index to be downloaded.
-	@return Returns a map containing download information.
-	"""
-	index = context['db']['index']
-	langdb = context['db']['languages']
-	# For example: "https://huggingface.co/rhasspy/piper-voices/resolve/main"
-	base_url = db.remote_repo_build_branch_root (context['repo'])
-
-	# Check if given code is available and collect meta info for later
-	# download and storage.
-	if code in langdb:
-		lang = langdb[code]
-		voices = lang["voices"]
-		# Select specific voice by index.
-		if -1 < voice_i and voice_i < len (voices):
-			voice_name = voices[voice_i]
-			holz.info (f'Requesting "{voice_name}" ...')
-			voice_details = index[voice_name]
-			
-			download_info = {
-				'langugage': code,
-				'model': None,
-				'config': None,
-				'samples': [],
-				'local_path_relative': f'{code}/{voice_name}',
-				'selection_name': f"{code}:{voice_details['name']}@{voice_details['quality']}"
-			}
-
-			# Identify onnx speech model files.
-			for file in voice_details['files']:
-				if file.endswith ('.onnx'):
-					download_info['model'] = {
-						'url': f'{base_url}/{file}',
-						'size': voice_details['files'][file]['size_bytes'],
-						'md5': voice_details['files'][file]['md5_digest']
-					}
-				elif file.endswith ('.onnx.json'):
-					download_info['config'] = {
-						'url': f'{base_url}/{file}',
-						'size': voice_details['files'][file]['size_bytes'],
-						'md5': voice_details['files'][file]['md5_digest']
-					}
-
-			voice_base_url = os.path.dirname (download_info['model']['url'])
-
-			def build_sample_url (base, speaker_name, speaker_id, ext = 'mp3'):
-				return f'{base}/samples/{speaker_name}_{speaker_id}.{ext}'
-
-			# samples are based on speakers.
-			# there is always a speaker 0 by default.
-			if 1 >= int (voice_details['num_speakers']):
-				speaker_url = build_sample_url (voice_base_url, 'speaker', 0)
-				download_info['samples'].append (speaker_url)
-			else:
-				for speaker_name in voice_details['speaker_id_map']:
-					speaker_id = int(voice_details['speaker_id_map'][speaker_name])
-					speaker_url = build_sample_url (voice_base_url, 'speaker', speaker_id)
-					download_info['samples'].append (speaker_url)
-
-			return download_info
-		else:
-			holz.error (f'Invalid voice index!')
-	else:
-		holz.error (f'Cannot recognize: "{code}"')
-
-	return None
-
-
 def run_guess (context, args):
 	"""! Run command 'guess'
 	@param context Context information and whistle database.
@@ -254,6 +161,10 @@ def run_list (context, args):
 			sys.stdout.write (f"\t{model['code']}:{model['name']}@{model['quality']}")
 			if args.verbose:
 				sys.stdout.write (f"\t{model['path']}")
+			if args.legal:
+				lgl = context['db']['legal'][model['name']]
+				a = f"Voice[{lgl['training']}]: {lgl['license']}, Reference: {lgl['reference']}, Dataset: {lgl['dataset-url']}"
+				sys.stdout.write (f"\t{a}")
 			sys.stdout.write ("\n")
 
 		return 0
@@ -265,7 +176,12 @@ def run_list (context, args):
 			sys.stdout.write (f'Voices for "{code}":\n')
 			voice_i = 0
 			for voice_name in lang['voices']:
-				sys.stdout.write (f"\t{voice_i}: {voice_name}\n")
+				details = voice_name
+				if args.legal:
+					lgl = context['db']['legal'][voice_name]
+					a = f"Voice[{lgl['training']}]: {lgl['license']}, Reference: {lgl['reference']}, Dataset: {lgl['dataset-url']}"
+					details = f"{details} ({a})"
+				sys.stdout.write (f"\t{voice_i}: {details}\n")
 
 				voice_i = voice_i + 1
 
@@ -283,7 +199,12 @@ def run_list (context, args):
 	if -1 < voice_i:
 		voice_name = lang['voices'][voice_i]
 
-		sys.stdout.write (f'{voice_name}\t{voice_i}\n')
+		sys.stdout.write (f'{voice_name}\t{voice_i}')
+		if args.legal:
+			lgl = context['db']['legal'][voice_name]
+			a = f"Voice[{lgl['training']}]: {lgl['license']}, Reference: {lgl['reference']}, Dataset: {lgl['dataset-url']}"
+			sys.stdout.write (f'\t{a}')
+		sys.stdout.write ('\n')
 		speakers = index[voice_name]['speaker_id_map']
 		sys.stdout.write (f'Speakers:\n')
 		if 0 == len (speakers):
@@ -295,7 +216,12 @@ def run_list (context, args):
 		sys.stdout.write (f'Available voices ({code}):\n')
 		voice_i = 0
 		for voice_name in lang['voices']:
-			sys.stdout.write (f"\t{voice_i}: {voice_name}\n")
+			sys.stdout.write (f"\t{voice_i}: {voice_name}")
+			if args.legal:
+				lgl = context['db']['legal'][voice_name]
+				a = f"Voice[{lgl['training']}]: {lgl['license']}, Reference: {lgl['reference']}, Dataset: {lgl['dataset-url']}"
+				sys.stdout.write (f'\t{a}')
+			sys.stdout.write ('\n')
 
 			voice_i = voice_i + 1
 
@@ -317,7 +243,7 @@ def run_preview (context, args):
 	
 	holz.info (f'Looking up preview info for {code}:{voice_i}:{speaker_i} ...')
 	speaker_url = None
-	download_info = _assemble_download_info (context, code, voice_i)
+	download_info = db.assemble_download_info (context, code, voice_i)
 	if download_info:
 		if -1 < speaker_i and speaker_i < len (download_info['samples']):
 			speaker_url = download_info['samples'][speaker_i]
@@ -378,7 +304,7 @@ def run_install (context, args):
 	index = context['db']['index']
 	langdb = context['db']['languages']
 
-	download_info = _assemble_download_info (context, code, voice_i)
+	download_info = db.assemble_download_info (context, code, voice_i)
 	if not download_info:
 		holz.error ('Could not find any downloads for this configuration.')
 		return 13
diff --git a/src/piper_whistle/db.py b/src/piper_whistle/db.py
index b826f07..db7b0f0 100644
--- a/src/piper_whistle/db.py
+++ b/src/piper_whistle/db.py
@@ -111,6 +111,122 @@ def remote_repo_build_index_url (repo_info):
 	return index_url
 
 
+def assemble_download_info (context, code, voice_i):
+	"""! Compile details used to download voice data.
+
+	The assembled information may look like this:
+	{
+		'langugage': en_GB,
+		'model': {
+			'url': "https://...",
+			'size': 777,
+			'md5': some md5 hash
+		},
+		'config': {
+			'url': "https://...",
+			'size': 777,
+			'md5': some md5 hash
+		},
+		'card': same layout as 'model', or None without a MODEL_CARD file,
+		'samples': [
+			list of URLs to a sample voice reading for each speaker
+		],
+		'local_path_relative': /some/local/path,
+		'selection_name': selector name,
+	}
+
+	The key 'langugage' is misspelled; it is kept as is, because code
+	outside of this function may already read it under that name.
+
+	@param context Context information and whistle database.
+	@param code Language code of voice to be downloaded.
+	@param voice_i Voice index to be downloaded.
+	@return Returns a map containing download information, or None if the
+		language code, the voice index or the model file cannot be found.
+	"""
+	index = context['db']['index']
+	langdb = context['db']['languages']
+	# For example: "https://huggingface.co/rhasspy/piper-voices/resolve/main"
+	base_url = remote_repo_build_branch_root (context['repo'])
+
+	# Check if given code is available and collect meta info for later
+	# download and storage.
+	if code not in langdb:
+		holz.error (f'Cannot recognize: "{code}"')
+		return None
+
+	voices = langdb[code]["voices"]
+	# Select specific voice by index.
+	if not (-1 < voice_i and voice_i < len (voices)):
+		holz.error ('Invalid voice index!')
+		return None
+
+	voice_name = voices[voice_i]
+	holz.info (f'Requesting "{voice_name}" ...')
+	voice_details = index[voice_name]
+
+	download_info = {
+		'langugage': code,
+		'model': None,
+		'config': None,
+		'card': None,
+		'samples': [],
+		'local_path_relative': f'{code}/{voice_name}',
+		'selection_name': f"{code}:{voice_details['name']}@{voice_details['quality']}"
+	}
+
+	# Map file name endings onto the entry they describe. The endings are
+	# mutually exclusive ('.onnx.json' does not end with '.onnx').
+	endings = (('.onnx', 'model'), ('.onnx.json', 'config'), ('MODEL_CARD', 'card'))
+	for file, meta in voice_details['files'].items ():
+		for ending, key in endings:
+			if file.endswith (ending):
+				download_info[key] = {
+					'url': f'{base_url}/{file}',
+					'size': meta['size_bytes'],
+					'md5': meta['md5_digest']
+				}
+				break
+
+	if download_info['model'] is None:
+		# Without a model file there is nothing to derive the sample URLs
+		# from; subscripting None below would raise a TypeError.
+		holz.error (f'No model file listed for "{voice_name}"!')
+		return None
+
+	voice_base_url = os.path.dirname (download_info['model']['url'])
+
+	def build_sample_url (base, speaker_name, speaker_id, ext = 'mp3'):
+		return f'{base}/samples/{speaker_name}_{speaker_id}.{ext}'
+
+	# samples are based on speakers.
+	# there is always a speaker 0 by default.
+	if 1 >= int (voice_details['num_speakers']):
+		speaker_ids = [0]
+	else:
+		speaker_ids = [int (i) for i in voice_details['speaker_id_map'].values ()]
+	for speaker_id in speaker_ids:
+		download_info['samples'].append (build_sample_url (voice_base_url, 'speaker', speaker_id))
+
+	return download_info
+
+
+def _fetch_url_raw (url):
+	"""! Download given URL into a temporary file and return its text content.
+	@param url Address of the remote text resource.
+	@return Returns the text, or None if the download helper reports no success.
+	"""
+	# The directory and the download inside it are removed when the block is
+	# left, so repeated fetches do not pile up files in the temp folder.
+	with tempfile.TemporaryDirectory () as temp_dir:
+		temp_path = os.path.join (temp_dir, 'download')
+		if 0 < util.download_as_stream_with_progress (url, temp_path):
+			holz.debug (f'Finished downloading. "{url}" => "{temp_path}"')
+			with open (temp_path, 'r', encoding = 'utf-8') as f:
+				return f.read ()
+	return None
+
+
 def index_fetch_raw_filelist (repo_info):
 	"""! Query the huggingface repository for a list of model files.
 	@param repo_info Remote repo information map. Can be obtained via @ref "remote_repo_config ()".
@@ -140,34 +256,58 @@ def index_fetch_raw (repo_info):
 	"""
 	url = remote_repo_build_index_url (repo_info)
 
-	"""
-	holz.info (f'Checking remote file "{url}" ...')
-	r = requests.head (url)
-	if not (300 > r.status_code):
-		holz.info (f'Could not find index file at "{url}".')
-		return None
-
-	s = util.float_round (int (r.headers['Content-Length']) / 1024)
-	holz.info (f'Appears to be present with a size of "{s}kb".')
-
-	holz.info (f'Fetching index file ...')
-	r = requests.get (url)
-	if 300 > r.status_code:
-		return json.loads (r.content.decode ('utf8'))
-	"""
 	temp = tempfile.NamedTemporaryFile ()
 	temp_path = temp.name
 	temp.close ()
 
-	r = util.download_as_stream_with_progress (url, temp_path)
-	if 0 < r:
-		holz.debug (f'Finished downloading. "{url}" => "{temp_path}"')
-		with open (temp_path, 'r') as f:
-			return json.loads (f.read ())
+	raw_text = _fetch_url_raw (url)
+	if raw_text:
+		return json.loads (raw_text)
 
+	holz.error (f'Could note fetch index.')
 	return None
 
 
+def _parse_model_card (card_text):
+	"""! Extract license, dataset URL and training details from a MODEL_CARD.
+
+	@param card_text Raw model card text. None or empty text is tolerated.
+	@return Returns a map with the keys 'dataset-url', 'license', 'training'
+		and 'reference'. Values which could not be determined are None.
+	"""
+	entry = {
+		'dataset-url': None,
+		'license': None,
+		'training': None,
+		'reference': None
+	}
+	read_training_status = False
+	# Tolerate None (e.g. a failed fetch) instead of raising on '.split'.
+	for line in (raw.strip () for raw in (card_text or '').split ('\n')):
+		if not line:
+			continue
+		if read_training_status:
+			read_training_status = False
+			lowered = line.lower ()
+			if lowered.startswith (('finetuned', 'fine-tuned')):
+				entry['training'] = 'Tuned'
+				# Cut the lead-in by length, so its letter case is irrelevant.
+				prefix = next ((p for p in ('finetuned from', 'fine-tuned from') if lowered.startswith (p)), '')
+				entry['reference'] = line[len (prefix):].strip ()
+			else:
+				entry['training'] = 'Original'
+				reference = line.replace ('Trained from scratch', '')
+				entry['reference'] = reference[:-1] if reference.endswith ('.') else reference
+		elif line.startswith ('* License:'):
+			entry['license'] = line.replace ('* License:', '').strip ()
+		elif line.startswith ('* URL:'):
+			entry['dataset-url'] = line.replace ('* URL:', '').strip ()
+		elif line.startswith ('## Training'):
+			read_training_status = True
+
+	return entry
+
+
 def index_download_and_rebuild (paths = data_paths (), repo_info = remote_repo_config ()):
 	"""! Fetch latest voice index and build lookup database, then recreates context.
 
@@ -219,8 +359,50 @@ def index_download_and_rebuild (paths = data_paths (), repo_info = remote_repo_c
 
 	holz.info (f"Database files stored at '{paths['data']}'.")
 
+	# build legal info based on model cards
+	legal = {}
+	if True:
+		base_url = remote_repo_build_branch_root (repo_info)
+
+		for code in langdb:
+			holz.info (f'Processing "{code}":')				
+			voice_i = 0
+			for voice_name in langdb[code]['voices']:
+				holz.info (f"\tFetching model card for {voice_i}: {voice_name}")
+				dl_info = None
+				voice_details = index[voice_name]
+				# Identify onnx speech model files.
+				for file in voice_details['files']:
+					if file.endswith ('MODEL_CARD'):
+						dl_info = {
+							'url': f'{base_url}/{file}',
+							'size': voice_details['files'][file]['size_bytes'],
+							'md5': voice_details['files'][file]['md5_digest']
+						}
+
+				model_card_text = _fetch_url_raw (dl_info['url'])
+				card = _parse_model_card (model_card_text)
+
+				holz.info (f'\tStoring license ...')
+				legal[voice_name] = card
+
+				voice_i = voice_i + 1
+
+		with open (os.path.join (paths['data'], 'legal.json'), 'w') as f:
+			json.dump (legal, f, indent = 4)
+
 	holz.info ('Regenerating context ...')
-	return context_create (paths, repo_info)
+	context = context_create (paths, repo_info)
+
+	"""TODO: consider automatic lookup / best guess of corpus data
+	# corpus licenses lookup
+	# https://raw.githubusercontent.com/coqui-ai/open-speech-corpora/master/README.md
+	corpus_lookup_raw = _fetch_url_raw ('https://raw.githubusercontent.com/coqui-ai/open-speech-corpora/master/README.md')
+	if corpus_lookup_raw:
+		with open (os.path.join (paths['data'], 'corpus-lookup.md'), 'w') as f:
+			f.write (corpus_lookup_raw)
+	"""
+	return context
 
 
 def _context_is_valid (context):
@@ -266,17 +448,24 @@ def context_create (paths = data_paths (), repo_info = remote_repo_config ()):
 	}
 
 	if not os.path.exists (paths['index']):
-		holz.warn ('No database index found!')
+		holz.error ('No database index found!')
 	else:
 		with open (paths['index'], 'r') as f:
 			db['index'] = json.load (f)
 
 	if not os.path.exists (paths['languages']):
-		holz.warn ('No language lookup found!')
+		holz.error ('No language lookup found!')
 	else:
 		with open (paths['languages'], 'r') as f:
 			db['languages'] = json.load (f)
 
+	lgl_path = os.path.join (paths['data'], 'legal.json')
+	if not os.path.exists (lgl_path):
+		holz.error ('No legal lookup found!')
+	else:
+		with open (lgl_path, 'r') as f:
+			db['legal'] = json.load (f)
+
 	context = {
 		'paths': paths,
 		'db': db,