Skip to content
This repository has been archived by the owner on Apr 20, 2024. It is now read-only.

feat: add transcript normalization + m4a audio format support #552

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions google/cloud/speech_v2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from .types.cloud_speech import BatchRecognizeResponse
from .types.cloud_speech import BatchRecognizeResults
from .types.cloud_speech import BatchRecognizeTranscriptionMetadata
from .types.cloud_speech import CloudStorageResult
from .types.cloud_speech import Config
from .types.cloud_speech import CreateCustomClassRequest
from .types.cloud_speech import CreatePhraseSetRequest
Expand All @@ -44,6 +45,7 @@
from .types.cloud_speech import GetPhraseSetRequest
from .types.cloud_speech import GetRecognizerRequest
from .types.cloud_speech import InlineOutputConfig
from .types.cloud_speech import InlineResult
from .types.cloud_speech import ListCustomClassesRequest
from .types.cloud_speech import ListCustomClassesResponse
from .types.cloud_speech import ListPhraseSetsRequest
Expand All @@ -68,6 +70,7 @@
from .types.cloud_speech import StreamingRecognitionResult
from .types.cloud_speech import StreamingRecognizeRequest
from .types.cloud_speech import StreamingRecognizeResponse
from .types.cloud_speech import TranscriptNormalization
from .types.cloud_speech import UndeleteCustomClassRequest
from .types.cloud_speech import UndeletePhraseSetRequest
from .types.cloud_speech import UndeleteRecognizerRequest
Expand All @@ -87,6 +90,7 @@
"BatchRecognizeResponse",
"BatchRecognizeResults",
"BatchRecognizeTranscriptionMetadata",
"CloudStorageResult",
"Config",
"CreateCustomClassRequest",
"CreatePhraseSetRequest",
Expand All @@ -102,6 +106,7 @@
"GetPhraseSetRequest",
"GetRecognizerRequest",
"InlineOutputConfig",
"InlineResult",
"ListCustomClassesRequest",
"ListCustomClassesResponse",
"ListPhraseSetsRequest",
Expand All @@ -127,6 +132,7 @@
"StreamingRecognitionResult",
"StreamingRecognizeRequest",
"StreamingRecognizeResponse",
"TranscriptNormalization",
"UndeleteCustomClassRequest",
"UndeletePhraseSetRequest",
"UndeleteRecognizerRequest",
Expand Down
6 changes: 6 additions & 0 deletions google/cloud/speech_v2/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
BatchRecognizeResponse,
BatchRecognizeResults,
BatchRecognizeTranscriptionMetadata,
CloudStorageResult,
Config,
CreateCustomClassRequest,
CreatePhraseSetRequest,
Expand All @@ -37,6 +38,7 @@
GetPhraseSetRequest,
GetRecognizerRequest,
InlineOutputConfig,
InlineResult,
ListCustomClassesRequest,
ListCustomClassesResponse,
ListPhraseSetsRequest,
Expand All @@ -61,6 +63,7 @@
StreamingRecognitionResult,
StreamingRecognizeRequest,
StreamingRecognizeResponse,
TranscriptNormalization,
UndeleteCustomClassRequest,
UndeletePhraseSetRequest,
UndeleteRecognizerRequest,
Expand All @@ -80,6 +83,7 @@
"BatchRecognizeResponse",
"BatchRecognizeResults",
"BatchRecognizeTranscriptionMetadata",
"CloudStorageResult",
"Config",
"CreateCustomClassRequest",
"CreatePhraseSetRequest",
Expand All @@ -95,6 +99,7 @@
"GetPhraseSetRequest",
"GetRecognizerRequest",
"InlineOutputConfig",
"InlineResult",
"ListCustomClassesRequest",
"ListCustomClassesResponse",
"ListPhraseSetsRequest",
Expand All @@ -119,6 +124,7 @@
"StreamingRecognitionResult",
"StreamingRecognizeRequest",
"StreamingRecognizeResponse",
"TranscriptNormalization",
"UndeleteCustomClassRequest",
"UndeletePhraseSetRequest",
"UndeleteRecognizerRequest",
Expand Down
164 changes: 151 additions & 13 deletions google/cloud/speech_v2/types/cloud_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"ExplicitDecodingConfig",
"SpeakerDiarizationConfig",
"RecognitionFeatures",
"TranscriptNormalization",
"SpeechAdaptation",
"RecognitionConfig",
"RecognizeRequest",
Expand All @@ -58,6 +59,8 @@
"RecognitionOutputConfig",
"BatchRecognizeResponse",
"BatchRecognizeResults",
"CloudStorageResult",
"InlineResult",
"BatchRecognizeFileResult",
"BatchRecognizeTranscriptionMetadata",
"BatchRecognizeMetadata",
Expand Down Expand Up @@ -589,9 +592,14 @@ class Recognizer(proto.Message):
User-settable, human-readable name for the
Recognizer. Must be 63 characters or less.
model (str):
Optional. Which model to use for recognition requests.
Select the model best suited to your domain to get best
results.
Optional. This field is now deprecated. Prefer the
[``model``][google.cloud.speech.v2.RecognitionConfig.model]
field in the
[``RecognitionConfig``][google.cloud.speech.v2.RecognitionConfig]
message.

Which model to use for recognition requests. Select the
model best suited to your domain to get best results.

Guidance for choosing which model to use can be found in the
`Transcription Models
Expand All @@ -600,7 +608,13 @@ class Recognizer(proto.Message):
`Table Of Supported
Models <https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages>`__.
language_codes (MutableSequence[str]):
Optional. The language of the supplied audio as a
Optional. This field is now deprecated. Prefer the
[``language_codes``][google.cloud.speech.v2.RecognitionConfig.language_codes]
field in the
[``RecognitionConfig``][google.cloud.speech.v2.RecognitionConfig]
message.

The language of the supplied audio as a
`BCP-47 <https://www.rfc-editor.org/rfc/bcp/bcp47.txt>`__
language tag.

Expand Down Expand Up @@ -774,6 +788,8 @@ class AutoDetectDecodingConfig(proto.Message):

- WEBM_OPUS: Opus audio frames in a WebM container.

- M4A: M4A audio format.

"""


Expand Down Expand Up @@ -993,6 +1009,56 @@ class MultiChannelMode(proto.Enum):
)


class TranscriptNormalization(proto.Message):
    r"""Configuration for transcription normalization.

    Transcription normalization automatically substitutes parts of the
    transcript with phrases chosen by the caller. For
    StreamingRecognize, the normalization is applied only to final
    transcripts and to stable partial transcripts (stability > 0.8).

    Attributes:
        entries (MutableSequence[google.cloud.speech_v2.types.TranscriptNormalization.Entry]):
            Replacement entries, at most 100. Replacement is performed
            with one entry at a time. For example, in
            ["cat" => "dog", "mountain cat" => "mountain dog"] the
            second entry is never applied, because the first entry is
            always processed before it.
    """

    class Entry(proto.Message):
        r"""One replacement rule.

        Attributes:
            search (str):
                The text to replace. At most 100 characters.
            replace (str):
                The replacement text. At most 100 characters.
            case_sensitive (bool):
                Whether the search is case sensitive.
        """

        search: str = proto.Field(proto.STRING, number=1)
        replace: str = proto.Field(proto.STRING, number=2)
        case_sensitive: bool = proto.Field(proto.BOOL, number=3)

    entries: MutableSequence[Entry] = proto.RepeatedField(
        proto.MESSAGE, number=1, message=Entry
    )


class SpeechAdaptation(proto.Message):
r"""Provides "hints" to the speech recognizer to favor specific
words and phrases in the results. PhraseSets can be specified as
Expand Down Expand Up @@ -1111,6 +1177,13 @@ class RecognitionConfig(proto.Message):
Speech adaptation context that weights
recognizer predictions for specific words and
phrases.
transcript_normalization (google.cloud.speech_v2.types.TranscriptNormalization):
Optional. Use transcription normalization to
automatically replace parts of the transcript
with phrases of your choosing. For
StreamingRecognize, this normalization only
applies to stable partial transcripts (stability
> 0.8) and final transcripts.
"""

auto_decoding_config: "AutoDetectDecodingConfig" = proto.Field(
Expand Down Expand Up @@ -1143,6 +1216,11 @@ class RecognitionConfig(proto.Message):
number=6,
message="SpeechAdaptation",
)
transcript_normalization: "TranscriptNormalization" = proto.Field(
proto.MESSAGE,
number=11,
message="TranscriptNormalization",
)


class RecognizeRequest(proto.Message):
Expand Down Expand Up @@ -1822,29 +1900,73 @@ class BatchRecognizeResults(proto.Message):
)


class BatchRecognizeFileResult(proto.Message):
r"""Final results for a single file.
class CloudStorageResult(proto.Message):
    r"""Final recognition results that were written to Cloud Storage.

    Attributes:
        uri (str):
            Cloud Storage URI to which the recognition results
            were written.
    """

    uri: str = proto.Field(proto.STRING, number=1)


class InlineResult(proto.Message):
    r"""Final results delivered inline in the recognition response.

    Attributes:
        transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
            Transcript for the audio file.
    """

    transcript: "BatchRecognizeResults" = proto.Field(
        proto.MESSAGE, number=1, message="BatchRecognizeResults"
    )


class BatchRecognizeFileResult(proto.Message):
r"""Final results for a single file.

This message has `oneof`_ fields (mutually exclusive fields).
For each oneof, at most one member field can be set at the same time.
Setting any member of the oneof automatically clears all other
members.

.. _oneof: https://proto-plus-python.readthedocs.io/en/stable/fields.html#oneofs-mutually-exclusive-fields

Attributes:
error (google.rpc.status_pb2.Status):
Error if one was encountered.
metadata (google.cloud.speech_v2.types.RecognitionResponseMetadata):

transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
The transcript for the audio file. This is populated only
when
cloud_storage_result (google.cloud.speech_v2.types.CloudStorageResult):
Recognition results written to Cloud Storage. This is
populated only when
[GcsOutputConfig][google.cloud.speech.v2.GcsOutputConfig] is
set in the
[RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].

This field is a member of `oneof`_ ``result``.
inline_result (google.cloud.speech_v2.types.InlineResult):
Recognition results. This is populated only when
[InlineOutputConfig][google.cloud.speech.v2.InlineOutputConfig]
is set in the
[RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].

This field is a member of `oneof`_ ``result``.
uri (str):
Deprecated. Use ``cloud_storage_result.uri``
instead.
transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
Deprecated. Use ``inline_result.transcript`` instead.
"""

uri: str = proto.Field(
proto.STRING,
number=1,
)
error: status_pb2.Status = proto.Field(
proto.MESSAGE,
number=2,
Expand All @@ -1855,6 +1977,22 @@ class BatchRecognizeFileResult(proto.Message):
number=3,
message="RecognitionResponseMetadata",
)
cloud_storage_result: "CloudStorageResult" = proto.Field(
proto.MESSAGE,
number=5,
oneof="result",
message="CloudStorageResult",
)
inline_result: "InlineResult" = proto.Field(
proto.MESSAGE,
number=6,
oneof="result",
message="InlineResult",
)
uri: str = proto.Field(
proto.STRING,
number=1,
)
transcript: "BatchRecognizeResults" = proto.Field(
proto.MESSAGE,
number=4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
],
"language": "PYTHON",
"name": "google-cloud-speech",
"version": "2.21.1"
"version": "0.1.0"
},
"snippets": [
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
],
"language": "PYTHON",
"name": "google-cloud-speech",
"version": "2.21.1"
"version": "0.1.0"
},
"snippets": [
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
],
"language": "PYTHON",
"name": "google-cloud-speech",
"version": "2.21.1"
"version": "0.1.0"
},
"snippets": [
{
Expand Down
Loading