# Source code for native_client.python

import os
import platform

#The API is not snake case which triggers linter errors
#pylint: disable=invalid-name

if platform.system().lower() == "windows":
    # Absolute path of the 'lib' directory that sits next to this module.
    dslib_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'lib')

    # On Windows, we can't rely on RPATH being set to $ORIGIN/lib/ or on
    # @loader_path/lib
    if hasattr(os, 'add_dll_directory'):
        # Starting with Python 3.8 this properly handles the problem
        os.add_dll_directory(dslib_path)
    else:
        # Before Python 3.8 we need to change the PATH to include the proper
        # directory for the dynamic linker. PATH may be absent from the
        # environment, so read it with a default instead of raising KeyError
        # at import time, and skip the separator when there is nothing to
        # append.
        os.environ['PATH'] = os.pathsep.join(
            entry for entry in (dslib_path, os.environ.get('PATH', '')) if entry
        )

import deepspeech

# rename for backwards compatibility
from deepspeech.impl import PrintVersions as printVersions
from deepspeech.impl import FreeStream as freeStream

class Model(object):
    """
    Class holding a DeepSpeech model

    All positional and keyword arguments given to the constructor are
    forwarded unchanged to ``deepspeech.impl.CreateModel``.

    :param aModelPath: Path to model file to load
    :type aModelPath: str

    :param aBeamWidth: Decoder beam width
    :type aBeamWidth: int

    :raises RuntimeError: If ``CreateModel`` returns a non-zero status code.
    """
    def __init__(self, *args, **kwargs):
        # make sure the attribute is there if CreateModel fails
        self._impl = None

        status, impl = deepspeech.impl.CreateModel(*args, **kwargs)
        if status != 0:
            raise RuntimeError("CreateModel failed with error code {}".format(status))
        self._impl = impl

    def __del__(self):
        # getattr() keeps finalization quiet for instances whose __init__
        # never ran (e.g. created through __new__), where _impl is missing.
        impl = getattr(self, '_impl', None)
        if impl:
            deepspeech.impl.FreeModel(impl)
            # Drop the handle so a second finalization cannot free it again.
            self._impl = None

    def sampleRate(self):
        """
        Return the sample rate expected by the model.

        :return: Sample rate.
        :rtype: int
        """
        return deepspeech.impl.GetModelSampleRate(self._impl)

    def enableDecoderWithLM(self, *args, **kwargs):
        """
        Enable decoding using beam scoring with a KenLM language model.

        The arguments are forwarded, after the model handle, to
        ``deepspeech.impl.EnableDecoderWithLM``.

        :param aLMPath: The path to the language model binary file.
        :type aLMPath: str

        :param aTriePath: The path to the trie file build from the same vocabulary as the language model binary.
        :type aTriePath: str

        :param aLMAlpha: The alpha hyperparameter of the CTC decoder. Language Model weight.
        :type aLMAlpha: float

        :param aLMBeta: The beta hyperparameter of the CTC decoder. Word insertion weight.
        :type aLMBeta: float

        :return: Zero on success, non-zero on failure (invalid arguments).
        :rtype: int
        """
        return deepspeech.impl.EnableDecoderWithLM(self._impl, *args, **kwargs)

    def stt(self, *args, **kwargs):
        """
        Use the DeepSpeech model to perform Speech-To-Text.

        The arguments are forwarded, after the model handle, to
        ``deepspeech.impl.SpeechToText``.

        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
        :type aBuffer: int array

        :param aBufferSize: The number of samples in the audio signal.
        :type aBufferSize: int

        :return: The STT result.
        :rtype: str
        """
        return deepspeech.impl.SpeechToText(self._impl, *args, **kwargs)

    def sttWithMetadata(self, *args, **kwargs):
        """
        Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.

        The arguments are forwarded, after the model handle, to
        ``deepspeech.impl.SpeechToTextWithMetadata``.

        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
        :type aBuffer: int array

        :param aBufferSize: The number of samples in the audio signal.
        :type aBufferSize: int

        :return: Outputs a struct of individual letters along with their timing information.
        :rtype: :class:`Metadata`
        """
        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)

    def createStream(self):
        """
        Create a new streaming inference state. The streaming state returned
        by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.

        :return: Object holding the stream

        :raises RuntimeError: If ``CreateStream`` returns a non-zero status code.
        """
        status, ctx = deepspeech.impl.CreateStream(self._impl)
        if status != 0:
            raise RuntimeError("CreateStream failed with error code {}".format(status))
        return ctx

    # The streaming methods below do not touch the model handle; they forward
    # their arguments (stream state first) straight to the native bindings.

    # pylint: disable=no-self-use
    def feedAudioContent(self, *args, **kwargs):
        """
        Feed audio samples to an ongoing streaming inference.

        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
        :type aSctx: object

        :param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
        :type aBuffer: int array

        :param aBufferSize: The number of samples in ``aBuffer``.
        :type aBufferSize: int
        """
        deepspeech.impl.FeedAudioContent(*args, **kwargs)

    # pylint: disable=no-self-use
    def intermediateDecode(self, *args, **kwargs):
        """
        Compute the intermediate decoding of an ongoing streaming inference.

        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
        :type aSctx: object

        :return: The STT intermediate result.
        :rtype: str
        """
        return deepspeech.impl.IntermediateDecode(*args, **kwargs)

    # pylint: disable=no-self-use
    def finishStream(self, *args, **kwargs):
        """
        Signal the end of an audio signal to an ongoing streaming
        inference, returns the STT result over the whole audio signal.

        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
        :type aSctx: object

        :return: The STT result.
        :rtype: str
        """
        return deepspeech.impl.FinishStream(*args, **kwargs)

    # pylint: disable=no-self-use
    def finishStreamWithMetadata(self, *args, **kwargs):
        """
        Signal the end of an audio signal to an ongoing streaming
        inference, returns per-letter metadata.

        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
        :type aSctx: object

        :return: Outputs a struct of individual letters along with their timing information.
        :rtype: :class:`Metadata`
        """
        return deepspeech.impl.FinishStreamWithMetadata(*args, **kwargs)

# This is only for documentation purposes
# Metadata and MetadataItem should be kept in sync with native_client/deepspeech.h
class MetadataItem(object):
    """
    Stores each individual character, along with its timing information

    The methods of this class are documentation placeholders: their bodies
    are empty, so each one returns ``None`` when called.
    """

    def character(self):
        """
        The character generated for transcription
        """

    def timestep(self):
        """
        Position of the character in units of 20ms
        """

    def start_time(self):
        """
        Position of the character in seconds
        """


class Metadata(object):
    """
    Stores the entire CTC output as an array of character metadata objects

    The methods of this class are documentation placeholders: their bodies
    are empty, so each one returns ``None`` when called.
    """

    def items(self):
        """
        List of items

        :return: A list of :class:`MetadataItem` elements
        :rtype: list
        """

    def num_items(self):
        """
        Size of the list of items

        :return: Size of the list of items
        :rtype: int
        """

    def confidence(self):
        """
        Approximated confidence value for this transcription. This is roughly the
        sum of the acoustic model logit values for each timestep/character that
        contributed to the creation of this transcription.
        """