# testing_webcam_voiceenabled_voiceactivated.py

import sys
import argparse
import cv2
import time
from libfaceid.detector import FaceDetectorModels, FaceDetector
from libfaceid.encoder  import FaceEncoderModels, FaceEncoder
from libfaceid.speech_synthesizer import SpeechSynthesizerModels, SpeechSynthesizer
from libfaceid.speech_recognizer  import SpeechRecognizerModels,  SpeechRecognizer


# Title of the OpenCV window that shows the annotated camera frames
WINDOW_NAME = "Facial_Recognition"

# Input directories (paths relative to the working directory)
INPUT_DIR_DATASET = "datasets"
INPUT_DIR_MODEL_DETECTION = "models/detection/"
INPUT_DIR_MODEL_ENCODING = "models/encoding/"
INPUT_DIR_MODEL_TRAINING = "models/training/"
INPUT_DIR_MODEL_ESTIMATION = "models/estimation/"
INPUT_DIR_AUDIOSET = "audiosets"

# Camera resolutions as (width, height)
RESOLUTION_QVGA = (320, 240)
RESOLUTION_VGA = (640, 480)
RESOLUTION_HD = (1280, 720)
RESOLUTION_FULLHD = (1920, 1080)

# Words/phrases that, once heard, start face recognition
TRIGGER_WORDS = [
    "Hey Google",
    "Alexa",
    "Activate",
    "Open Sesame",
    "Panel",
]


def cam_init(cam_index, width, height):
    """Open a camera and request 30 FPS at the given frame size.

    Args:
        cam_index: Camera index handed to ``cv2.VideoCapture``.
        width: Requested frame width.
        height: Requested frame height.

    Returns:
        The ``cv2.VideoCapture`` object. Neither the success of opening the
        device nor the result of the ``set`` calls is checked here.
    """
    cap = cv2.VideoCapture(cam_index)

    # Where the property constants live depends on the OpenCV version, not on
    # the Python version: OpenCV 3+ exposes them directly on ``cv2``, while
    # OpenCV 2.x keeps them under ``cv2.cv`` with a ``CV_`` prefix.
    if hasattr(cv2, "CAP_PROP_FPS"):
        prop_fps = cv2.CAP_PROP_FPS
        prop_width = cv2.CAP_PROP_FRAME_WIDTH
        prop_height = cv2.CAP_PROP_FRAME_HEIGHT
    else:
        legacy = cv2.cv
        prop_fps = legacy.CV_CAP_PROP_FPS
        prop_width = legacy.CV_CAP_PROP_FRAME_WIDTH
        prop_height = legacy.CV_CAP_PROP_FRAME_HEIGHT

    cap.set(prop_fps, 30)
    cap.set(prop_width, width)
    cap.set(prop_height, height)
    return cap


def label_face(frame, face_rect, face_id, confidence):
    """Outline a face on the frame and, if it was identified, caption it.

    Args:
        frame: Image the annotations are drawn onto (modified in place by
            the OpenCV drawing calls).
        face_rect: Face bounding box as ``(x, y, w, h)``.
        face_id: Label to print inside the box, or ``None`` for no caption.
        confidence: Number printed after the label with two decimals and a
            trailing ``%``.
    """
    white = (255, 255, 255)
    left, top, box_w, box_h = face_rect

    # The outline is always drawn, even for faces without a label.
    cv2.rectangle(frame, (left, top), (left + box_w, top + box_h), white, 1)
    if face_id is None:
        return

    caption = "{} {:.2f}%".format(face_id, confidence)
    # Anchor the text just inside the bottom-left corner of the box.
    anchor = (left + 5, top + box_h - 5)
    cv2.putText(frame, caption, anchor, cv2.FONT_HERSHEY_SIMPLEX, 0.5, white, 1, cv2.LINE_AA)


# Module-level flag: set to True by speech_recognizer_callback() and polled by
# process_facerecognition() while it waits for a trigger word.
trigger_word_detected = False


def speech_recognizer_callback(word):
    """Report the recognized trigger word and raise the module-level flag.

    Args:
        word: The trigger word that was recognized; only used in the message.
    """
    global trigger_word_detected
    announcement = "Trigger word detected! '{}'".format(word)
    print(announcement)
    trigger_word_detected = True


def _wait_for_trigger_word(model_speech_recognizer):
    """Block until one of TRIGGER_WORDS is reported, or Ctrl+C is pressed."""
    global trigger_word_detected

    speech_recognizer = SpeechRecognizer(model=model_speech_recognizer, path=None)
    print("\nWaiting for a trigger word: {}".format(TRIGGER_WORDS))
    speech_recognizer.start(TRIGGER_WORDS, speech_recognizer_callback)
    try:
        # speech_recognizer_callback() sets the flag; poll it once a second.
        while not trigger_word_detected:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C skips the wait and continues with face recognition.
        pass
    finally:
        speech_recognizer.stop()
        # Re-arm the flag so that a later call waits for a new trigger word
        # instead of returning immediately.
        trigger_word_detected = False


def process_facerecognition(model_detector, model_recognizer, model_speech_synthesizer, model_speech_recognizer, cam_index, cam_resolution):
    """Wait for a trigger word, then run face recognition on the webcam feed.

    After the trigger word is heard (or the wait is interrupted with Ctrl+C)
    the camera is opened and frames are processed until ESC is pressed in the
    preview window or the camera stops delivering frames. Only the first
    detected face of each frame is handled. Every 30th frame, if exactly one
    face is visible and it was identified as something other than "Unknown",
    the audio for that identity is played from INPUT_DIR_AUDIOSET.

    Args:
        model_detector: Model passed to ``FaceDetector``.
        model_recognizer: Model passed to ``FaceEncoder``.
        model_speech_synthesizer: Model passed to ``SpeechSynthesizer``.
        model_speech_recognizer: Model passed to ``SpeechRecognizer``.
        cam_index: Camera index passed to ``cam_init``.
        cam_resolution: ``(width, height)`` pair passed to ``cam_init``.
    """
    # Voice activation: do nothing until a trigger word has been heard.
    _wait_for_trigger_word(model_speech_recognizer)

    # Initialize the camera
    camera = cam_init(cam_index, cam_resolution[0], cam_resolution[1])

    # Everything below runs under try/finally so that the camera and the
    # preview window are released on every exit path, including errors.
    try:
        face_detector = None
        face_encoder = None
        speech_synthesizer = None
        try:
            # Initialize face detection
            face_detector = FaceDetector(model=model_detector, path=INPUT_DIR_MODEL_DETECTION)

            # Initialize face recognizer
            face_encoder = FaceEncoder(model=model_recognizer, path=INPUT_DIR_MODEL_ENCODING, path_training=INPUT_DIR_MODEL_TRAINING, training=False)

            # Initialize text-to-speech (speech synthesizer) for voice-enabled capability
            speech_synthesizer = SpeechSynthesizer(model=model_speech_synthesizer, path=None, path_output=None, training=False)
        except Exception:
            # Any failure disables identification; faces are then only boxed,
            # never labelled, so the speech synthesizer is never reached.
            face_encoder = None
            print("Warning, check if models and trained dataset models exists!")

        if face_detector is None:
            # Without a detector there is nothing to run the loop with.
            print("Error, face detector could not be initialized!")
            return

        face_id, confidence = (None, 0)

        # Start face recognition
        frame_count = 0
        while True:

            # Capture frame from webcam
            ret, frame = camera.read()
            if not ret or frame is None:
                print("Error, check if camera is connected!")
                break

            # Detect and identify faces in the frame
            faces = face_detector.detect(frame)
            for face in faces:
                (x, y, w, h) = face
                # Identify face based on trained dataset (note: should run facial_recognition_training.py)
                if face_encoder is not None:
                    face_id, confidence = face_encoder.identify(frame, (x, y, w, h))
                # Set text and bounding box on face
                label_face(frame, (x, y, w, h), face_id, confidence)

                # Play audio file corresponding to the recognized name
                if frame_count % 30 == 0:
                    if len(faces) == 1 and (face_id is not None) and (face_id != "Unknown"):
                        speech_synthesizer.playaudio(INPUT_DIR_AUDIOSET, face_id, block=False)

                # Process 1 face only
                break

            # Display updated frame
            cv2.imshow(WINDOW_NAME, frame)

            # Check for user actions
            if cv2.waitKey(1) & 0xFF == 27:  # ESC
                break

            frame_count += 1
    finally:
        # Release the camera
        camera.release()
        cv2.destroyAllWindows()


def run(cam_index, cam_resolution):
    """Start face recognition with the script's built-in model selection.

    To try another model, swap the assigned member for one of the
    alternatives listed next to it.

    Args:
        cam_index: Camera index forwarded to ``process_facerecognition``.
        cam_resolution: ``(width, height)`` pair forwarded likewise.
    """
    # Alternatives: DLIBHOG, DLIBCNN, SSDRESNET, MTCNN, FACENET
    model_detector = FaceDetectorModels.HAARCASCADE

    # Alternatives: OPENFACE, DLIBRESNET, FACENET
    model_encoder = FaceEncoderModels.LBPH

    # Alternatives: TACOTRON, GOOGLECLOUD
    model_synthesizer = SpeechSynthesizerModels.TTSX3

    # Alternatives: WITAI, HOUNDIFY
    model_recognizer = SpeechRecognizerModels.GOOGLECLOUD

    process_facerecognition(
        model_detector,
        model_encoder,
        model_synthesizer,
        model_recognizer,
        cam_index,
        cam_resolution,
    )


def main(args):
    """Validate the parsed options and start face recognition.

    Args:
        args: Namespace from ``parse_arguments`` with the attributes
            ``detector``, ``encoder``, ``speech_synthesizer``,
            ``speech_recognizer``, ``webcam`` and ``resolution``.

    If none of the four model options was given, ``run`` is called and uses
    its built-in model selection. If at least one was given, all four are
    converted to their model enums; options that were left out keep the
    argparse default ``0``, which the ``--help`` text lists as the first
    model of each family.
    """
    if sys.version_info < (3, 0):
        print("Error: Python2 is slow. Use Python3 for max performance.")
        return

    cam_index = int(args.webcam)
    resolutions = [RESOLUTION_QVGA, RESOLUTION_VGA, RESOLUTION_HD, RESOLUTION_FULLHD]
    try:
        cam_resolution = resolutions[int(args.resolution)]
    except (TypeError, ValueError, IndexError):
        # Unparsable or out-of-range selection: fall back to the smallest size.
        cam_resolution = RESOLUTION_QVGA

    # Omitted options hold the falsy default 0, while values typed on the
    # command line are non-empty strings. Honour the request as soon as ANY
    # model option was given, rather than only when all four were given.
    model_options = (args.detector, args.encoder, args.speech_synthesizer, args.speech_recognizer)
    if not any(model_options):
        run(cam_index, cam_resolution)
        return

    # Only the conversions are guarded, so that a failure inside face
    # recognition itself is not misreported as an invalid parameter.
    # NOTE(review): assumes the model classes raise ValueError for unknown
    # values, as enum.Enum does - confirm against libfaceid.
    try:
        detector = FaceDetectorModels(int(args.detector))
        encoder = FaceEncoderModels(int(args.encoder))
        speech_synthesizer = SpeechSynthesizerModels(int(args.speech_synthesizer))
        speech_recognizer = SpeechRecognizerModels(int(args.speech_recognizer))
    except (TypeError, ValueError):
        print("Invalid parameter")
        return

    print("Parameters: {} {} {} {}".format(detector, encoder, speech_synthesizer, speech_recognizer))
    process_facerecognition(detector, encoder, speech_synthesizer, speech_recognizer, cam_index, cam_resolution)


def parse_arguments(argv):
    """Parse the command-line options of this script.

    All options are optional. A value given on the command line is returned
    as a string; an option that is left out keeps the integer default ``0``.

    Args:
        argv: Argument strings without the program name.

    Returns:
        An ``argparse.Namespace`` with the attributes ``detector``,
        ``encoder``, ``speech_synthesizer``, ``speech_recognizer``,
        ``webcam`` and ``resolution``.
    """
    options = (
        ('--detector',
         'Detector model to use. Options: 0-HAARCASCADE, 1-DLIBHOG, 2-DLIBCNN, 3-SSDRESNET, 4-MTCNN, 5-FACENET'),
        ('--encoder',
         'Encoder model to use. Options: 0-LBPH, 1-OPENFACE, 2-DLIBRESNET, 3-FACENET'),
        ('--speech_synthesizer',
         'Speech synthesizer model to use. Options: 0-TTSX3, 1-TACOTRON, 2-GOOGLECLOUD'),
        ('--speech_recognizer',
         'Speech recognizer model to use. Options: 0-GOOGLECLOUD, 1-WITAI, 2-HOUNDIFY'),
        ('--webcam',
         'Camera index to use. Default is 0. Assume only 1 camera connected.'),
        ('--resolution',
         'Camera resolution to use. Default is 0. Options: 0-QVGA, 1-VGA, 2-HD, 3-FULLHD'),
    )

    parser = argparse.ArgumentParser()
    for flag, help_text in options:
        # No ``type=`` on purpose: main() tells an explicitly given option
        # (a non-empty string) from an omitted one (the int 0) by truthiness.
        parser.add_argument(flag, required=False, default=0, help=help_text)
    return parser.parse_args(argv)


if __name__ == "__main__":
    # Hand everything after the program name to the parser, then run.
    cli_args = parse_arguments(sys.argv[1:])
    main(cli_args)